crm114-20100106-BlameMichelson.src/0002755000000000017500000000000011321154266014777 5ustar rootwsycrm114-20100106-BlameMichelson.src/osbf-util.c0000644000000000017500000003351611321154266017055 0ustar rootwsy// osbf-util.c - utility for munging css files, version X0.1 // Copyright 2004 Fidelis Assis // Copyright 2004-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // OBS: This program is a modified version of the original cssutil, // specific for the new osbf format. It is not compatible with // the original css format. -- Fidelis Assis // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" #include "crm114_osbf.h" char version[] = "1.1"; void helptext () { fprintf (stdout, "osbf-util version %s - generic osbf file utility.\n" "Usage: osbfutil [options]... css-filename\n" " -b - brief; print only summary\n" " -h - print this help\n" " -q - quite mode; no warning messages\n" " -r - report then exit (no menu)\n" " -s css-size - if no css file found, create new\n" " one with this many buckets.\n" " -S css-size - same as -s, but round up to next\n" " 2^n + 1 boundary.\n" " -v - print version and exit\n" " -D - dump css file to stdout in CSV format.\n" " -R csv-file - create and restore css from CSV.\n" " Options -s and -S are ignored when" " restoring.\n", VERSION); } int main (int argc, char **argv) { long i, k; // some random counters, when we need a loop long v; long sparse_spectrum_file_length = OSBF_DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH; long user_set_css_length = 0; long hfsize; long long sum; // sum of the hits... can be _big_. 
int brief = 0, quiet = 0, dump = 0, restore = 0; int opt, fields; int report_only = 0; long *bcounts; long maxchain; long curchain; long totchain; long fbuckets; long nchains; long ofbins; char cmdstr[255]; char cssfile[255]; char csvfile[255]; unsigned char cmdchr[2]; char crapchr[2]; float cmdval; int zloop, cmdloop, version_index; // the following for crm114.h's happiness char *newinputbuf; newinputbuf = (char *) &hfsize; bcounts = malloc (sizeof (unsigned long) * OSBF_FEATUREBUCKET_VALUE_MAX); { struct stat statbuf; // filestat buffer OSBF_FEATURE_HEADER_STRUCT *header; // the header of the hash file OSBF_FEATUREBUCKET_STRUCT *hashes; // the text of the hash file // parse cmdline options while ((opt = getopt (argc, argv, "bDhR:rqs:S:v")) != -1) { switch (opt) { case 'b': brief = 1; // brief, no 'bin value ...' lines break; case 'D': dump = 1; // dump css file, no cmd menu break; case 'q': quiet = 1; // quiet mode, no warning messages break; case 'R': { FILE *f; unsigned long key, hash, value; OSBF_FEATURE_HEADER_STRUCT h; // count lines to determine the number of buckets and check CSV format if (user_trace) fprintf (stderr, "Opening OSBF file %s for read\n", optarg); if ((f = fopen (optarg, "rb")) != NULL) { // try to find the header reading first 2 "buckets" if (fscanf (f, "%lu;%lu;%lu\n", (unsigned long *) h.version, &(h.flags), &(h.buckets_start)) != 3) { fprintf (stderr, "\n %s is not in the right CSV format.\n", optarg); exit (EXIT_FAILURE); } if (*((unsigned long *) h.version) != OSBF_VERSION) { fprintf (stderr, "\n %s is not an OSBF CSV file.\n", optarg); fclose (f); exit (EXIT_FAILURE); } if (fscanf (f, "%lu;%lu;%lu\n", &(h.buckets), &hash, &value) != 3) { fprintf (stderr, "\n %s is not in the right CSV format.\n", optarg); exit (EXIT_FAILURE); } // start with -headersize buckets, discounting 2 "buckets" alread read sparse_spectrum_file_length = 2 - h.buckets_start; while (!feof (f)) if (fscanf (f, "%lu;%lu;%lu\n", &key, &hash, &value) == 3) 
sparse_spectrum_file_length++; else { fprintf (stderr, "\n %s is not in the right CSV format.\n", optarg); exit (EXIT_FAILURE); } fclose (f); // check the number of buckets if (sparse_spectrum_file_length != h.buckets) { fprintf (stderr, "\n Wrong number of buckets! %s is not in the right CSV format.\n", optarg); exit (EXIT_FAILURE); } strcpy (csvfile, optarg); } else { fprintf (stderr, "\n Couldn't open csv file %s; errno=%d.\n", optarg, errno); exit (EXIT_FAILURE); } } restore = 1; // restore css file, no cmd menu break; case 'r': report_only = 1; // print stats only, no cmd menu. break; case 's': // set css size to option value case 'S': // same as above but round up to next 2^n+1 if (restore) { fprintf (stderr, "\nOptions -s, -S ignored when restoring.\n"); break; } if (sscanf (optarg, "%ld", &sparse_spectrum_file_length)) { if (!quiet) fprintf (stderr, "\nOverride css creation length to %ld\n", sparse_spectrum_file_length); user_set_css_length = 1; } else { fprintf (stderr, "On -%c flag: Missing or incomprehensible number of buckets.\n", opt); exit (EXIT_FAILURE); } if (opt == 'S') // round up to next 2^n+1 { int k; k = (long) floor (log10 (sparse_spectrum_file_length - 1) / log10 (2.0)); while ((2 << k) + 1 < sparse_spectrum_file_length) k++; sparse_spectrum_file_length = (2 << k) + 1; user_set_css_length = 1; } break; case 'v': fprintf (stderr, " This is osbf-util, version %s\n", version); fprintf (stderr, " Copyright 2004-2006 William S. Yerazunis.\n"); fprintf (stderr, " This software is licensed under the GPL with ABSOLUTELY NO WARRANTY\n"); exit (EXIT_SUCCESS); default: helptext (); exit (EXIT_SUCCESS); break; } } if (optind < argc) strncpy (cssfile, argv[optind], sizeof (cssfile)); else { helptext (); exit (EXIT_SUCCESS); } // and stat it to get it's length k = stat (cssfile, &statbuf); // quick check- does the file even exist? if (k == 0) { if (restore) { fprintf (stderr, "\n.CSS file %s exists! 
Restore operation aborted.\n", cssfile); exit (EXIT_FAILURE); } hfsize = statbuf.st_size; if (!quiet && user_set_css_length) fprintf (stderr, "\n.CSS file %s exists; -s, -S options ignored.\n", cssfile); } else { // file didn't exist... create it if (!quiet && !restore) fprintf (stdout, "\nHad to create .CSS file %s with %lu buckets\n", cssfile, sparse_spectrum_file_length); if (crm_osbf_create_cssfile (cssfile, sparse_spectrum_file_length, OSBF_VERSION, 0, OSBF_CSS_SPECTRA_START) != EXIT_SUCCESS) exit (EXIT_FAILURE); k = stat (cssfile, &statbuf); hfsize = statbuf.st_size; } // // mmap the hash file into memory so we can bitwhack it header = crm_mmap_file ( cssfile, 0, hfsize, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if (header == MAP_FAILED) { fprintf (stderr, "\n Couldn't mmap file %s into memory; errno=%d .\n", cssfile, errno); exit (EXIT_FAILURE); } if (*((unsigned long *) (header->version)) != OSBF_VERSION) { fprintf (stderr, "\n %s is the wrong version. We're expecting a %s css file.\n", cssfile, CSS_version_name[OSBF_VERSION]); crm_munmap_file ((void *) header); exit (EXIT_FAILURE); } hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start; if (hashes == MAP_FAILED) { fprintf (stderr, "\n Couldn't open RW file %s; errno=%d .\n", cssfile, errno); exit (EXIT_FAILURE); } // from now on, hfsize is buckets, not bytes. hfsize = statbuf.st_size / sizeof (OSBF_FEATUREBUCKET_STRUCT); if (dump) { /* dump the css file */ OSBF_FEATUREBUCKET_STRUCT *bucket; unsigned long *p; bucket = (OSBF_FEATUREBUCKET_STRUCT *) header; for (i = 0; i < hfsize; i++) { p = (unsigned long *) &bucket[i]; printf ("%lu;%lu;%lu\n", p[0], p[1], p[2]); } } if (restore) { FILE *f; OSBF_FEATUREBUCKET_STRUCT *bucket; unsigned long *p; // restore the css file - note that if we DIDN'T create // it already, then this will fail. 
// if ((f = fopen (csvfile, "rb")) == NULL) { fprintf (stderr, "\n Couldn't open csv file %s; errno=%d.\n", csvfile, errno); exit (EXIT_FAILURE); } bucket = (OSBF_FEATUREBUCKET_STRUCT *) header; for (i = 0; i < hfsize; i++) { p = (unsigned long *) &bucket[i]; dontcare = fscanf (f, "%lu;%lu;%lu\n", &p[0], &p[1], &p[2]); } fclose (f); } zloop = 1; while (zloop == 1 && !restore && !dump) { zloop = 0; crm_osbf_packcss (header, 0, header->buckets - 1); sum = 0; maxchain = 0; curchain = 0; totchain = 0; fbuckets = 0; nchains = 0; ofbins = 0; for (i = 0; i < header->buckets; i++) { sum += GET_BUCKET_VALUE(hashes[i]); if (GET_BUCKET_VALUE(hashes[i]) != 0) { fbuckets++; curchain++; if (GET_BUCKET_VALUE(hashes[i]) >= OSBF_FEATUREBUCKET_VALUE_MAX) ofbins++; } else { if (curchain > 0) { totchain += curchain; nchains++; if (curchain > maxchain) maxchain = curchain; curchain = 0; } } } version_index = *((unsigned long *) header->version); if (version_index < 0 || version_index > UNKNOWN_VERSION) version_index = UNKNOWN_VERSION; fprintf (stdout, "\n Sparse spectra file %s statistics: \n", cssfile); fprintf (stdout, "\n CSS file version : %12s", CSS_version_name[version_index]); fprintf (stdout, "\n Header size (bytes) : %12ld", header->buckets_start * sizeof (OSBF_FEATUREBUCKET_STRUCT)); fprintf (stdout, "\n Bucket size (bytes) : %12lu", (unsigned long)sizeof(OSBF_FEATUREBUCKET_STRUCT)); fprintf (stdout, "\n Total available buckets : %12ld", header->buckets); fprintf (stdout, "\n Total buckets in use : %12ld", fbuckets); fprintf (stdout, "\n Number of trainings : %12lu", header->learnings); fprintf (stdout, "\n Total buckets with value >= max : %12ld", ofbins); fprintf (stdout, "\n Total hashed datums in file : %12lld", sum); fprintf (stdout, "\n Average datums per bucket : %12.2f", (fbuckets > 0) ? 
(sum * 1.0) / (fbuckets * 1.0) : 0); fprintf (stdout, "\n Number of chains : %12ld", nchains); fprintf (stdout, "\n Maximum length of overflow chain : %12ld", maxchain); fprintf (stdout, "\n Average length of overflow chain : %12.2f", nchains > 0 ? (totchain * 1.0) / (nchains * 1.0) : 0); fprintf (stdout, "\n Average packing density : %12.2f\n", (fbuckets * 1.0) / (header->buckets * 1.0)); for (i = 0; i < OSBF_FEATUREBUCKET_VALUE_MAX; i++) bcounts[i] = 0; for (v = 0; v < header->buckets; v++) { if (GET_BUCKET_VALUE(hashes[v]) < OSBF_FEATUREBUCKET_VALUE_MAX) bcounts[GET_BUCKET_VALUE(hashes[v])]++; } if (!brief) for (i = 0; i < OSBF_FEATUREBUCKET_VALUE_MAX; i++) { if (bcounts[i] > 0) { fprintf (stdout, "\n bin value %8ld found %9ld times", i, bcounts[i]); } } fprintf (stdout, "\n"); cmdloop = 1; while (!report_only && cmdloop) { // clear command buffer cmdchr[0] = '\0'; fprintf (stdout, "Options:\n"); fprintf (stdout, " Z n - zero bins at or below a value\n"); fprintf (stdout, " S n - subtract a constant from all bins\n"); fprintf (stdout, " D n - divide all bins by a constant\n"); fprintf (stdout, " R - rescan\n"); fprintf (stdout, " P - pack\n"); fprintf (stdout, " Q - quit\n"); fprintf (stdout, ">>> "); clearerr (stdin); dontcare = fscanf (stdin, "%[^\n]", cmdstr); dontcare = fscanf (stdin, "%c", crapchr); fields = sscanf (cmdstr, "%s %f", cmdchr, &cmdval); if (strlen ( (char *)cmdchr) != 1) { fprintf (stdout, "Unknown command: %s\n", cmdchr); continue; } switch (tolower ((int)cmdchr[0])) { case 'z': if (fields != 2) fprintf (stdout, "Z command requires a numeric argument!\n"); else { fprintf (stdout, "Working..."); for (i = 0; i < header->buckets; i++) if (GET_BUCKET_VALUE(hashes[i]) <= cmdval) BUCKET_RAW_VALUE(hashes[i]) = 0; fprintf (stdout, "done.\n"); } break; case 's': if (fields != 2) fprintf (stdout, "S command requires a numeric argument!\n"); else { fprintf (stdout, "Working..."); for (i = 0; i < header->buckets; i++) { if (GET_BUCKET_VALUE(hashes[i]) > 
(int) cmdval) { BUCKET_RAW_VALUE(hashes[i]) = GET_BUCKET_VALUE(hashes[i]) - cmdval; } else { BUCKET_RAW_VALUE(hashes[i]) = 0; } } fprintf (stdout, "done.\n"); } break; case 'd': if (fields != 2) fprintf (stdout, "D command requires a numeric argument!\n"); else if (cmdval == 0) fprintf (stdout, "You can't divide by zero, nimrod!\n"); else { fprintf (stdout, "Working..."); for (i = 0; i < header->buckets; i++) BUCKET_RAW_VALUE(hashes[i]) = GET_BUCKET_VALUE(hashes[i]) / cmdval; fprintf (stdout, "done.\n"); } break; case 'r': zloop = 1; cmdloop = 0; break; case 'p': fprintf (stdout, "Working..."); crm_osbf_packcss (header, 0, header->buckets - 1); zloop = 1; cmdloop = 0; break; case 'q': fprintf (stdout, "Bye! \n"); cmdloop = 0; break; default: fprintf (stdout, "Unknown command: %c\n", cmdchr[0]); break; } } } } return 0; } crm114-20100106-BlameMichelson.src/CRM114_Mailfilter_HOWTO.txt0000644000000000017500000022521411321154266021543 0ustar rootwsy# # CRM114_Mailfilter_HOWTO.txt - The CRM114 & Mailfilter HOWTO # # Copyright 2003-2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # The CRM114 & Mailfilter HOWTO -Bill Yerazunis, 2003-09-18 (last update 2009-03-02) This is the CRM114 Mailfilter HOWTO. It describes how to set up CRM114 and Mailfilter to filter your incoming mail, as of the version CRM114-20060209-ReaverSecondBreakfast. This HOWTO doesn't describe _how_ CRM114, Mailfilter, Mailtrainer, or Mailreaver works. This just will set you up enough so that you can start using CRM114 and Mailfilter to filter your mail. It assumes you are running on a Linux box; getting the system running on *BSD, MacOS, or Windows will require considerably more work than we describe here (and is a subject for future HOWTOs). ------------------------------------------------------ Remember, the CRM114 package is released under the GPL (license is enclosed in any of the downloads). 
There is NO WARRANTY WHATSOEVER for this software to be useful in any way; it's going to tamper with your incoming mail and you can easily imagine the dangers in that. ---------------------------------------------------------- That said, I hope CRM114, Mailreaver, and Mailreaver is useful to you; it's been very useful to me. It's been keeping my mailbox clear of clutter for since 2002; I'm convinced it has better performance than I-the-human at killing spam without accidentally deleting important mail. I've tested myself, and I-the-human is only about 99.7% or 99.8% accurate at best; CRM114 is considerably more accurate than that - easily two or three times more accurate. (as of December 2003, it was 99.95% accurate (N+1 statistics) on my incoming mail stream to a non-business account. Something to Remember: CRM114 is a *language* designed to write text filters and classifiers in. It makes it easy to tweak code. Mailfilter is just _one_ of the possible filters; there are many more out there and if Mailfilter doesn't do what you want, it's easy to create one that does. Mailreaver is another one of the filters, with different (and better, I hope) designs, that can use Mailtrainer (yet another filter) to build even better statistics files. There are yet other filters written in CRM114; you can read all about them on the web page: crm114.sourceforge.net (and if you create one, and want to share it, put it on a web page and send me an email so I can add a pointer.) - Bill Yerazunis (wsy@merl.com) ------------------------------------------------------------------- Step 0: Scientes Inamicae (Know Thy Enemy) These are the major steps in using CRM114 Mailfilter. 
The steps are pretty simple: 1) Downloading what you need (it's just 1 or 2 megabytes in a single .gz file) 2) Setting up the executables (not more than ten commands to type, even if you're building from the fresh source) 3) Configuring Mailfilter or Mailreaver (editing one file, most likely change is ONE line, and we tell you which one) 3) Setting up the needed auxilliary files (not more than 2 files to edit of no more than 5 lines each, plus typing one or two commands) 5) Engaging Mailfilter (if you are using Procmail, this is cut-and-paste about ten lines, otherwise it's create one file containing one line, and typing up to three commands) 6) Training CRM114 and Mailfilter (whenever you get an error, you send it back to yourself, using your current mail tool. How hard can that be? Now, you can also use mailtrainer to bulk-train in whole directories of your old spam and good email.) 7) Adding Priority Lists, Whitelists, and Blacklists Mailfilter supports whitelists, blacklists, term rewriting, and some other features. You can use these for "guaranteed delivery" from people you really trust - or really hate. 8) Useful Utilities Details on the cssutil, cssdiff, and cssmerge utilities. You don't need to know this, but you may find it useful. ------------------------------------------------------------------------- Step 1: Downloading. Get yourself a copy of a CRM114 kit. The kits can always be found by visiting the CRM114 homepage at: http://crm114.sourceforge.net You will need at least the statically-linked binary kit (compiled to run on any i386 or better Linux box); for best performance it is suggested you get the source kit and compile it on the processor you will be running CRM114 on. If you do not have root privs on the box you will be running CRM114 on, it is suggested you stay with the statically linked binaries (this is because the recommended "TRE" REGEX library requires either root to install, or major workaround mojo). 
The kits are named: crm114-.i386.tar.gz (statically linked binaries) and crm114-.src.tar.gz (complete source code + tests) These kit .gz files are fairly small; usually less than one megabyte (currently around 800 Kbytes) so they will download quickly. You will need to decide if you will be starting off with a pre-learned set of .css files (.css means CRM114 Sparse Spectra) or if you will be creating your own .css files from your own samples of spam and nonspam. You can think of a .css file as being a "cerebral memory" of what a particular kind of mail (good or spam) looks like; .css files are how CRM114 remembers what spam and good mail look like. With empty .css files a CRM114 system acts like a total amnesiac - it has absolutely no conception of "good" or "bad". In general, the pre-learned .css files will give you an initially more accurate filter, but after some use and training the self-created filter files will catch up with pre-learned files, and then the two filters will achieve about equal accuracy. However, there may be some "glitches" in the mid-term while some edge cases in the prelearned files are _unlearned_. If you decide not to take our advice, you will also need to download a set of pretrained .css files like these: crm114-.css.tar.gz The .css files are rather large; this download may approach 50 megabytes. (currently it's 8+ megabytes) Download the kits you will need (at least one of .src.tar.gz or .i386.tar.gz or .i386.rpm) and then proceed to "Step 2: Setting Up the Executables" -------------------------------------------------------------------------- Step 2: Setting Up the Executables In this step, you will install four binaries into your system. The four binaries are: crm - the CRM114 "compute engine". It's called "crm" because "crm114" is too hard to type. 
cssutil - the .css file check/verify/edit program cssdiff - the .css file diff program cssmerge - the .css file merging program One important point: do NOT install CRM114 or any of it's utilites setuid or sgid to root. If you do, that's just an invitation for someone to utterly hose your system without even trying. We're not talking an intentional attack, just an inadvertent command or script gone wierd could do it. This is also why we recommend using a _static_ linking of the executable, so that a LD_LIBRARY_PATH attack can't falsely insert a subversive version of a library. ----- There are three ways you can set up these executables. You can: a) install with a .rpm kit b) install with a .i386.tar.gz (tarball of statically linked binaries) c) install with a .src.tar.gz (tarball of complete source) Note 1: If you do not have root on the machine you are installing on, you may have some problems during the installation. You may want to reconsider using the statically linked binaries instead of compiling from sources. ----- Step 2 Method A: Installing from .i386.tar.gz First, untar the binary release. Type: tar -zxvf crm114-.i386.tar.gz You should now become root. If you do not have root on your machine, you _can_ execute CRM114 programs directly from your home directory, by changing your $PATH appropriately; see your shell man page for how to do this for your particular shell (it varies with the shell, so I can't tell you here how to do it) and skip to the end of this step. Or- you can run the binary explicitly from your current directory by invoking it as ./crm114_tre. If you're installing, become root, then type: cd crm114- make install_binary_only This will install the pre-built binaries of CRM1114 and the utilities into /usr/bin. This is the default install location for CRM114. If you want them installed in a different place, edit the Makefile and change BINDIR (near the top of the Makefile) to a different directory. 
Note that if you type "make clean" you'll _delete_ your prebuilt binaries, so don't do that! Now, you can test your work. Type crm -v which will cause CRM114 to print out the version of itself you just installed. You can also run a quick "Hello, world!" by typing: crm '-{ output /Hello, world! This is CRM114 version :*:_crm_version: .\n/}' then hit ^D (end-of-file on *nix). You;ll get back a response like: Hello, world! This is CRM114 version 20040118-BlameEric . Congratulations! You've now completed the installation of CRM114 and utilities from prebuilt binaries. Proceed to "Step 3: Setting Up Needed Files. ----- Step 2 Method B: Compiling from .src.tar.gz (source) This method is the most complex. Start by uncompressing and untarring the big .src.tar.gz with the command: tar -zxvf crm114-.src.tar.gz Now cd down into the crm114- directory. You will see many files here. You now have a choice: you can build CRM114 with either the GNU regex libraries (not recommended, as GNU regex can't handle embedded NULL bytes and has other issues), or with the TRE regex library (recommended; this is what you get with the precompiled binary kit). By default, you will use the TRE regex library; however, this means you have to build and install TRE. You can either grab the most recent version from the TRE homepage at http://laurikari.net/tre, OR you can use the version that is pre-packaged with your CRM114 download. (The pre-packaged version is tested against CRM114 and will have all appropriate patches installed, while the fresh one may have new features. Take your choice- it's good stuff either way) Fortunately, building and installing TRE is easy. The TRE regex library will peacefully coexist on the same system as the GNU regex library. Caution: if you are building from sources, you should install the TRE regex library ***first***. TRE is the recommended regex library for CRM114 (fewer bugs and more features than Gnu Regex). 
To install TRE, become root, then type this ( BIG BIG WARNING - DO NOT FORGET to tell configure to "--enable-static" ) : cd crm114- cd tre- ./configure --enable-static make make install You have now installed the TRE regex library as /usr/local/lib/libtre . If you make a mistake and need to rerun the make commands, be aware that in some versions of TRE, a 'make clean' command will delete test files that are needed when running the build process again. Unfortunately, the safest course of action is to delete the TRE source directory and restore it from the tar ball. Depending on your choices in static versus dynamic linking, you _may_ need to also add /usr/local/lib to /etc/ld.so.conf, and then run ldconfig as root. Or not. If, during the next steps, you get annoying messages on the order of "can't find ltre" then this is the thing to try. Once TRE is built and installed you can compile CRM114 and the accompanying utilities (cssutil, cssdiff, and cssmerge). By default, CRM114 installs into /usr/bin (_not_ /usr/local/bin - if you want to change this, change the definition of BINDIR near the top of the file "Makefile"). Cssutil gives you some insight into the state of a .css file, cssdiff lets you check the differences between two .css files, and cssmerge lets you merge two css files. Change directory back up to the CRM114 directory, then become root, then (noting that no .configure step is necessary; the CRM114 Makefile is self-contained and presupplied) type: cd .. make This will compile and link the CRM114 executable and the utilities. You can test this executable if you want. Just type: make megatest which will run for about a minute and exercise most of the code paths inside CRM114. This tests the version of CRM114 in your local directory. Note that this only works if you've installed the TRE engine. 
The GNU regex engine has enough "fascinating behaviors" that it will get a lot of things wrong; the GNU regex package also doesn't handle approximate regexes at all, and since those are in the test set, you'll error out on each of those as well. If "megatest" reports any differences between the supplied "megatest_knowngood.log" and your own results, OTHER than on lines tht say "OK_IF_blahblahblah" results, please file a bug report to me and we'll figure out what went wrong. If you are happy with the executable, type make install This will install the executable into /usr/bin/crm (by default). If you want another install location, you can change it in the Makefile. You can now check to see that the install version by: crm -v and CRM114 will report back the version of the install. You can also run a quick "Hello, world!" by typing: crm '-{ output /Hello, world! This is CRM114 version :*:_crm_version: .\n/}' then hit ^D (end-of-file on *nix). You;ll get back a response like: Hello, world! This is CRM114 version 20040118-BlameEric . Congratulations! You've now completed the installation of CRM114 and utilities from source. Move on to the next step - "Step 3: Setting Up Your .CSS Files" . ------------------------------------------------------------------------ ------------------------------------------------------------------------ Step 3: Configuring Mailfilter or Mailreaver In this step you will tell Mailfilter or MailReaver what you want it to do with your mail. All of the options are controlled by editing one file, named "mailfilter.cf" . Mailfilter and MailReaver use most of the same flags (and all of the same important ones) so both use the same mailfilter.cf file. By default, both Mailfilter and Mailreaver look for the file mailfilter.cf in the initial directory. If you want to change that, use "--fileprefix=/some/where/else/" on the command line, so these filters will look for mailfilter.cf (and the other runtime filtering files!) 
in the "/some/where/else/" directory. This --fileprefix mode is handy when you are setting up many users. (remember to use a final closing slash on the directory name or you will end up nowhere) The format of mailfilter.cf itself is pretty simple. 0) blank lines are OK. 1) comments start with a # in column 1. 2) Anything not a comment is a var setting, in the format: :var_to_set: /Value_to_set_goes_here/ All of the user-settable configuration vars have setup lines in mailfilter.cf, and you only need to change three lines for a "default" average setup: one is a password you make up, and the other two have only a few possibilities each, and we list those possibilities for you. The Three Things you MUST do in mailfilter.cf : 1: First, you MUST change the secret password. This is defined near the top of the file. Your password may contain a-z, A-Z, 0-9, but no blanks or punctuation (at least for now). You _must_ set this password to something not easily guessable. If you don't set it, you won't be able to use mailfilter's remote commanding facility. 2: Second, you MUST set whether to use base64 decodes, or not, and if so, which decoder your system supports. Just type the options into BASH, one at a time (like "mewdecode ") and use the first one that doesn't give you an error message. 3: Third: you MUST set the cache_dupe_command according to whether your system supports linking (as in, has an "ln" command; *NIX does, but Windows doesn't) or whether full copies of texts need to be used in the reaver. Other than that, everything else in the mailfilter.cf file can be left alone, at least for initial testing. At first, you will probably want to leave the "log_to_allmail.txt" enabled while you get used to CRM114. Likewise, leave "log_rejections" set to yes as well; that way you can easily see (with "less" or "tail") just what is being rejected. Once you get more experience with CRM114, you can set these to "no" and not use up disk space in these "extra safety" logs. 
You can skim-read the rest of mailfilter.cf . There are three typical cases for most users: 1) If you ARE using Procmail or another filtering MDA: --> You probably will NOT need to change any of the other options. 2) If you ARE NOT using Procmail, but your mail reading program can sort out email into folders based on whether the SUBJECT header contains the telltale string "ADV:" (most mail readers can do this): --> You probably will NOT need to change any of the other options. 3) You are NOT using Procmail, and your mail reading program is "dumb" (cannot sort email into folders based on subject line): --> You probably will want to define a separate account that will recieve all spam caught (otherwise, you'll just get all your spam delivered as usual, with additional headers telling you it was spam). To do this, look down to ":general_fails_to:". Insert the full username@domainname.tld mail address where you want your spam to be sent. Note on mime decoders: There are a number of them available; the defaults given in mailfilter.cf may or may not be valid on your system. Further, it may have a different path than the default given in mailfilter.cf. Yet further, you may want to load your own, like "normalizemime" (see the crm114.sourceforge.net web page for details on the download). You can also configure the verboseness (or not) of your filtered results. You can go from "no changes" (not even a statistical label in the headers) to complete results including an expansion of any base64 texts and HTML decommented strings. Feel free to change things to get the look and feel you want; after all, what good is open source if you don't change it? :-) HOWEVER, Please don't muck with variables that aren't in the mailfilter.cf file. "You make a mess, you clean it up." :-( After making these changes, write out "mailfilter.cf". You may later go back and change the configuration options, but the options as already set are good for most users. 
You do not need to do anything to "load in" the new options, as CRM114 reads them in fresh from the file during initialization for each email. Now, proceed to "Step 4: Setting Up Other Needed Files" . -------------------------------------------------------------------- -------------------------------------------------------------------- Step 4: Setting Up Other Needed Files Now that the crm114 language is working, you need to set up your .css files, your rewrites.mfp file, and your priolist.mfp file. All of these files need to exist (either by being there, or by being symlinked to) the directory where CRM114 will "run in" when an actual mail comes in. Usually this is your per-user directory on the mail server (if your mail server is also your home directory, then it's there.). If this is inconvenient, you can use the --fileprefix option on the command line to tell CRM114 to "change over" to a different directory. The files that need to be in the home (or --fileprefix) directory are: rewrites.mfp spam.css nonspam.css priolist.mfp blacklist.mfp [ only for mailfilter; mailreaver ignores it ] whitelist.mfp [ only for mailfilter; mailreaver ignores it ] Here's a quick overview of these files; we'll get into the details further on. If you are in a hurry, you can have *empty* files for all four of the .mfp files and things will still work reasonably well (and you can upgrade later). You DO need to create the proper-sized .css files, though, or you won't be able to classify email at all (depending on your setup, it may be discarded, may be returned to sender, or may actually just get mangled and forwarded. None of these are a good thing, in the long run) --- Summary of each file --- [[ rewrites.mfp ]] The rewrites.mfp file controls how to "rewrite" incoming email so that your incoming email conforms more closely to what might be considered "archetypal". 
The rewrites.mfp setup is optional; if you build your own .css files (either from empty files, or from corpora) you can actually replace rewrites.mfp with an empty file; you just won't be able to share your .css files with anyone else. [[ spam.css and nonspam.css ]] The .css files themselves ( CRM114 Sparse Spectra files) are the "memory" that crm114 uses to statistically describe the words and phrases that characterize various kinds of mail. Although it depends on the classifier you are using, by default the .css files are in a hashed binary format and it is not easy (or sometimes, even possible!) to reconstruct your email from the .css files. However, it *is* possible to determine from the .css files if certain words or phrases have ever been trained into your classifier, so .css files do have some possible security implications. DMCA note: CRM114 statistics files are "effectively encrypted" according to the provisions of the DMCA - all parties are hereby notified that the copyright owner/author of any particular statistics file (.css , .cfc, .cor, .cwc, .chp, or other) is the creator of that file, not the author(s) of CRM114 itself, and said creator may invoke the draconian punishments of the DMCA on any party attempting to extract the encoded information without prior approval. So there. [[ priolist.mfp ]] The priolist.mfp file is a sequential list of tests to be run; each test starts with a + or a - (thumbs up or thumbs down), then a regex pattern; if the pattern matches, the mail is either accepted unconditionally or sent to the spam bucket unconditionally. Then, the blacklist.mfp, and whitelist.mfp are "match this, you're spam" and "match this, you're good" regex pattern sets. If this seems redundant, you're right; all you need is priolist.mfp, but enough folks have historically requested "blacklists" and "whitelists" as an explicit marketing checkoff that we've put them into mailfilter.crm. 
Priolist.mfp is the preferred method of doing blacklists and whitelists now; if a P.H.B. asks "does it have blacklists and whitelists", you can now say "yes, and they're even _prioritized_ blacklists and whitelists!". Step 4 Part 1 - Setting up the Rewrites file. To set up the rewrites.mfp file, edit the file "rewrites.mfp" and replace the placeholders (in this case, "wsy", "merl.com", and "mail.merl.com") with your corresponding username, domain name, and mail server information. These rewrite rules will be used to "scrub" your sample text of user-specific strings. (note that this is only strictly necessary if you want to use the pre-built .css files. However, it is in general recommended, so that you can "share/merge" your .css files with your friends.) Note the "arrowheads" in the file. They look like this: >-> or >--> This is a rewrite operator. Anything that matches the regex on the left-hand side of the arrowhead will be replaced with the text on the right-hand side of the arrowhead. (the "arrowheads" that have one hyphen in them will rewrite only if the entire left-hand match is found on a single line; if you use two hyphens, to make a ">-->" instead of ">->" then the left-hand match can be multi-lined.) Example: if your name was Agent Smith, your email account AgentSmith@the.matrix.net, and your mail router was mail.matrix.net at IP address 192.168.10.5, then the rewrites.mfp file should look like: AgentSmith@the.matrix.net>->MyEmailAddress [[:space:]]Agent Smith>-> MyEmailName mail.matrix.net>->MyLocalMailRouter 192.168.10.5>->MyLocalMailRouterIP The idea is to turn your email headers into headers that don't refer to any of your own actual name, address, etc, but contain only the strings "MyEmailAddress", "MyEmailName", "MyLocalMailRouter", and "MyLocalMailRouterIP". If you have more than one incoming email name , email address, server, router, etc, add lines in rewrites.mfp for each email name, email address, server, router, and so forth. 
This is something you really _should_ do, if you have more than one email path leading to an account that is being filtered by CRM114 (if you don't, a lot of learning will have to be repeated for each path, which will cost you accuracy and use up valuable feature slots in the .css files that you could use in more valuable ways otherwise. On the other hand, if you have multiple email addresses that all channel through one CRM114 fileset, and the addresses receive very different ratios of spam and nonspam (or, very different *types* of spam), then it _might_ be to your advantage to not use rewrites.mfp, (just replace it with an empty file), so that the extra statistical information of the incoming email address is not lost) If all this confuses you to no end, just make rewrites.mfp be an empty file and everything should work decently well. ----- Step 4 Part 2 - Setting up the .CSS files You have a choice here. You can either build your own files from your own spam and nonspam email, or you can use the pre-learned .css files available from crm114.sourceforge.net . We recommend that you build your own files dynamically, as that will result in the best final accuracy. In either case your .css files should be in the same directory as your mailfilter will "run" in (as we mentioned above, default is your home directory on your mailserver). The particular directory that the mailfilter "runs" in is variable and depends on your local setup. Assuming you will use the ".forward" hook, there are two likely situations. If your mail service runs on your local machine (say, you have just one machine - and I do hope you have a firewall in that case), then mailfilter will almost certainly "run" in your home directory- the directory you're in when you log in. If your mail service runs on a mail server (not your local machine), then you will probably have a "home directory" on that machine as well, and that's the directory that the mail filter will run in. 
If neither of these is the case, you should ask your system administrator what the correct directory is. ----- Step 4 Part 2 Method A - Build Your Own Empty .CSS Files This method will give you the best final accuracy, but you will spend more time training. This is the recommended method for users wanting the best accuracy. To start from scratch, you need to create empty .css files. The cssutil program will do that for you. Just type: cssutil -b -r spam.css cssutil -b -r nonspam.css and you will have created _empty_ spam.css and nonspam.css files in your current directory (that is, the files are full-size, but contain no information. They'll be full of binary zeroes). Once you have these empty files you will have a high (50% or so) error rate for the first few hours, till you have 'taught' CRM114 what your particular mix of spam and nonspam looks like. Proceed below to "Step 4: Configuring Mailfilter". Many people want to "preload" their spam collection into CRM114. This used to be a bad idea. CRM114 is optimized for TOE learning - "Train Only Errors" learning; testing something like a quarter of a million test cases has proven that it is better to train only errors, and _only_ _as_ _they_ _occur_, than to preload a bulk database into CRM114. Note that the previous paragraph says "used to be". The new program "mailtrainer.crm" can do rapid TOE or DSTTTR training and build your .css files out of stored spam and good mail collections. You can read all about mailtrainer.crm in Appendix 1 of this document. If you're wondering, the statistics from the "torture test" (about 40,000 messages) are that training _only_ errors, in realtime, will give about 2.1 times better accuracy than force-training a big corpus, even if the messages are the same messages and presented in the same order. The "why" is mathematically complicated, but there's an intuitive description in the FAQ. 
Again: you will achieve the best possible accuracy if you let CRM114 itself make errors that you correct in real time. ----- Step 4 Part 2 Method B - Pre-LEARNed files: This is the simplest method, but less accurate than method A. If you choose to use the pre-learned .css files, you need to download the appropriate crm114 .css.tar.gz file, and then you can just type: tar -zxvf crm114-.css.tar.gz and you'll get the two files "spam.css" and "nonspam.css" in your current directory. Note that the download is fairly large - between 8 and 50 megabytes, and although this will give you a good starting point for your own statistics, you will have a better (smaller, faster) final configuration if you build your own .css files from scratch. ----- Step 4 Part 2 Method C - BETA TEST - Using mailtrainer.crm to Build .CSS Files New in 20060101 is the "mailtrainer.crm" program. This program accepts two directories of "archetype" good and spam email, and runs an iterative training procedure to produce some very high quality .css files from these examples. The example files need to be "SMTP Virgin" - that is, exactly what was received at SMTP time by your mail server, with _nothing_ changed. (any changes will affect accuracy, probably negatively) The mailtrainer training will typically take something like 1 to 10 minutes per 1000 messages in your training set. Mailtrainer.crm will create your spam.css and nonspam.css files automatically. Mailtrainer.crm will also read your mailfilter.cf configuration file, and rewrites.mfp, so be sure to set up those files _first_ (if you're doing things in order, you're in good shape). The full description of how to use mailtrainer.crm is in Appendix 1 at the end of this document. So, jump there, read Appendix 1, run mailtrainer.crm, and then proceed to the next section- checking your .css files. 
----- Step 4 Part 2 Method D - ALPHA TEST -- MAKEFILE Build And Preload .CSS Files From Fresh Spam and Nonspam CAUTION - this applies ONLY to kits 20060606 and later!!! DO NOT DO THIS if you are running a pre-20060606 makefile! It will hose you! If you, by any chance, happen to have un-altered examples of spam and nonspam, you can use these to pre-build a set of .css files. (As of versions 20060606 and later ONLY. Previous versions had a bad implementation of this that took different arguments and tended to produce bloated .css files that didn't function well. Post 20060606, the mailtrainer system is used and that works very well indeed) You also need to be sure your emails are "SMTP Virgin" - that is, they are exactly as received at SMTP time, not with headers or footers added or taken out by your mail delivery agent or your mail reading program. (if this isn't true, the headers will be rather bogus and you will lose significant accuracy and you should use method A above instead). If you are OK with this, here's what to do: 1) Put copies (or symlinks/hardlinks) to all of your example spam into a subdirectory named spam.dir in the local directory. 2) Put copies (or symlinks/hardlinks) to all of your example good email into a subdirectory named good.dir in the local directory. 3) IF you want to train from scratch (not necessarily good or bad, but your option... choose well): rm -rf spam.css rm -rf nonspam.css 4) Invoke the mailtrainer via "make cssfiles" to build your new spam.css and nonspam.css files. That's all. It'll take a few minutes to run but mailtrainer will give you running status so it's not like things have hung. Again, let me emphasize that doing this is ONLY recommended on full installs post 20060606 . Versions prior to that will hose you if you do this. 
-------- Step 4 Part 3 - Checking your installation Once you have set up mailfilter.cf, rewrites.mfp, the *list.mfp files, and the .css files, you can test your configuration by typing the following (The '^D' at the end is a control-D, which is an END-OF-FILE on Linux. Other systems may use a different END-OF-FILE character): ./mailfilter.crm This is a test. Just type a few lines of text that you might ordinarily get, like a short rant on why Perl is useless for big projects, or why Linux is superior or inferior to NetBSD. ^D or (to use mailreaver instead) ./mailreaver.crm This is a test. Just type a few lines of text that you might ordinarily get, like a short rant on why Perl is useless for big projects, or why Linux is superior or inferior to NetBSD. ^D If you have set up Mailfilter for Procmail-style filtering you will always get a small report back saying something like either of these (the actual numbers and some minor text strings will change, but you should have something that _vaguely_ looks like the following): From foo@bar Thu Sep 18 19:20:35 2003 X-CRM114-Status: Good ( pR: 12.630237 ) ** ACCEPT: CRM114 PASS SBPH/BCR TEST** Probabilistic match quality: 1.000000, pR: 12.630237 P(succ): 1.000000e-00, P(fail): 2.342950e-13 Features: 336, S hits : 4313, F hits : 5901 or: From foo@bar Thu Sep 18 19:19:39 2003 X-CRM114-Status: SPAM ( pR: -2.866484 ) ** REJECT: CRM114 FAIL SBPH/BCR TEST** Probabilistic match quality: 0.001358, pR: -2.866484 P(succ): 1.358082e-03, P(fail): 9.986419e-01 Features: 144, S hits : 2337, F hits : 3313 If you are using "mail to spamtrap account" filtering, then you will either get an "accept" report back (the first report above is an "accept") or the text you typed in will be mailed to your spamtrap address. If you don't get a report back, check the spamtrap address and see if your test text ended up there. 
If all the numbers are zero, or the result is "UNSURE", that's OK, it just means there isn't enough statistical information in the .css files yet to actually decide if it's spam or not. This is a good situation. If you don't get _either_ of the above, something is broken, either in your installation of CRM114 or in your configuration file. You need to fix the problem before you engage Mailfilter. If your installation and configuration passes the above test, congratulations! You have now configured mailfilter.crm . ----- Step 4 Part 4 - OPTIONAL - CHECKING YOUR .CSS FILES For all three (four?) methods of setting up your .css files, you can check that the .css files are reasonable. Use the "cssutil" utility. Note: this works fine for the default classifiers like Markov, OSB, and OSB Unique, but _not_ for Winnow, Hyperspace, or Correlative classifiers; for OSBF classifiers use osbf-util instead of cssutil. Type in: cssutil -b -r spam.css cssutil -b -r nonspam.css You should get back a report something like this: Sparse spectra file spam.css statistics: Total available buckets : 1048576 Total buckets in use : 506987 Total hashed datums in file : 1605968 Average datums per bucket : 3.17 Maximum length of overflow chain : 39 Average length of overflow chain : 1.84 Average packing density : 0.48 Note that the packing density is 0.48; this means that this .css file is about half full of features. Once the packing density gets above about 0.9, you will notice that CRM114 will take longer to process text. The penalty is small at packing densities below about 0.95 and only about a factor of 2 at 0.97 . Note - do NOT believe "ls -la" with respect to .css files! 
Because CRM114 uses memory mapping instead of file I/O (because it's much faster to go through the page-fault tables than through the file I/O system), the m-time (time last modified) and c-time (time created) never change, only the a-time (time last accessed), and that even the a-time only changes if your file system had the proper compile-time options to keep track of the a-time, and that defaults to "not keep track". Believe in what cssutil tells you- if new features show up after learning (because the bucket counts change), you _are_ learning and "ls -la" is lying to you! Conversely, if the bucket counts do NOT change, you have a file redirection or file protection problem and your system is NOT learning. That's bad and you need to figure out the problem and fix it. You can also see how easy it will be for CRM114 to differentiate spam from nonspam with your .css files. The utility "cssdiff" will compare the statistical features of two .css files. (again, only for Markov, OSB, and OSB Unique classifiers) Try it: cssdiff spam.css nonspam.css and you'll get back a report like: Sparse spectra file spam.css has 1048577 bins total Sparse spectra file nonspam.css has 1048577 bins total File 1 total features : 1605968 File 2 total features : 1045152 Similarities between files : 142039 Differences between files : 1279964 File 1 dominates file 2 : 1463929 File 2 dominates file 1 : 903113 Note that there's a big difference between the two files; in this case there are about 10 times as many differences between the two files as there are similarities. That's pretty much typical- and it's a good sign that your filtering should be quite accurate. Now, move on to "Step 4: Configuring Mailfilter". 
---------------------------------------------------------------------------- ---------------------------------------------------------------------------- Step 5: Engaging Mailfilter There are two common ways to engage Mailfilter.crm on your incoming mail stream: you can use Procmail recipes and have Mailfilter run as a procmail subprocess, or you can use the .forward hook of Sendmail (and Sendmail clones which also support .forward) In the first method (recommended), you use Procmail's ability to execute a program as part of a Procmail recipe to run CRM114, which adds headers as needed to let Procmail or your mail-reading program do the sorting. In the .forward method, you (or your system manager) must add a link from an execution command of crm114 to the directory /etc/smrsh. This is because sendmail will NOT run any program that isn't "approved" by the system manager (by linking it into /etc/smrsh/whatever). The output of mailfilter is then directly appended to your /var/spool/mail file (or possibly forwarded to your spam-bucket account). ----- Step 5 Method A: For Procmail and Maildrop Users For Procmail users just add a procmail recipe to .procmailrc to run CRM114 and mailfilter whenever your other procmail rules fail to decide what to do. Here's a sample Procmail recipe set. Notice that we actually have TWO recipes - one to actually run crm114 and mailfilter, the other to then sort the mail based on the result. # # :0fw: .msgid.lock | /usr/bin/crm -u /home/my_user_directory mailfilter.crm :0: * ^X-CRM114-Status: SPAM.* mail/crm-spam That's all that Procmail users should need. Mailfilter should now be active - send yourself a test message and see where it ends up. To use mailreaver instead of mailfilter, just put "mailreaver.crm" in instead of "mailfilter.crm" . If you get the test message, proceed to "Step 6: Training CRM114". 
----- ( note: Sub-Method A-one) If you use an MUA that can highlight on headers, you can use something like this in your procmail (from Philipp Weiss): in .procmailrc CRMSCORE=`$HOME/bin/crmstats.sh` :0fw: .formail.crm114.lock | formail -I "X-CRM114-Score: $CRMSCORE" where ~/bin/crmstats.sh is a simple script: #!/bin/bash grep -a -v "^X-CRM114" | \ /usr/bin/crm -u $HOME/.crm114 mailfilter.crm --stats_only ------ (note: Sub-Method A-two) If you're using maildrop ( http://www.courier-mta.org/maildrop.html ), you can put this in your ~/.mailfilter (from Stefan Seyfried and Joost van Baal) CRMSCORE=`grep -a -v "^X-CRM114" | crm -u $HOME/.crm114/ /usr/share/crm114/ma\ilfilter.crm --stats_only` xfilter "formail -I \"X-CRM114-Score: $CRMSCORE\"" if ($CRMSCORE < -1) { xfilter "formail -I \"X-CRM114-Spam: yes\"" } log "Spam: $CRMSCORE" if (/^X-CRM114-Spam: yes/) { to Mail/spam/inbox } ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- Advanced Topic: Huge Emails and Denial Of Service Avoidance CRM114 has a number of built-in anti-Denial-of-Service (anti-DoS) features; one of them is that it will not grow buffers beyond a certain limit, No Matter What. This default maximum is altered with the -w parameter. However, you may find that you actually receive emails bigger than this limit. In these cases, it is effective to simply filter on the first few tens of kilobytes of incoming text; that will speed things up a lot. [[ Obsolescence note: CRM114 builds prior to about 20050601 need the method described below. After that, mailfilter has the built-in option :decision_length: in mailfilter.cf which defaults to 16000 chars ]] This is easy to do with "head". head -c 10000 gives the first 10,000 characters of input, which is usually adequate for CRM114 to get a good decision on. 
This can be directly piped in right in the procmail command: :0fw: .msgid.lock | head -c 10000 | /usr/bin/crm -u /home/my_user_directory mailfilter.crm :0: * ^X-CRM114-Status: SPAM.* mail/crm-spam ----- Step 5 Method B: The .forward hook file For .forward hook users you should be aware that you should NOT put a direct link to crm in /etc/smrsh; since crm can do arbitrary things, (such as SYSCALL to invoke any command, it'd be like putting BASH there) you ought to attempt to control the damage as much as possible. 1) add a link from /etc/smrsh to crm114's executable binary in /usr/bin by becoming root and typing: cat > /etc/smrsh/crmfilter /usr/bin/crm mailfilter.crm >> /var/spool/mail/your_account_name_here ^D 2) add a .forward file to your account by typing: cat > .forward |/etc/smrsh/crmfilter ^D That's all. The mailfilter should now be active - send yourself a test message and see where it ends up. ---- Once you have engaged CRM114 mailfilter, you now get to train it to recognize spam and nonspam. Proceed to "Step 6: Training CRM114". Note: CRM114 contains a design decision that you may have to play with. Instead of doing memory management games, which both consume significant runtime CPU as well as present a major denial-of-service opportunity, CRM114 has an upper limit on the window size and it simply won't exceed that limit (it gives an error message if an incoming message tries to exceed the limit) You -can- change the maximum memory limit at runtime with the -w nnnnn flag; for example, if you want 100 megabytes of memory available, you can set that with ... -w 100000000 to set 100,000,000 bytes as the hard limit ceiling on per-buffer memory usage. Actual usage may be about five times that number, as CRM114 does a buffer-shuffling dance to minimize time spent reclaiming and compactifying memory. 
--------------------------------------------------------------------------- Step 6: Training CRM114 and Mailfilter One of the great strengths of CRM114 Mailfilter is that it has no preconceived notions of "spam" and "nonspam". It _learns_ what you consider spam, and what you consider nonspam. For the first few days CRM114 will make a lot of mistakes sorting spam and nonspam. It is _very_ important that you train each mistake back into CRM114, otherwise it will never learn what you consider spam or nonspam. You should train in the mistake as quickly as possible. Start one morning and try to train every hour for the first few hours at least. Don't think you're training a computer- pretend you're housebreaking a new puppy. You train mistakes right from your mail reader. There are several ways to do this. Note that you can use mailfilter.crm _or_ mailreaver.crm interchangeably here; the instructions say "mailfilter.crm" but mailreaver.crm works exactly the same way from the user point of view. * Mail-to-Myself with In-Line Commands to retrain (Method A) * shell commands to retrain (Method B) * Mutt direct interface (Method C) * Some Other Interface (Method D) Whatever Way You Train : try to train _approximately_ equal amounts of spam and nonspam. If you are within 50% one way or the other, performance will be very good. If you are running mailfilter.crm: Train only errors! This is called TOE training. (TOE :== Train Only Errors) It's not necessary to train near-misses; experiments show that the performance increase on training near misses is minuscule at best, and may be negative at times. If you are running mailreaver.crm: Some messages may come through with a header that says "I am unsure about this message. Please train it either way." - so do exactly that. This is one reason mailreaver learns faster than mailfilter, and why it's also more accurate. 
It's best for at least the first day or so that you check your mail at least every hour or so and send training information back to CRM114. This will help it rapidly converge on a good set of statistics for your particular mix of spam and nonspam. It will take several days worth of errors for CRM114's mailfilter to approach 95% accuracy, and around two weeks to a month to reach 99+ per cent accuracy. I usually exceed 99.9% accuracy (less than one error per thousand). Step 6 Method A: Mail-to-Myself The first way is to use the in-line command feature. Just forward the mistake back to yourself, with full headers (except edit out any CRM114-added headers or text). Just before the first line of the text to be "learned" as spam or nonspam insert a COMMAND line. Everything from the command line to the end of the message will be learned (so edit the text to remove things you _don't_ want considered indicative of spam/nonspam nature). The command line looks like this: command spam or command nonspam (for mailfilter.crm) command good (for mailreaver.crm) The "c" in "command" must be in column 1, and you must put your secret password into the command line. Don't use the <> brackets, use JUST your secret password. Examples: If your secret password was "Ihatespam", then the command line to learn something as spam would be: command Ihatespam spam and the command to learn something as nonspam would be: command Ihatespam nonspam (for mailfilter.crm users) or command Ihatespam good (for mailreaver.crm users) [[ Mailreaver users: if you have the cache enabled (which is the default) and the message you mail to yourself contains an intact SFID (Spam Filter ID), either in the Message-Id: field or in the X-CRM114-CacheID: field, then you don't need to worry about editing the text so that extra headers, footers, etc. are removed. The cached version of the message is saved during the first time the message was seen by mailreaver, and so headers, footers, etc. 
that are added by your MDA or MUA or other stuff will NOT affect accuracy. ]] If you are a mailreaver user, you also have a priority system you can access, either by editing your priolist.mfp file directly or by sending yourself email in the following forms (where mypwd is the command password and a_regex_pattern is what will be used for priority matching. Priority matches can occur in both the headers and body of the text.) command mypwd maxprio +a_regex_pattern - sets a maximum priority GOOD command mypwd maxprio -a_regex_pattern - sets a maximum priority SPAM command mypwd minprio +a_regex_pattern - sets a minimum priority GOOD command mypwd minprio -a_regex_pattern - sets a minimum priority SPAM command mypwd delprio a_regex_pattern - deletes the first priority list entry that fully matches the regex pattern Step 6 Method B: Shell commands to retrain >> For mailfilter users (mailreaver is different - skip to below!) << The second way to train in spam and nonspam is to use mailfilter.crm's shell command line options. When you find a spam that was mistakenly accepted as good mail, pipe it through mailfilter.crm with the "--learnspam" flag set, like this: bash> mailfilter.crm --learnspam < the_spam.txt Likewise, if you get an email that was falsely classified as a spam, pipe it through mailfilter with the "--learnnonspam" flag set, like this: bash> mailfilter.crm --learnnonspam < the_NON_spam.txt (yes, if you have a scriptable mail reader, you can put these functions right on the menu bars somewhere. Yes, that's a hint. :) ) [[ If you are using mailreaver.crm instead of mailfilter.crm, and cacheing is enabled, you don't even need to pipe the full text in, all that's needed is either the intact X-CRM114-CacheID: line or the Message-ID line containing an intact sfid. That's another reason to switch to mailreaver! :) ]] >> For mailreaver.crm users << You're in luck, assuming you have taken the default and left cacheing turned on. 
All you need to pipe into mailreaver for training is any text or text fragment containing an intact X-CRM114-CacheID: line or the Message-ID line containing an intact sfid; mailreaver will go get the exact incoming text of the message and train it, so you don't need to worry about munged headers. The command looks like this: crm mailreaver.crm [options] < some_text.txt The command options you have available in mailreaver command line are: --spam - train the incoming text as SPAM (if there's a recognizable cacheid, use the cached msg). --good - train the incoming text as GOOD (if there's a recognizable cacheid, use the cached msg). --cache - default is to train using the text stored in the reavercache. Use --cache=NO to *not* use the cached version, if for some reason you don't want to. --dontstore - default is that every incoming message that isn't a training message (that is, --spam or --good) is put into the cache. Use --dontstore to not put into the cache (for example, "seekrit" users who aren't allowed to train or who might get msgs that you don't want archived). --stats_only - Don't do a full report or forwarding, just report the pR value on stdout. This is a value between (roughly) -1000 and +1000 where negative values indicate spammyness and positive values indicate goodness. For a simple test, just look at the first nonblank character. If it's a "-" sign, the input was spam. Because there's no other output, --stats_only forces --dontstore. --outbound This message is "outbound" - that is, known to be good. If it would classify as spam, train and cache it. Otherwise, no action. --undo To the extent possible, undo a training with this text (cached will be used if possible). --undo requires either --spam or --good as well. --fileprefix=dir Assume that the config file "mailfilter.cf" and the .css files are in directory "dir". Remember to use a final closing slash on the directory name, e.g. /my/home/dir1/ instead of /my/home/dir1. 
Otherwise, the filename will be spliced together from the last component of your --fileprefix and the nominal names, and you almost certainly don't want that. --config=file Don't use mailfilter.cf as the configuration file; instead use the file so noted. Part 6 Method C: For Mutt Users (Contributed by Mathieu Doidy and Joost van Baal:) In your ~/.muttrc, put: macro index \es "crmlearn --learnspam\n=spam/done\n" \"crm114 learn as spam, save in spam/done" macro index \eh "crmlearn --learnnonspam\n" "crm114 learn as ham" where crmlearn is this script grep -a -v "^X-CRM114" | \ /usr/share/crm114/mailfilter.crm -u $HOME/.crm114/ $1 | \ grep -a "^X-CRM114" Now you have two new macros in the Mutt index menu: * esc-s will tag a message, falsely classified as ham, as spam, * esc-h will tag a message, falsely classified as spam, as ham. Part 6 Method D: Some Other Method There are at least five other ways to retrain CRM114. Some interface with common mail readers, some are command line tricks. Rather than catalog them here (which would quickly go out of date) you should go to the CRM114 web page (crm114.sourceforge.net) and browse the list of applications under "Cool Stuff". Some of these are plugins, some are web-based MUAs, and some are entirely new mail filters. What To Do if CRM114 says "LEARNING UNNECESSARY..." --------------------------------------------------- Occasionally, some CRM114 configurations may refuse to learn an error, claiming that it "got it right the first time" (yes, this is a subtle bug that is not allowing itself to be found, but there is reason to believe it has to do with the interaction of mail clients and headers and that some mail readers are lying to the user when they claim they are forwarding with full headers). While we applaud this self confidence, the error is still there, so you need to "force" the learning. You can do this either from BASH or from the mail-to-yourself command line. 
For BASH, add "--force" to the command line; for mail-to-yourself commands, just add "force" From BASH, add --force to the command line: # mailfilter.crm < the_error_text --learnspam --force for mail-to-yourself, add "force" to the command line: command mysecretpassword spam force (and similarly for nonspam). The training files "spamtext.txt" and "nonspamtext.txt" ------------------------------------------------------ [[ Note: this section is becoming obsoleted by the reavercache, which does more, better, and easier. ]] Whenever CRM114 learns a new spam or nonspam, it not only modifies the .css files, but it also keeps the source text of that learning in the files "spamtext.txt" or "nonspamtext.txt". These two files can be considered the "source code" of your .css files; they're all you really need to rebuild your .css files if/when you upgrade CRM114 and the .css file is changed but the algorithm is similar. For example, upgrading from Markovian filtering (the default) to Winnow or OSBF is "incompatible", and you might want to start with these files as a kickstart. ... but not necessarily; some filtering is radically different than Markovian; as we add new filters as technology moves forward, sometimes we will be able to kickstart, and sometimes we can't. - for upgrades that can use the current .css files, we will say so; - for upgrades that cannot use the current .css files, but *can* get kickstarted from spamtext.txt and nonspamtext.txt, we will say so; - for upgrades that are radically different enough that you must relearn from scratch, we will say so (and have you rename your old spamtext and nonspamtext files so that they will not be accidentally reused. If your mail system is so short of disk that you cannot afford to keep these (relatively) small files, then you may either delete them or symlink these files to /dev/null; you don't absolutely *need* them. 
These files are quite small though- I have been running CRM114 for nearly five years now and my *total* example text sizes are 678 Kbytes for nonspam and 893 Kbytes for spam (after something like five years of daily use and about a gigabyte of email). ----------------------------------------------------------------------- Step 7: Adding Priority Lists, Whitelists, and Blacklists If you really want, you can add white, black, and priority lists to CRM114. Most people don't need them, but there are always exceptions. [[ Note to mailreaver.crm users - mailreaver.crm uses ONLY the priolist.mfp, and does NOT support whitelist.mfp or blacklist.mfp. This really is no loss of functionality, because anything you can do with a whitelist or blacklist, you can also do with a priolist, and more besides. ]] For example, your lawyer, your boss, and your paramour all probably rate being on your "whitelist", so whatever they send to you is always marked "nonspam". Likewise, your ex-girlfriend/boyfriend, your nagging acquaintance, and the stalker from the library should all get blacklisted. Whitelisting, blacklisting, and prio-listing are all based on regex matching. If the regex you put in the file "whitelist.mfp" matches the incoming mail _anywhere_, the mail will be marked "good" no matter how it scores statistically. Similarly, if the mail matches any regex in "blacklist.mfp", the mail will be marked as "spam", no matter how it compares statistically. Note that sometimes this can cause considerable confusion, for example "ac.com" in a whitelist will not just match "billing.ac.com", but also "drac.complete.viagra.sales.com" (the match being the 'ac.com' in "drac.complete"). To prevent this, use ^ and $ to "anchor" the start and end of the regex, if possible. Lastly (well, actually firstly, because prio-listing happens before whitelisting or blacklisting) any mail that matches any regex in priolist.mfp is marked as "good" or as "spam" according to that regex's + or - prefix. 
The format of priolist.mfp is that the first character on the line is a + or a -, which indicates "whitelist" or "blacklist", and the rest of the line is a regex. These regexes are tested in the order given in the file. An empty file is perfectly acceptable. For examples of how to set up the whitelist, blacklist, and priolist files, see the included "whitelist.mfp.example", "blacklist.mfp.example", and "priolist.mfp.example". Note: for my accuracy tests, I *turn off* whitelists, blacklists, and prio-lists. Be sure to test any whitelist, blacklist, or other list that you add, otherwise you may get a rude surprise some day. ---------------------------------------------------------------- Step 8: Useful Utilities You don't _need_ to know the stuff in this section to set up and use CRM114 and mailfilter or mailreaver, but it might be useful to you- or at least satisfy some of your curiosity. There are three utilities for dealing with the .css files (these are the files that contain the "learned information"). The utilities are: cssutil - gives you a readout of the characteristics of the information in a .css file cssdiff - gives you a summary of the differences between two .css files (handy for seeing learning!) cssmerge - merges two .css files into one; handy for importing new data into a .css file. Note that this is a destructive operation on the first .css file named! The cssutil utility: Usage is cssutil somefile.css which will give you statistics on the file somefile.css. You can then rescale, clip, and otherwise manage your .css files. It is especially useful to check the "Average Packing Density" of the .css files you use; when it approaches .7 to .8, you may want to consider enlarging your .css file. 
To do that, see below on "Enlarging a .css file" Here's the -h help: Usage: cssutil [-b -r] [-s css-size] cssfile -h - print this help -b - brief; print only summary -r - report then exit (no menu) -s css-size - if no cssfile found, create new cssfile with this many buckets. -S css-size - same as -s, but round up to next 2^n + 1 boundary. The cssdiff utility ------------------- To get the difference between two .css files, use ./cssdiff somefile.css anotherfile.css which writes out a summary of how two different .css files are. The cssmerge utility -------------------- To merge two .css files, use cssmerge . ./cssmerge outfile.css infile.css Note that this is _destructive_ to outfile.css, so make a copy somewhere else first. You _CAN_ merge two .css files of different length. You can also expand (or contract) a .css file this way: rename the old file, and allow a new one to be created with learnspam or learnnonspam while using the '-s nnnnnnnnn' s(lots) flag to set the number of feature slots desired in the new file. Then cssmerge your old file into the fresh new file, and all is well. Here's the cssmerge help: Usage: cssmerge [-v] [-s] will be created if it doesn't exist. must already exist. -v -verbose reporting -s NNNN -new file length, if needed Enlarging a .css file --------------------- One of the advantages of CRM114 is that the .css files are relatively small and of fixed size; they don't grow out of control and never need trimming if you use , which is the default. The disadvantage of this is that if your spam/nonspam discrimination is too convoluted, it won't be able to sort them out ( in trek-speak this is a high-order nonlinearity in the discrimination function ). The fix in this situation is to increase the dimensionality of the feature space. The number of dimensions is about 1/12 the number of bytes in the .css files; this works well at about a million dimensions (12 megabytes) for most people. 
But if you're not most people, you may need to (eventually) increase it. You can tell when this is necessary- running cssutil will give you a utilization and percentage of slots full; when that gets up near 95 percent, you may be running low on space and old features will be erased to make room for new features (that is, your feature set will dynamically evolve in real time to find what works.) However, that's slow and may cause a slight loss of accuracy. One way to fix this is to "increase the dimensionality of the discrimination hyperspace" (no, I am not making that phrase up). It means to add new slots to the .css files. The easiest way to do this is to 1) use cssutil to create a temporary, empty, larger .css file 2) merge the data from the old, small .css file onto the new big file. 3) copy the new big file over the old, small file. You can even combine steps 1 and 2, because newer versions of cssmerge will create a new file if needed (the -s N flag sets the number of slots in the new file; -S N does the same thing but rounds up to a 2^N+1 boundary, which is recommended ). For example, here's how to increase the size of the spam.css file from 1,000,001 slots (the default) to 2,000,001 slots. Just type: cssmerge temporary.css spam.css -s 2000001 mv temporary.css spam.css The newly replaced spam.css will have all of the features of the old spam.css file, but will be 2000001 slots long instead of the default 1000001 slots. -------------------------------------------------------------------- -------------------------------------------------------------------- APPENDIX 1 Using mailtrainer.crm New (as of 20060117) is the training program mailtrainer.crm . This program will take directories of spam and nonspam files, and iterate over them to build (or improve) a set of .css files for you. ***** WARNING WARNING WARNING ***** Mailtrainer.crm (and the documentation for it) is BETA QUALITY. There are very likely some very amusing bugs. Be warned !!! 
Archive your data and your .css files before using mailtrainer.crm. Really! ***** WARNING WARNING WARNING ***** Mailtrainer by default uses whatever settings are in your current mailfilter.cf file, so you'll get .css files that are optimized for your standard setup including mime decoding, normalization, classifier flags, etc. However, this means you *must* set up your mailfilter.cf file FIRST, before you run mailtrainer. Mailtrainer.crm uses DSTTTTR (Double Sided Thick Threshold Training with Testing Refutation) which is something I didn't come up with (Fidelis is on my list of suspects for this). The good news is that this can more than double accuracy of OSB and similar classifiers. It is safe to run mailtrainer.crm repeatedly on a .css fileset and training data; if the data doesn't need to be trained in, it won't be (unlike the old "make cssfiles" command, which forces everything in whether it is useful or not). This is a big improvement and minimizes .css file bloating. "make cssfiles" has now been fixed to use mailtrainer.crm. The example files in each of the spam and good directories need to be one example per file. The closer these files are to what mailfilter.crm will see in real life the better your training will be. Preferably the headers and text will be complete, intact, and unmutilated. The closer these examples are to what SMTP will show "on the wire" the better. If you use a mail reader that puts your "good" and "spam" emails as separate files in two different directories (or can hack up a script to do that) then you could even run mailtrainer.crm automatically every night to optimize the .css files to your current profile. If you do this, your script needs to guard against situations where you haven't checked your mail in a few days and errors crept in; for safety your script should only add the files to the training directories until you have hand-checked them (or at least tacitly agreed). If you find you've made a mistake, don't worry. 
It's recoverable. Just put the misplaced files into the correct directory and rerun mailtrainer.crm . That will re-optimize the .css files (though some low-value features may be swept away). Alternatively, if you start out keeping each and every file that you've trained, you can just delete the erroneous spam.css and nonspam.css files and re-run mailtrainer.crm to get correct .css files. It's OK to have the spam and good directories just be full of links (either symlinks or hardlinks) to the actual spam and good mail files (that's what I do). NOTE: mailfilter.crm doesn't (yet) understand how to build and maintain the spam and good email directories. NOTE 2: It is at this point unknown whether it's a good idea or a bad idea to run mailtrainer on the probably good and probably bad emails (which end up in the reaver cache as .../prob_good/whatever and .../prob_spam/whatever), or just on those that are in the thick threshold zone. If anyone gets good data on this, let me know please. ----- Mailtrainer Options --- The mailtrainer.crm options are as follows. You *must* provide --spam and --good; the other flags are optional. Required: --spam=/directory/full/of/spam/files/one/per/file --good=/directory/full/of/good/files/one/per/file These define the directories or files to be learned. If these end with a slash, it means a directory and all of the files within are used, otherwise, it's taken as a file. If the filename contains a wildcard, be sure to enclose it in singlequotes 'like.this' or else BASH will do bad things to it. Note that this is (currently) incompatible with the --random shuffling of training order. Optional: --help - quick synopsis of mailtrainer options. --thick=N - thickness for thick-threshold training- this overrides the thickness in your mailfilter.cf file. Omit it if you want to use the in-file value. 
(10 works well for most classifiers; use 0.1 or less for Hyperspace) --streak=N - how many successive correct classifications before we conclude we're done. Default is 10000. This number should be larger than the total number of sample emails. --repeat=N - how many passes should we go through this corpus before we conclude we're done. Default is 1 --worst=N - run the entire training set, then train only the N worst offenders, and repeat. This is excruciatingly slow but produces very compact .css files. Use this only if your host machine is dreadfully short of memory. Default is NOT to use worst-offender training. N=5% of your total corpus works pretty well, but N=1 will produce the most compact .css files. --random - randomize the training set rather than taking the files in sequential alternating order (one from good, then one from spam). Note that this is (currently) incompatible with a wildcard for selection of good versus spam files. --reload - if we run out of one kind of file (good or spam) before the other, "reload" (start from the first file again) in that category. Default is to simply use only the remaining category for the remainder of the training pass. --verbose - Verbose. Print out more stuff. --fileprefix=directory - use the mailfilter.cf, rewrites.mfp, and .css files in 'directory', rather than in the current directory. --goodcss=somecssfile.css - use this 'good' cssfile instead of the default "nonspam.css" --spamcss=somecssfile.css - use this 'spam' cssfile instead of the default "spam.css" --collapse - collapse the flying output down to scroll less on a TTY. --report_header="some text" - put this at the head of the report --rfile="somefilename.txt" - append (not overwrite!) log to this file. --validate=regex_no_slashes - Any file with a name that matches the regex supplied is not used for training; instead it's held back and used for validation of the new .css files. The result will give you an idea of how well your .css files will work. 
Do NOT put slashes around the regex! Example 1: - We want to create new .css files for our mail filter - We already have presorted directories of good and spam email - We have already set up mailfilter.cf and rewrites.mfp to define our preferred configuration, Then we can use the following incantation to build some nice .css files (not perfect, but not bad). This incantation can all be on one line (remove the '\' backslash characters if you put it on one line), and don't forget the trailing slash for directory names; otherwise mailtrainer will try to train the directory listing itself (and fail, because a directory can't be read like a normal file). Note that you *must* set up your mailfilter.cf and rewrites.mfp files first, before doing this, otherwise you'll generate bad .css files, or possibly get an error! crm mailtrainer.crm \ --good=/your/good/files_dir/ \ --spam=/your/spam/files_dir/ \ --repeat=5 \ --random Example 2: - We want to run mailtrainer.crm against a bunch of examples in the directory ../SA2/spam/ and ../SA2/good/. (This happens to be where the TREC test set is on my computer- your location will be different) - We want to quit when we get 4000 tests in a row correct, or if we go through the entire corpus 5 times. - We want to use DSTTTR, with a training thickness of 5 pR units. - We want to "validate" our training - that is, to hold back some fraction of the training set as test cases. In our case here, we decide we want to use any file name that contains a "*3.*" . These files will be saved up and used as a test corpus instead of for training. Here's the command (this can all be on one line as well; if so, remove the backslashes): crm mailtrainer.crm \ --spam=../SA2/spam/ \ --good=../SA2/good/ \ --repeat=5 \ --streak=4000 \ --validate=[3][.] \ --thick=5.0 This will take about eight minutes to run on the TREC 2005 SA corpus of about 6000 messages; 1000 messages a minute is a good estimate for 5 passes of DSTTTTR training. 
Notes: * If the .css statistics files don't exist, they will be created for you, in the format set up by the mailfilter.cf file. So- be SURE to set up mailfilter.cf first! * If the first test file comes back with a pR of 0.0000 exactly, it is assumed that these are empty .css statistics files, and that ONE file will be trained in to each .css file that returns a 0.0000, simply to get the system "off center" enough that normal training can occur. If there is anything already in the files, this won't happen. * When running N-fold validation, if the filenames are named as in the SA2 corpus in a form of 00123.456789 , there's an easy trick to partition the data into 10 roughly equal sets. Just use a validation regex like [0][.] for the first run, [1][.] for the second run, [2][.] for the third, and so on. Notice that this a CRM114-style regex, and _not_ a BASH-style file globbing as "*3.*" would be. If you use a globbing regex like "*3.*" , then BASH will suck it in and expand it in-line to all of the individual filenames and that won't work. A regex like [chars] is invisible to BASH and so will pass unscathed. * If you want to run N-fold validation, you must remember to delete and rebuild a fresh set of .css files after each run, otherwise you will not get valid results. * N-fold validation does NOT run training at all on the validation set, so if you decide you like the results, you can do still better by running mailtrainer.crm once again, but DO NOT specify --validate. That will train in the validation test set as well, and hopefully improve your accuracy still more. --------------------------------------------------------------------- That's all! If you have errors or updates (or find bugs!) please let me know; the best way is to join the CRM114-general mailing list; it's on the webpage: http://crm114.sourceforge.net and ask there. 
The reason for using the mailing list rather than personal email is that personal email isn't archived, but the mailing list _is_ both archived and read widely, so we not only create a background archive of solutions but you will get a better answer back faster than if you sent the email to me alone. Enjoy, and good luck. -Bill Yerazunis crm114-20100106-BlameMichelson.src/crm_svm_lib_fncts.c0000644000000000017500000012267111321154266020643 0ustar rootwsy#include "crm_svm_lib_fncts.h" // crm_svm_lib_fncts.c - Support Vector Machine //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. //static function declarations static SVM_Solution *svm_solve_init_sol(Matrix *Xy, Vector *st_theta, double weight, int max_train_val); /********************************************************************** *This method is taken from: * Training SVMs in Linear Time * Thorsten Joachims * ACM Conference on Knowledge Discovery and Data Mining 2006 * *For labeled examples {(x_1, y_1), (x_2, y_2), ..., (x_n, y_n)}, the classic *SVM problem is * min_{theta, zeta >= 0} 0.5*theta*theta + C/n*sum_{i = 1}^n zeta_i * s.t. for all i 1 <= i <= n, y_i*theta*x_i >= 1 - zeta_i *where zeta is the vector of slack variables and C is large and positive. *The classification of an example x is * h(x) = sgn(theta*x) *Note that this formulation DOES NOT INCLUDE A CONSTANT VALUE. If you want *a constant value (so that h(x) = sgn(theta*x + b)), you can create that by *adding an extra column to each example with the value +1. 
* *Now define a binary vector c of length n. We will call this a "constraint *vector" and there are 2^n such vectors. *Let x_c = 1/n sum_{i=1}^n c_i*y_i*x_i for any c. *Then Joachims shows that the problem formulation * min_{theta, zeta >= 0} 0.5*theta*theta + C*zeta * s.t. for all c \in {0, 1}^n, theta*x_c >= 1/n*||c||_1 - zeta *where ||c||_1 is the L1-norm (ie the number of 1's in c) is equivalent *to the problem given above. In its dual form this is * max_{alpha >= 0} \sum_{c \in {0, 1}^n} 1/n*||c||_1*alpha_c - 0.5*sum_{c, c'} alpha_c alpha_c' x_c*x_c' * s.t. sum_{c} alpha_c <= C (DUAL PROBLEM) *where theta = sum_c alpha_c*x_c and *zeta = max_c (1/n||c||_1 - theta*x_c *In QP terms (which requires a sign change since we minimize) the problem is * min_{alpha} 0.5*alpha*H*alpha + f*alpha * s.t. A*alpha >= b * where H_{c, c'} = x_c*x_c' (k x k on the kth iteration) * f_c = -1/n*||c||_1 (1 x k) * A is k+1xk with the top row all -1's (corresponding to sum_{c} alpha_c) * and the bottom kxk matrix the kxk identity matrix times -1 * b is the vector 1xk+1 vector with the first entry -SVM_MAX_X_VAL * and the last k entries 0 * *Clearly this form of the problem has an exponential number of contraints. *To solve it efficiently we use a cutting-plane method to decide which *constraints are important. Specifically we do the following: * 1) Solve the DUAL PROBLEM over the current H, f, A, b * see quad_prog.c for how that is done. * 2) Calculate theta and zeta from the alpha_c using the above equations * 3) Calculate the most violated constraint c * c has a 1 in the i^th position if x_i is not classified correctly * by a large enough problem * 4) Check how much this constraint is violated by. 
* This corresponds to zeta' = 1/n*||c||_1 - theta*x_c * If this error is within ACCURACY of the training error zeta * from the last QP run (ie within ACCURACY of "as close as we could get") * we return * 5) Update the arguments to the QP solver * This requires: * Adding a row and a column to H of the dot products with the new x_c * Adding -1/n||c||_1 as the last entry of f * Adding a row and column to A. The top and bottom entries of the new * column are -1 * Adding a new 0 entry to b * We also save x_c so that we can recreate theta from the alpha_c's * * *INPUT: This method takes as input the data matrix X where * X_i = y_i*x_i * Here X_i is the ith row of X, y_i is the label (+/-1) of the ith * example and x_i is the ith example. Ie, if x_i is an example belonging to * class -1, the ith row of X would be x_i multiplied by -1. (In the case where * x_i contains only positive entries, then the ith row of X would contain only * negative entries.) * *OUTPUT: The function returns a solution struct. This struct contains the * vector theta, which is the decision boundary: The classification of * an example x is * h(x) = theta*x * theta is a NON-SPARSE vector. If X originally had a number of nonzero * columns, you need to convert theta into a sparse vector using the * colMap returned from preprocess. If you call this function using the * wrapper function, solve, it will take care of this process for you and * return theta as a sparse vector. * * The solution struct also contains all of the support vectors, which allows * you to restart the learning with a new example without having to relearn * which of the old examples were support vectors. Support vectors are those * vectors on or in violation of the margin (y_i*theta*x_i <= 1 => x_i is a * support vector). * * There is also data in the solution struct allowing the restart of the * solution. The variable num_examples will be set to Xy->rows and * max_train_val = curr_max_train_val - sum_c alpha_c. 
For how these are used * see the comment to svm_solve_init_solution. * * Xy will then only contain those vectors that ARE NOT support vectors. * Note that Xy WILL CHANGE. * *TYPES: We recommend that X be a SPARSE ARRAY to optimize memory caching * but testing has shown that making it a SPARSE LIST doesn't slow anything * down too much. Not sure why you would want to do that though since X is * static and a SPARSE LIST would actually take more memory. * DO NOT CHANGE THE TYPES IN THE FUNCTION. To make this as fast as possible, * I have moved away in some places from using structure-independent calls. * If you, for example, change theta to be anything but a MATR_PRECISE NON_SPARSE * vector it WILL cause a seg fault. * *ACCURACY: This runs until it finds a solution within some "accuracy" * parameter. Here the accuracy on each iteration with new constraint vector * c, current slack variable zeta, and current decision boundary theta * is measured by the function: * delta = ||c||_1/n - 1/n*sum_{i=1}^n c_i*theta*y_i*x_i - zeta * This is exactly the average over the margin violations of all of the * vectors minus the average margin violations already accounted for by the * slack variable. In other words, accuracy is the *average margin violation * unaccounted for by the slack variable*. Since this is MARGIN violation, * NOT necessarily a classification error (we would have to violate the margin * by more than 1 for a classification error), we can set the ACCURACY * parameter fairly high (ie 0.01 or 0.001) and still have good results. * *SV_TOLERANCE: An example x_i is tagged as a support vector if * theta*y_i*x_i <= 1 + SV_TOLERANCE. In general, setting * SV_TOLERANCE = ACCURACY is approximately right since ACCURACY is kind of * how much play we have around the margin. A low SV_TOLERANCE, will lead * to fast inclusion of new examples (because there are fewer support vectors * from old runs), but less accuracy. 
One strategy might be to set * SV_TOLERANCE very low (even to 0), but rerun all seen examples every so * often. * *WARNINGS: *1) This function uses NON-SPARSE vectors of length X->cols * to do fast dot products. Therefore, X should not have a large * number of fully zero columns. If it is expected to, run * preprocess first. * *2) X should contain NO ZERO ROWS. If it may, run preprocess first. * *3) On return Xy contains only those vectors that are NOT support vectors. * Xy WILL CHANGE. * *CALLING SVM_SOLVE: with a new matrix and a null old solution will * preprocess the matrix and feed it to this function correctly. ***********************************************************************/ SVM_Solution *svm_solve_no_init_sol(Matrix *Xy) { return svm_solve_init_sol(Xy, NULL, 0, SVM_MAX_X_VAL); } /************************************************************************* *Solves the SVM problem using an initial solution. As far as I know *(I haven't really done that much research) this approach is novel. * *Note that all of the x_c's we calculated in our old solution still exist *if we simply assume 0's in the correct place for the c's. The only *difference is that now we have more examples so the denominator changes. *Specifically if we had n_old examples before and n_new examples now *we need to update * x_c -> n_old/(n_old+n_new)*x_c *Therefore, if the old decision boundary was theta, the new boundary is * theta -> n_old/(n_old + n_new)*theta * *Now we break alpha, f, and H (see above comment for these definitions) *into two parts: the "old" parts, which we solved for before and the "new" *parts that we have yet to solve. The QP becomes: * min_{alpha >= 0} 0.5*alpha_old*H_old*alpha_old + f_old*alpha_old * + 0.5*alpha_new*H_new*alpha_new + f_new*alpha_new + alpha_new*H_{new, old}*alpha_old * s.t. 
sum_{c} alpha_c <= C *Clearly, the problem almost decouples - the only term involving alpha_new *and alpha_old is the last (note that H_{new, old} = X_{c, new}*X_{c, old}^T). *However, define * theta_new = alpha_new*X_{c, new} = sum_{new c} (alpha_c*x_c) * theta_old = X_{c, old}^T*alpha_old = sum_{old c} (alpha_c*x_c) *When we solved for alpha_old, we assumed that theta_new = 0. How good was *this approximation? Well, the full answer to the problem is * theta = theta_old + theta_new *If we assume that we are adding in just a few new examples to a problem we *have already pretty well learned, ||theta_new|| << ||theta_old||, making *this a good approximation. Therefore, in the incremental learning, we *hold alpha_old constant. This pulls out the terms just involving alpha_old *leaving us with the QP problem * * min_{alpha >= 0} 0.5*alpha_new*H_new*alpha_new + f_new*alpha_new + alpha_new*H_{new, old}*alpha_old * s.t. sum_{new c} alpha_c <= C - sum_{old c} alpha_c *Now you might worry that the term alpha_new*H_{new, old}*alpha_old *is difficult to *calculate but in fact it is quite easy. Consider: * H_{new, old}*alpha_old = X_new*X_old^T*alpha_old = X_new*theta_old *Therefore, just by saving theta_old (which we were doing anyway since it was *our old decision boundary!) we can simply fold the last term into the linear *term using * f' = f_new + X_new*theta_old. *Therefore, the only extra calculation we must do per iteration is * f_c' = f_c + x_c*theta_old. *This is a simple dot product. * *Now what if we didn't have a lot of examples to start with? Does that mess *up this assumption? Possibly. Therefore, we also train on the old support *vectors. This gives the old solution some "input" as well as dealing with *this problem - if our old solution incorporates very few examples they are *likely to be almost all support vectors. Therefore, we will train on them *again. * *In addition, notice that each x_c is weighted by 1/n. 
In this formulation *n is always the total number of examples seen, NOT the current number we *are training on. That means that if our old decision boundary is theta_d, *theta_old = n_old/(n_old + n_new)*theta_d. Thus if n_old is small and *n_new is large, our old solution will not influence the new solution much. *Similarly, as we argued above, if n_old is large and n_new is small, the *new solution is a small addition to the old one. * *There's one other issue: each time we go through this, we drop the maximum *allowed value for the sum of the alpha's. If this value started fairly small *it can get to zero pretty quickly. Therefore, we bottom it out at *SVM_MIN_X_VAL so that every new example will contribute something to the answer. * *A few notes about what the arguments to this function are: * Note that when we add more examples we need to "pretend" as though they * were there all along. This means that any time in the old algorithm that * we divided by 1/n, n needs to increase to include the new examples. * Therefore, st_theta should actually NOT just be the old decision boundary * since that was calculated using the wrong n. If the old decision boundary * was theta_d, st_theta should be * st_theta = n_old/(n_old + n_new)*theta_d * In addition, in THIS algorithm anywhere we multiply by 1/n, we need to make * sure that n is n = n_old + n_new. For that reason we pass in the "weight" * parameter where weight = 1.0/(n_old + n_new). * * We also have that sum_{c new} alpha_c <= C - sum_{c old} alpha_c. * Therefore, we need to remember the boundary on the alpha_c. This is passed * in as max_sum = C - sum_{old c} alpha_c. In summary: * *INPUT: * Xy: The matrix of the NEW examples and the OLD SUPPORT VECTORS multiplied * by their label. * st_theta: The reweighted old decision boundary. If the old decision * boundary was theta_d calculated using n_old examples and we are adding in * n_new examples, st_theta = n_old/(n_old + n_new)*theta_d. 
If you have
 * no old solution, use st_theta = NULL.
 * weight: If the previous solution was calculated on n_old examples and we
 * are adding n_new examples, weight = 1.0/(n_old + n_new). In other words,
 * weight = 1.0/n where n is the TOTAL NUMBER OF EXAMPLES WE HAVE SEEN.
 * max_sum: The sum of all alpha's calculated in the old solution subtracted
 * from the original maximum value max_sum = SVM_MAX_X_VAL - sum_{old c} alpha_c.
 *
 *
 *OUTPUT:
 * The function returns a solution struct. See the comment to
 * svm_solve_no_init_sol for an explanation of that struct.
 *
 *TRAINING METHOD:
 * The incremental method is most error prone in the region where
 * ||theta_new|| ~= ||theta_old||. If you
 * have about the same number of old and new examples, it is almost certainly
 * better and not much (if any) slower to retrain the whole thing than to try
 * to use the incremental method to add those on. This is ESPECIALLY TRUE if
 * the new examples are differently biased (ie many more negative or many more
 * positive) than the old examples.
 * THE SVM IS MOST SENSITIVE TO THE MOST RECENTLY TRAINED THINGS! If you are
 * using an incremental training method, try to mix positive and negative
 * examples as much as possible!
 *
 *TYPES, ACCURACY, SV_TOLERANCE: See the comment to svm_solve_no_init_sol.
 *
 *WARNINGS:
 *1) This function uses NON-SPARSE vectors of length X->cols
 * to do fast dot products. Therefore, X should not have a large
 * number of fully zero columns. If it is expected to, run
 * preprocess first.
 *
 *2) X should contain NO ZERO ROWS. If it may, run preprocess first.
 *
 *3) On return Xy contains only those vectors that are NOT support vectors.
 * Xy WILL CHANGE.
 *
 *4) st_theta is NOT the old solution. It is the old solution REWEIGHTED by
 * n_old/(n_old + n_new).
 *
 *5) Xy should ONLY contain new examples and old support vectors. It should
 * NOT contain previously seen old non-support vectors if st_theta is
 * non-null.
* *CALLING SVM_SOLVE: with an old solution struct and a new matrix will compute * the correct arguments to this function and take care of the preprocessing. * We HIGHLY RECOMMEND that you do that. This function is static because * the arguments to it are complicated! ****************************************************************************/ static SVM_Solution *svm_solve_init_sol(Matrix *Xy, Vector *st_theta, double weight, int max_sum) { unsigned int n, i, j; Vector *row, //a row of XC usually *xc, //the x_c we are adding //Lagrange multipliers - the solution of the QP problem *alpha = vector_make(0, SPARSE_LIST, MATR_PRECISE), //Solution to the SVM. Should be NON_SPARSE for fastest execution. *theta = vector_make(Xy->cols, NON_SPARSE, MATR_PRECISE), //Linear term in the QP problem (-1/n*||c||_c + st_theta*x_c) *f = vector_make(0, NON_SPARSE, MATR_PRECISE), //The L1 norms of the c *l1norms = vector_make(0, NON_SPARSE, MATR_COMPACT), //The constraint vector for the QP problem. The first term of //b is max_sum. The rest are zeros *b = vector_make(1, SPARSE_LIST, MATR_PRECISE); double delta = SVM_ACCURACY + 1, zeta, d, s, dev; //The Hessian H_{c, c'} = x_c*x_c' //You could try to save space by making this compact and leaving out the 1/n^2 //terms, but the numbers in H will quickly exceed 32 bit so it's probably //not worth it Matrix *H = matr_make(0, 0, NON_SPARSE, MATR_PRECISE), //The constraint matrix for the QP problem. The top row of //A is k+1xk where k is the number of iterations. //The top row is all 1's for the constraint sum_{c}alpha_c <= C //The remaining kxk matrix is I_k (kxk identity) to represent alpha_c >= 0 *A = matr_make_size(1, 0, SPARSE_ARRAY, MATR_COMPACT, SVM_EXP_MAX_IT), //The current x_c's we are considering. We make this compact by actually //storing n*x_c. This should be NON_SPARSE for fastest execution. 
*XC = matr_make(0, Xy->cols, NON_SPARSE, MATR_COMPACT); VectorIterator vit; int nz, loop_it = 0, sv[Xy->rows], offset; SVM_Solution *sol; MATR_DEBUG_MODE = SVM_DEBUG_MODE; if (!alpha || !theta || !f || !l1norms || !b || !H || !A || !XC) { if (SVM_DEBUG_MODE) { fprintf(stderr, "Error initializing svm solver.\n"); } vector_free(alpha); vector_free(theta); vector_free(f); vector_free(l1norms); vector_free(b); matr_free(H); matr_free(A); matr_free(XC); return NULL; } n = Xy->rows; //set up the first row of b to be SVM_MAX_X_VAL (ie the constant C) //note that our QP solver takes constraints of the form A*x >= b //so everything is multiplied by -1 vectorit_set_at_beg(&vit, b); if (max_sum > SVM_MAX_X_VAL) { max_sum = SVM_MAX_X_VAL; } if (max_sum < SVM_MIN_X_VAL) { max_sum = SVM_MIN_X_VAL; } if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG) { fprintf(stderr, "Using %d as limit for multipliers.\n", max_sum); } vectorit_insert(&vit, 0, -1.0*max_sum, b); if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG_LOOP) { fprintf(stderr, "Xy = \n"); matr_print(Xy); } if (weight < SVM_EPSILON) { weight = 1.0/n; } while (delta > SVM_ACCURACY && loop_it < SVM_MAX_SOLVER_ITERATIONS) { if (!(loop_it % SVM_CHECK) && delta <= SVM_CHECK_FACTOR*SVM_ACCURACY) { //close enough break; } //run the QP problem if (H->rows > 0) { if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG) { fprintf(stderr, "Running quadratic programming problem.\n"); } run_qp(H, A, f, b, alpha); if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG) { fprintf(stderr, "Returned from quadratic programming problem.\n"); } } //calculate theta //time for loop is N*|W| //theta = st_theta + sum_c alpha_c*x_c vectorit_set_at_beg(&vit, alpha); if (st_theta) { vector_copy(st_theta, theta); } else { vector_zero(theta); } //sum_c alpha_c*x_c while(!vectorit_past_end(vit, alpha)) { row = matr_get_row(XC, vectorit_curr_col(vit, alpha)); if (!row) { continue; } vector_add_multiple(theta, row, weight*vectorit_curr_val(vit, alpha), theta); //for (i = 0; i < XC->cols; i++) { 
//theta->data.nsarray.precise[i] += // weight*(vit.pcurr->data.data)*(row->data.nsarray.compact[i]); //} vectorit_next(&vit, alpha); } //calculate which examples we aren't classifying //with a high enough margin //this gives us our new x_c and also the average margin //deviation over ALL the examples (the variable dev) matr_add_row(XC); xc = matr_get_row(XC, XC->rows-1); //will hold our x_c if (!xc) { //this indicates that something has gone really wrong //probably the original input was corrupted break; } s = 0; nz = 0; for (i = 0; i < n; i++) { //d = dot(theta, example i); d = 0; row = matr_get_row(Xy, i); if (!row) { continue; } d = dot(theta, row); if (d < 1) { //we violate the margin s += d; //add it to our average deviation vector_add(xc, row, xc); //and to x_c nz++; //number of ones in c } //keep track of the support vectors //namely those that are exactly at the margin //or in violation //we will save them in case we need to restart the learning if (d <= 1.0+SV_TOLERANCE) { //support vector! 
sv[i] = 1; } else { sv[i] = 0; } } //this is the average deviation from the margin dev = weight*s; //add a row and a column to H //corresponding to the new XC //and calculate zeta matr_add_col(H); matr_add_row(H); zeta = 0; vectorit_set_at_beg(&vit, f); //loop is |W|*N for (i = 0; i < H->rows; i++) { //d = (weight^2)*dot(matr_get_row(XC, i), xc); d = 0; //s = weight*dot(matr_get_row(XC, i), theta) s = 0; row = matr_get_row(XC, i); if (!row) { continue; } //it's more efficient to do both of these together //enough to have an impact on the running time //since xc, row, and theta are all initialized in this //function, we know what vector type they are //and can access the data directly for (j = 0; j < XC->cols; j++) { d += xc->data.nsarray.compact[j]* row->data.nsarray.compact[j]; s += theta->data.nsarray.precise[j]* row->data.nsarray.compact[j]; } d *= weight*weight; //these are inserts at the end of a sparse vector //will be fast //note that H is symmetrical (yay for positive semi-definiteness!) row = matr_get_row(H, H->rows-1); if (!row) { //bad problems break; } vectorit_set_at_end(&vit, row); vectorit_insert(&vit, i, d, row); row = matr_get_row(H, i); if (!row) { //disaster! 
break; } vectorit_set_at_end(&vit, row); vectorit_insert(&vit, H->cols-1, d, row); //now calculate zeta //this is zeta from solving the QP problem waaaay at the top //of the loop //it is more efficient to calculate it here, but we need to //remember not to incorporate our newest (as yet untrained on) x_c if ((int)i < (int)(H->rows - 2)) { d = weight*(vector_get(l1norms, i) - s); if (d > zeta) { zeta = d; } } } //add a column to f //this is 1/n*||c||_1 + x_c dot theta_old vector_add_col(f); vectorit_set_at_end(&vit, f); if (st_theta) { d = weight*dot(st_theta, xc); } else { d = 0; } vectorit_insert(&vit, f->dim-1, -1.0*(nz)*weight + d, f); //add a column to l1norms //this is the number of non-zero entries in c vector_add_col(l1norms); vectorit_set_at_end(&vit, l1norms); vectorit_insert(&vit, l1norms->dim-1, nz, l1norms); //add a row and a column to A matr_add_col(A); matr_add_row(A); row = matr_get_row(A, 0); if (!row) { //uh oh break; } vectorit_set_at_end(&vit, row); vectorit_insert(&vit, A->cols-1, -1, row); row = matr_get_row(A, A->rows-1); if (!row) { //not good break; } vectorit_set_at_end(&vit, row); vectorit_insert(&vit, A->cols-1, 1, row); //add a column to b (last element is zero) vector_add_col(b); //add a column to alpha //note that the solution to the last iteration is an excellent //starting point for the next iteration for exactly the reasons //that this iterative method works //so just add this column and //don't reset alpha to be anything vector_add_col(alpha); //calculate the accuracy //this is the average deviation from the margin //not already accounted for by zeta //note that we "assume" that old examples we are not training //on STILL don't violate the margin delta = weight*nz - dev - zeta; //print out more debugging information if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG_LOOP) { fprintf(stderr, "theta = "); vector_print(theta); fprintf(stderr, "x_c = "); vector_print(xc); fprintf(stderr, "alpha = "); vector_print(alpha); fprintf(stderr, "zeta = 
%.10lf dev = %lf nz = %d, weight = %lf\n", zeta, dev, nz, weight); } if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG) { fprintf(stderr, "%d: delta = %.10lf\n", loop_it, delta); } loop_it++; } if (delta > SVM_ACCURACY + SVM_EPSILON && SVM_DEBUG_MODE) { fprintf(stderr, "Warning: SVM solver did not converge all the way. Full convergence would have solved to an accuracy of %lf - we solved only to an accuracy of %lf. If this is not accurate enough, increase SVM_MAX_SOLVER_ITERATIONS, decrease SVM_CHECK_FACTOR, or change your training method.\n", SVM_ACCURACY, delta); } //free everything! vector_free(f); vector_free(l1norms); vector_free(b); matr_free(H); matr_free(A); matr_free(XC); //make the solution block sol = (SVM_Solution *)malloc(sizeof(SVM_Solution)); sol->theta = theta; sol->num_examples = n; //store the support vectors sol->SV = matr_make_size(0, Xy->cols, Xy->type, Xy->compact, Xy->size); offset = 0; for (i = 0; i < n; i++) { if (sv[i]) { row = matr_get_row(Xy, i - offset); if (!row) { continue; } matr_shallow_row_copy(sol->SV, sol->SV->rows, row); matr_erase_row(Xy, i-offset); offset++; } } //figure out what the maximum value is next time vectorit_set_at_beg(&vit, alpha); sol->max_train_val = max_sum; while (!vectorit_past_end(vit, alpha)) { sol->max_train_val -= vectorit_curr_val(vit, alpha); vectorit_next(&vit, alpha); } if (sol->max_train_val > SVM_MAX_X_VAL) { sol->max_train_val = SVM_MAX_X_VAL; } else if (sol->max_train_val < SVM_MIN_X_VAL) { sol->max_train_val = SVM_MIN_X_VAL; } //free more stuff! vector_free(alpha); return sol; } /********************************************************************** *Removes zero rows and columns from the matrix X. *If the number of columns of X is an integer (ie X->cols < 2^32) *then this runs in constant time O(ns). * *INPUT: Matrix X from which to remove zero rows and columns. * old_theta: the old decision boundary if you have it. 
this will densify
 * the columns of that boundary so that it can be used with the preprocessed
 * X. If you have no old solution, pass in NULL.
 *
 *OUTPUT: An expanding array colMap mapping between the renumbered
 * columns of X and the old columns of X. Specifically colMap[i] = j
 * if the ith column of X AFTER preprocessing was the jth column BEFORE
 * preprocessing. X remains sparse. The same is true of old_theta if
 * you passed one in.
 **********************************************************************/
ExpandingArray *svm_preprocess(Matrix *X, Vector *old_theta) {
  ExpandingArray *colMap;
  //drop all-zero rows; only the side effect on X is wanted here, so the
  //returned row map is freed immediately
  colMap = matr_remove_zero_rows(X);
  expanding_array_free(colMap);
  if (X->type == NON_SPARSE) {
    //don't densify X if X is non-sparse
    return NULL;
  }
  if (old_theta) {
    //temporarily attach old_theta as the last row of X so its columns are
    //renumbered together with the matrix (a shallow copy per the helper's
    //name - presumably no element data is duplicated; confirm in matr code)
    matr_shallow_row_copy(X, X->rows, old_theta);
  }
  colMap = matr_remove_zero_cols(X);
  if (old_theta) {
    //detach old_theta again; it now carries the densified column numbering
    matr_erase_row(X, X->rows-1);
  }
  return colMap;
}

/***************************************************************
 *Solve the SVM problem
 *
 *INPUT: Xy: the matrix of examples. Examples from class 0 should
 * be multiplied by the label +1 and examples from class 1 should be
 * multiplied by the label -1.
 * sol_ptr: A pointer to the old SVM solution or a pointer to
 * NULL if there is no old solution.
 *
 *OUTPUT: sol_ptr will contain a pointer to an SVM_Solution struct
 * which can be used to resolve the SVM with more examples or
 * to classify examples. For an overview of the struct, see the OUTPUT
 * comment to solve_svm_no_init_sol.
 *
 * Xy = *Xy_ptr will contain the examples that are NOT support vectors. If
 * there were support vectors in *sol_ptr that are no longer support
 * vectors these will have been added to Xy. Similarly, all examples
 * in Xy that became support vectors will have moved to the solution struct.
 * If all examples are support vectors Xy = NULL.
 *
 *WARNINGS:
 *1) sol_ptr is a DOUBLE POINTER because the svm solver returns
 * a pointer.
even if you have no previous svm solution sol_ptr * should not be NULL - *sol_ptr should be NULL. *2) Note that each row of Xy should be *premultiplied* by the * class label which MUST be +/-1 (classic SVM problem). This algorithm * does not explicitly add a constant value to the decision (ie it solves * for theta such that h(x) = sgn(theta*x)). If you want a constant * value, you need to add a column of all +/-1 to each example. *3) Xy does NOT have to be preprocessed (ie have the all-zero * rows and columns removed). This function will do that for * you. If you do the preprocess ahead of time, this function * will just redo that work. *4) On return Xy = *Xy_ptr will contain only those examples (perhaps * including old support vectors from *sol_ptr) that are NOT * support vectors. The solution struct will contain the support vectors. * Note that Xy WILL CHANGE and MAY BE NULL. Note that sol WILL CHANGE. * Examples will migrate between Xy_ptr and sol_ptr... don't expect them * to _not_ move. * ****************************************************************************/ void svm_solve(Matrix **Xy_ptr, SVM_Solution **sol_ptr) { SVM_Solution *sol; ExpandingArray *colMap = NULL; int i, n_old_examples, max_train; VectorIterator vit; Vector *theta, *row; Matrix *Xy; double weight; MATR_DEBUG_MODE = SVM_DEBUG_MODE; if (!sol_ptr) { if (SVM_DEBUG_MODE) { fprintf(stderr, "svm_solve: unitialized sol_ptr. If you have no previous svm solution, make *sol_ptr = NULL\n"); } return; } if (!Xy_ptr) { if (SVM_DEBUG_MODE) { fprintf(stderr, "svm_solve: unitialized Xy_ptr. 
If you have no new examples, make *Xy_ptr = NULL\n"); } return; } sol = *sol_ptr; Xy = *Xy_ptr; if (sol) { //for what we do with the old solution, see the comment to //svm_solve_init_sol if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG) { fprintf(stderr, "Incorporating old solution\n"); } theta = sol->theta; //n_old_examples is the number of non-support-vector old examples n_old_examples = sol->num_examples - sol->SV->rows; //add the old support vectors into our current matrix matr_append_matr(&Xy, sol->SV); max_train = sol->max_train_val; if (n_old_examples < 0 || !(Xy) || !(Xy->data)) { if (SVM_DEBUG_MODE) { fprintf(stderr, "svm_solve: something is weird with the initial solution. Why don't you try again with no initial solution?\n"); } svm_free_solution(sol); *sol_ptr = NULL; return; } } else { if (!Xy) { if (SVM_DEBUG_MODE) { fprintf(stderr, "One of *Xy_ptr and *sol_ptr must be non-null!\n"); } return; } //no initial solution theta = NULL; n_old_examples = 0; max_train = SVM_MAX_X_VAL; } //debugging info if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG) { fprintf(stderr, "Xy is %d by %u with %d non-zero elements\n", Xy->rows, Xy->cols, Xy->nz); } //get rid of zero rows and columns of Xy colMap = svm_preprocess(Xy, theta); if (!(Xy->rows) && !(n_old_examples)) { if (SVM_DEBUG_MODE) { fprintf(stderr, "SVM solve: nothing to learn on.\n"); } if (*sol_ptr) { svm_free_solution(*sol_ptr); } *sol_ptr = NULL; return; } //this is 1/(n_old + n_new) - ie 1/(total # of examples we've seen) weight = 1.0/(Xy->rows + n_old_examples); if (theta) { if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG) { fprintf(stderr, "sol->num_examples = %d, n_old_examples = %d, Xy->rows = %d\n", sol->num_examples, n_old_examples, Xy->rows); fprintf(stderr, "multiplying theta by %f\n", sol->num_examples/(n_old_examples + (double)Xy->rows)); } //reweight theta to include the new examples in n vector_multiply(theta, sol->num_examples/(n_old_examples+(double)Xy->rows), theta); } //more debugging information... 
if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG) { fprintf(stderr, "After preprocess Xy is %d by %u\n", Xy->rows, Xy->cols); } if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG_LOOP) { fprintf(stderr, "Xy = \n"); matr_print(Xy); } //run the solver! sol = svm_solve_init_sol(Xy, theta, weight, max_train); if (*sol_ptr) { //we don't need the old solution any more svm_free_solution(*sol_ptr); } if (!sol) { //uh oh, the solver choked on something //probably the data was corrupted if (Xy) { matr_free(Xy); } if (colMap) { expanding_array_free(colMap); } *Xy_ptr = NULL; *sol_ptr = NULL; if (SVM_DEBUG_MODE) { fprintf(stderr, "SVM Solver Error.\n"); } return; } //sol->num_examples = Xy->rows. so tell it that it also had n_old_examples //it didn't see but were used to generate the older solution sol->num_examples += n_old_examples; theta = sol->theta; //ok, yes, we do a lot of debugging if (SVM_DEBUG_MODE >= SVM_SOLVER_DEBUG) { fprintf(stderr, "Number support vectors: %d\n", sol->SV->rows); } //undo the densification if we did it //note that sol->SV and Xy are STILL SPARSE //they just have the densified column numbers //so we need to change that if (colMap) { //make theta sparse with the correct column numbers vector_convert_nonsparse_to_sparray(sol->theta, colMap); //give sol->SV the correct column numbers if (sol->SV->rows) { matr_add_ncols(sol->SV, expanding_array_get(sol->SV->cols-1, colMap).compact->i+1 - sol->SV->cols); for (i = 0; i < sol->SV->rows; i++) { row = matr_get_row(sol->SV, i); if (!row) { continue; } vectorit_set_at_end(&vit, row); while (!vectorit_past_beg(vit, row)) { vectorit_set_col(vit, expanding_array_get(vectorit_curr_col(vit, row), colMap).compact->i, row); vectorit_prev(&vit, row); } } } else { if (SVM_DEBUG_MODE) { fprintf(stderr, "svm_solve: No support vectors recorded. 
Run again with SV_TOLERANCE set higher if they are necessary.\n"); } } //give Xy the correct column numbers if (Xy && Xy->rows) { matr_add_ncols(Xy, expanding_array_get(Xy->cols-1, colMap).compact->i+1 - Xy->cols); for (i = 0; i < Xy->rows; i++) { row = matr_get_row(Xy, i); if (!row) { continue; } vectorit_set_at_end(&vit, row); while (!vectorit_past_beg(vit, row)) { vectorit_set_col(vit, expanding_array_get(vectorit_curr_col(vit, row), colMap). compact->i, row); vectorit_prev(&vit, row); } } } else { matr_free(Xy); Xy = NULL; } expanding_array_free(colMap); } if (Xy && !Xy->rows) { matr_free(Xy); Xy = NULL; } *sol_ptr = sol; *Xy_ptr = Xy; } /***********************SVM_Solution Functions******************************/ /*************************************************************************** *Classify an example. * *INPUT: ex: example to classify * sol: SVM solution struct * *OUTPUT: +1/-1 label of the example ***************************************************************************/ int svm_classify_example(Vector *ex, SVM_Solution *sol) { double d; if (!ex || !sol || !sol->theta) { if (SVM_DEBUG_MODE) { fprintf(stderr, "svm_classify_example: null argument.\n"); } return 0; } d = dot(ex, sol->theta); if (d < 0) { return -1; } return 1; } /***************************************************************************** *Write a solution struct to a file in binary format. * *INPUT: sol: Solution to write * filename: file to write to * *OUTPUT: the amount written in bytes ****************************************************************************/ size_t svm_write_solution(SVM_Solution *sol, char *filename) { FILE *fp = fopen(filename, "wb"); size_t size; if (!fp) { if (SVM_DEBUG_MODE) { fprintf(stderr, "svm_write_solution: bad filename %s\n", filename); } return 0; } size = svm_write_solution_fp(sol, fp); fclose(fp); return size; } /***************************************************************************** *Write a solution struct to a file in binary format. 
* *INPUT: sol: Solution to write * fp: file to write to * *OUTPUT: the amount written in bytes ****************************************************************************/ size_t svm_write_solution_fp(SVM_Solution *sol, FILE *fp) { //write theta size_t size; if (!sol || !fp) { if (SVM_DEBUG_MODE) { fprintf(stderr, "svm_wrte_solution: bad file pointer.\n"); } return 0; } size = vector_write_bin_fp(sol->theta, fp); //write support vectors size += matr_write_bin_fp(sol->SV, fp); size += sizeof(int)*fwrite(&(sol->num_examples), sizeof(int), 1, fp); size += sizeof(int)*fwrite(&(sol->max_train_val), sizeof(int), 1, fp); return size; } /***************************************************************************** *Read a solution struct from a file in binary format. * *INPUT: filename: file to read from * *OUTPUT: the solution struct stored in the file or NULL if it couldn't be * read * *WARNINGS: *1) This file expects a file formatted as svm_write_solution creates. If * file is not formatted that way, results may vary. It should not seg * fault, but that's about all I can promise. ****************************************************************************/ SVM_Solution *svm_read_solution(char *filename) { SVM_Solution *sol; FILE *fp = fopen(filename, "rb"); if (!fp) { if (SVM_DEBUG_MODE) { fprintf(stderr, "svm_read_solution: bad filename %s\n", filename); } return NULL; } sol = svm_read_solution_fp(fp); fclose(fp); return sol; } /***************************************************************************** *Read a solution struct from a file in binary format. * *INPUT: fp: file to read from * *OUTPUT: the solution struct stored in the file or NULL if it couldn't be * read * *WARNINGS: *1) This file expects a file formatted as svm_write_solution creates. If * file is not formatted that way, results may vary. It should not seg * fault, but that's about all I can promise. 
****************************************************************************/
SVM_Solution *svm_read_solution_fp(FILE *fp) {
  SVM_Solution *sol;
  if (!fp) {
    if (SVM_DEBUG_MODE) {
      fprintf(stderr, "svm_read_solution: bad file pointer.\n");
    }
    return NULL;
  }
  //FIX: allocate only after the argument check and verify the allocation -
  //the old code dereferenced an unchecked malloc result
  sol = (SVM_Solution *)malloc(sizeof(SVM_Solution));
  if (!sol) {
    if (SVM_DEBUG_MODE) {
      fprintf(stderr, "svm_read_solution: out of memory.\n");
    }
    return NULL;
  }
  sol->theta = vector_read_bin_fp(fp);
  if (!sol->theta) {
    if (SVM_DEBUG_MODE) {
      fprintf(stderr, "read_solution: Bad file.\n");
    }
    free(sol);
    return NULL;
  }
  sol->SV = matr_read_bin_fp(fp);
  //FIX: the old code discarded the fread return values, so a truncated
  //file left num_examples/max_train_val uninitialized
  if (fread(&(sol->num_examples), sizeof(int), 1, fp) != 1 ||
      fread(&(sol->max_train_val), sizeof(int), 1, fp) != 1) {
    if (SVM_DEBUG_MODE) {
      fprintf(stderr, "read_solution: Bad file.\n");
    }
    vector_free(sol->theta);
    matr_free(sol->SV); //matr_free tolerates NULL (see solver init path)
    free(sol);
    return NULL;
  }
  return sol;
}

/***************************************************************************
 *Maps a solution from a block of memory in binary format (the same
 *format as would be written to a file using write.
 *
 *INPUT: addr: pointer to the address where the solution begins
 * last_addr: the last possible address that is valid. NOT necessarily where
 * the solution ends - just the last address that has been allocated in the
 * chunk pointed to by *addr (ie, if *addr was taken from an mmap'd file
 * last_addr would be *addr + the file size).
 *
 *OUTPUT: A solution STILL referencing the chunk of memory at *addr,
 * but formated as an SVM_Solution or NULL if a properly formatted
 * solution didn't start at *addr.
 * *addr: (pass-by-reference) points to the first memory location AFTER the
 * full solution
 * *n_elts_ptr: (pass-by-reference) the number of elements actually read
 *
 *WARNINGS:
 * 1) *addr needs to be writable. This will CHANGE VALUES stored at *addr and
 *    will seg fault if addr is not writable.
 * 2) last_addr does not need to be the last address of the solution
 *    but if it is before that, either NULL will be returned or an
 *    matrix with a NULL data value will be returned.
 * 3) if *addr does not contain a properly formatted solution, this function
 *    will not seg fault, but that is the only guarantee.
 * 4) you MUST call solution_free!
* 5) *addr and CHANGES!
 * 6) the address returned by this IS NOT EQUAL to *addr as passed in.
 ***************************************************************************/
SVM_Solution *svm_map_solution(void **addr, void *last_addr) {
  //NOTE(review): the malloc result is written to unchecked below - confirm
  //whether crashing on out-of-memory is acceptable here
  SVM_Solution *sol = (SVM_Solution *)malloc(sizeof(SVM_Solution));
  //map theta straight out of the memory block; vector_map advances *addr
  sol->theta = vector_map(addr, last_addr);
  if (!sol->theta) {
    if (SVM_DEBUG_MODE) {
      fprintf(stderr, "map_solution: Bad file.\n");
    }
    free(sol);
    return NULL;
  }
  //map the support-vector matrix next
  sol->SV = matr_map(addr, last_addr);
  //make sure the two trailing ints actually fit inside the mapped block
  //NOTE(review): arithmetic on void* is a GCC extension - confirm this
  //never needs to build with a strict ISO C compiler
  if (*addr > last_addr || *addr + 2*sizeof(int) > last_addr) {
    if (SVM_DEBUG_MODE) {
      fprintf(stderr, "map_solution: Bad file.\n");
    }
    svm_free_solution(sol);
    return NULL;
  }
  sol->num_examples = *((int *)(*addr));
  *addr += sizeof(int);
  sol->max_train_val = *((int *)(*addr));
  *addr += sizeof(int);
  return sol;
}

/*****************************************************************************
 *Free a solution struct.
 *
 *INPUT: sol: struct to free
 ****************************************************************************/
void svm_free_solution(SVM_Solution *sol) {
  //NULL-tolerant: frees whichever members are present, then the struct
  if (sol) {
    if (sol->theta) {
      vector_free(sol->theta);
    }
    if (sol->SV) {
      matr_free(sol->SV);
    }
    free(sol);
  }
}
crm114-20100106-BlameMichelson.src/Macbeth_Act_IV.txt0000644000000000017500000005421511321154266020275 0ustar rootwsy ACT IV. SCENE I. A dark Cave. In the middle, a Caldron Boiling. [Thunder. Enter the three Witches.] FIRST WITCH. Thrice the brinded cat hath mew'd. SECOND WITCH. Thrice; and once the hedge-pig whin'd. THIRD WITCH. Harpier cries:--"tis time, 'tis time. FIRST WITCH. Round about the caldron go; In the poison'd entrails throw.-- Toad, that under cold stone, Days and nights has thirty-one Swelter'd venom sleeping got, Boil thou first i' the charmed pot! ALL. Double, double, toil and trouble; Fire, burn; and caldron, bubble. SECOND WITCH.
Fillet of a fenny snake, In the caldron boil and bake; Eye of newt, and toe of frog, Wool of bat, and tongue of dog, Adder's fork, and blind-worm's sting, Lizard's leg, and howlet's wing,-- For a charm of powerful trouble, Like a hell-broth boil and bubble. ALL. Double, double, toil and trouble; Fire, burn; and caldron, bubble. THIRD WITCH. Scale of dragon, tooth of wolf, Witch's mummy, maw and gulf Of the ravin'd salt-sea shark, Root of hemlock digg'd i' the dark, Liver of blaspheming Jew, Gall of goat, and slips of yew Sliver'd in the moon's eclipse, Nose of Turk, and Tartar's lips, Finger of birth-strangl'd babe Ditch-deliver'd by a drab,-- Make the gruel thick and slab: Add thereto a tiger's chaudron, For the ingredients of our caldron. ALL. Double, double, toil and trouble; Fire, burn; and caldron, bubble. SECOND WITCH. Cool it with a baboon's blood, Then the charm is firm and good. [Enter Hecate.] HECATE. O, well done! I commend your pains; And everyone shall share i' the gains. And now about the cauldron sing, Like elves and fairies in a ring, Enchanting all that you put in. Song. Black spirits and white, red spirits and gray; Mingle, mingle, mingle, you that mingle may. [Exit Hecate.] SECOND WITCH. By the pricking of my thumbs, Something wicked this way comes:-- Open, locks, whoever knocks! [Enter Macbeth.] MACBETH. How now, you secret, black, and midnight hags! What is't you do? ALL. A deed without a name. MACBETH. I conjure you, by that which you profess,-- Howe'er you come to know it,--answer me: Though you untie the winds, and let them fight Against the churches; though the yesty waves Confound and swallow navigation up; Though bladed corn be lodg'd, and trees blown down; Though castles topple on their warders' heads; Though palaces and pyramids do slope Their heads to their foundations; though the treasure Of nature's germins tumble all together, Even till destruction sicken,--answer me To what I ask you. FIRST WITCH. Speak. SECOND WITCH. Demand. 
THIRD WITCH. We'll answer. FIRST WITCH. Say, if thou'dst rather hear it from our mouths, Or from our masters? MACBETH. Call 'em, let me see 'em. FIRST WITCH. Pour in sow's blood, that hath eaten Her nine farrow; grease that's sweaten From the murderer's gibbet throw Into the flame. ALL. Come, high or low; Thyself and office deftly show! [Thunder. An Apparition of an armed Head rises.] MACBETH. Tell me, thou unknown power,-- FIRST WITCH. He knows thy thought: Hear his speech, but say thou naught. APPARITION. Macbeth! Macbeth! Macbeth! Beware Macduff; Beware the Thane of Fife.--Dismiss me:--enough. [Descends.] MACBETH. Whate'er thou art, for thy good caution, thanks; Thou hast harp'd my fear aright:--but one word more,-- FIRST WITCH. He will not be commanded: here's another, More potent than the first. [Thunder. An Apparition of a bloody Child rises.] APPARITION.-- Macbeth! Macbeth! Macbeth! MACBETH. Had I three ears, I'd hear thee. APPARITION. Be bloody, bold, and resolute; laugh to scorn The power of man, for none of woman born Shall harm Macbeth. [Descends.] MACBETH. Then live, Macduff: what need I fear of thee? But yet I'll make assurance double sure, And take a bond of fate: thou shalt not live; That I may tell pale-hearted fear it lies, And sleep in spite of thunder.--What is this, [Thunder. An Apparition of a Child crowned, with a tree in his hand, rises.] That rises like the issue of a king, And wears upon his baby brow the round And top of sovereignty? ALL. Listen, but speak not to't. APPARITION. Be lion-mettled, proud; and take no care Who chafes, who frets, or where conspirers are: Macbeth shall never vanquish'd be, until Great Birnam wood to high Dunsinane hill Shall come against him. [Descends.] MACBETH. That will never be: Who can impress the forest; bid the tree Unfix his earth-bound root? Sweet bodements, good! 
Rebellion's head, rise never till the wood Of Birnam rise, and our high-plac'd Macbeth Shall live the lease of nature, pay his breath To time and mortal custom.--Yet my heart Throbs to know one thing: tell me,--if your art Can tell so much,--shall Banquo's issue ever Reign in this kingdom? ALL. Seek to know no more. MACBETH. I will be satisfied: deny me this, And an eternal curse fall on you! Let me know:-- Why sinks that cauldron? and what noise is this? [Hautboys.] FIRST WITCH. Show! SECOND WITCH. Show! THIRD WITCH. Show! ALL. Show his eyes, and grieve his heart; Come like shadows, so depart! [Eight kings appear, and pass over in order, the last with a glass in his hand; Banquo following.] MACBETH. Thou art too like the spirit of Banquo; down! Thy crown does sear mine eyeballs:--and thy hair, Thou other gold-bound brow, is like the first;-- A third is like the former.--Filthy hags! Why do you show me this?--A fourth!--Start, eyes! What, will the line stretch out to the crack of doom? Another yet!--A seventh!--I'll see no more:-- And yet the eighth appears, who bears a glass Which shows me many more; and some I see That twofold balls and treble sceptres carry: Horrible sight!--Now I see 'tis true; For the blood-bolter'd Banquo smiles upon me, And points at them for his.--What! is this so? FIRST WITCH. Ay, sir, all this is so:--but why Stands Macbeth thus amazedly?-- Come, sisters, cheer we up his sprites, And show the best of our delights; I'll charm the air to give a sound, While you perform your antic round; That this great king may kindly say, Our duties did his welcome pay. [Music. The Witches dance, and then vanish.] MACBETH. Where are they? Gone?--Let this pernicious hour Stand aye accursed in the calendar!-- Come in, without there! [Enter Lennox.] LENNOX. What's your grace's will? MACBETH. Saw you the weird sisters? LENNOX. No, my lord. MACBETH. Came they not by you? LENNOX. No indeed, my lord. MACBETH.
Infected be the air whereon they ride; And damn'd all those that trust them!--I did hear The galloping of horse: who was't came by? LENNOX. 'Tis two or three, my lord, that bring you word Macduff is fled to England. MACBETH. Fled to England! LENNOX. Ay, my good lord. MACBETH. Time, thou anticipat'st my dread exploits: The flighty purpose never is o'ertook Unless the deed go with it: from this moment The very firstlings of my heart shall be The firstlings of my hand. And even now, To crown my thoughts with acts, be it thought and done: The castle of Macduff I will surprise; Seize upon Fife; give to the edge o' the sword His wife, his babes, and all unfortunate souls That trace him in his line. No boasting like a fool; This deed I'll do before this purpose cool: But no more sights!--Where are these gentlemen? Come, bring me where they are. [Exeunt.] SCENE II. Fife. A Room in Macduff's Castle. [Enter Lady Macduff, her Son, and Ross.] LADY MACDUFF. What had he done, to make him fly the land? ROSS. You must have patience, madam. LADY MACDUFF. He had none: His flight was madness: when our actions do not, Our fears do make us traitors. ROSS. You know not Whether it was his wisdom or his fear. LADY MACDUFF. Wisdom! to leave his wife, to leave his babes, His mansion, and his titles, in a place From whence himself does fly? He loves us not: He wants the natural touch; for the poor wren, The most diminutive of birds, will fight, Her young ones in her nest, against the owl. All is the fear, and nothing is the love; As little is the wisdom, where the flight So runs against all reason. ROSS. My dearest coz, I pray you, school yourself: but, for your husband, He is noble, wise, Judicious, and best knows The fits o' the season. 
I dare not speak much further: But cruel are the times, when we are traitors, And do not know ourselves; when we hold rumour From what we fear, yet know not what we fear, But float upon a wild and violent sea Each way and move.--I take my leave of you: Shall not be long but I'll be here again: Things at the worst will cease, or else climb upward To what they were before.--My pretty cousin, Blessing upon you! LADY MACDUFF. Father'd he is, and yet he's fatherless. ROSS. I am so much a fool, should I stay longer, It would be my disgrace and your discomfort: I take my leave at once. [Exit.] LADY MACDUFF. Sirrah, your father's dead; And what will you do now? How will you live? SON. As birds do, mother. LADY MACDUFF. What, with worms and flies? SON. With what I get, I mean; and so do they. LADY MACDUFF. Poor bird! thou'dst never fear the net nor lime, The pit-fall nor the gin. SON. Why should I, mother? Poor birds they are not set for. My father is not dead, for all your saying. LADY MACDUFF. Yes, he is dead: how wilt thou do for father? SON. Nay, how will you do for a husband? LADY MACDUFF. Why, I can buy me twenty at any market. SON. Then you'll buy 'em to sell again. LADY MACDUFF. Thou speak'st with all thy wit; and yet, i' faith, With wit enough for thee. SON. Was my father a traitor, mother? LADY MACDUFF. Ay, that he was. SON. What is a traitor? LADY MACDUFF. Why, one that swears and lies. SON. And be all traitors that do so? LADY MACDUFF. Everyone that does so is a traitor, and must be hanged. SON. And must they all be hanged that swear and lie? LADY MACDUFF. Every one. SON. Who must hang them? LADY MACDUFF. Why, the honest men. SON. Then the liars and swearers are fools: for there are liars and swearers enow to beat the honest men and hang up them. LADY MACDUFF. Now, God help thee, poor monkey! But how wilt thou do for a father? SON. If he were dead, you'ld weep for him: if you would not, it were a good sign that I should quickly have a new father. LADY MACDUFF. 
Poor prattler, how thou talk'st! [Enter a Messenger.] MESSENGER. Bless you, fair dame! I am not to you known, Though in your state of honor I am perfect. I doubt some danger does approach you nearly: If you will take a homely man's advice, Be not found here; hence, with your little ones. To fright you thus, methinks, I am too savage; To do worse to you were fell cruelty, Which is too nigh your person. Heaven preserve you! I dare abide no longer. [Exit.] LADY MACDUFF. Whither should I fly? I have done no harm. But I remember now I am in this earthly world; where to do harm Is often laudable; to do good sometime Accounted dangerous folly: why then, alas, Do I put up that womanly defence, To say I have done no harm?--What are these faces? [Enter Murderers.] FIRST MURDERER. Where is your husband? LADY MACDUFF. I hope, in no place so unsanctified Where such as thou mayst find him. FIRST MURDERER. He's a traitor. SON. Thou liest, thou shag-haar'd villain! FIRST MURDERER. What, you egg! [Stabbing him.] Young fry of treachery! SON. He has kill'd me, mother: Run away, I pray you! [Dies. Exit Lady Macduff, crying Murder, and pursued by the Murderers.] SCENE III. England. Before the King's Palace. [Enter Malcolm and Macduff.] MALCOLM. Let us seek out some desolate shade and there Weep our sad bosoms empty. MACDUFF. Let us rather Hold fast the mortal sword, and, like good men, Bestride our down-fall'n birthdom: each new morn New widows howl; new orphans cry; new sorrows Strike heaven on the face, that it resounds As if it felt with Scotland, and yell'd out Like syllable of dolour. MALCOLM. What I believe, I'll wail; What know, believe; and what I can redress, As I shall find the time to friend, I will. What you have spoke, it may be so perchance. This tyrant, whose sole name blisters our tongues, Was once thought honest: you have loved him well; He hath not touch'd you yet. 
I am young; but something You may deserve of him through me; and wisdom To offer up a weak, poor, innocent lamb To appease an angry god. MACDUFF. I am not treacherous. MALCOLM. But Macbeth is. A good and virtuous nature may recoil In an imperial charge. But I shall crave your pardon; That which you are, my thoughts cannot transpose; Angels are bright still, though the brightest fell: Though all things foul would wear the brows of grace, Yet grace must still look so. MACDUFF. I have lost my hopes. MALCOLM. Perchance even there where I did find my doubts. Why in that rawness left you wife and child,-- Those precious motives, those strong knots of love,-- Without leave-taking?--I pray you, Let not my jealousies be your dishonors, But mine own safeties:--you may be rightly just, Whatever I shall think. MACDUFF. Bleed, bleed, poor country! Great tyranny, lay thou thy basis sure, For goodness dare not check thee! wear thou thy wrongs, The title is affeer'd.--Fare thee well, lord: I would not be the villain that thou think'st For the whole space that's in the tyrant's grasp And the rich East to boot. MALCOLM. Be not offended: I speak not as in absolute fear of you. I think our country sinks beneath the yoke; It weeps, it bleeds; and each new day a gash Is added to her wounds. I think, withal, There would be hands uplifted in my right; And here, from gracious England, have I offer Of goodly thousands: but, for all this, When I shall tread upon the tyrant's head, Or wear it on my sword, yet my poor country Shall have more vices than it had before; More suffer, and more sundry ways than ever, By him that shall succeed. MACDUFF. What should he be? MALCOLM. It is myself I mean: in whom I know All the particulars of vice so grafted That, when they shall be open'd, black Macbeth Will seem as pure as snow; and the poor state Esteem him as a lamb, being compar'd With my confineless harms. MACDUFF. 
Not in the legions Of horrid hell can come a devil more damn'd In evils to top Macbeth. MALCOLM. I grant him bloody, Luxurious, avaricious, false, deceitful, Sudden, malicious, smacking of every sin That has a name: but there's no bottom, none, In my voluptuousness: your wives, your daughters, Your matrons, and your maids, could not fill up The cistern of my lust; and my desire All continent impediments would o'erbear, That did oppose my will: better Macbeth Than such an one to reign. MACDUFF. Boundless intemperance In nature is a tyranny; it hath been The untimely emptying of the happy throne, And fall of many kings. But fear not yet To take upon you what is yours: you may Convey your pleasures in a spacious plenty, And yet seem cold, the time you may so hoodwink. We have willing dames enough; there cannot be That vulture in you, to devour so many As will to greatness dedicate themselves, Finding it so inclin'd. MALCOLM. With this there grows, In my most ill-compos'd affection, such A stanchless avarice, that, were I king, I should cut off the nobles for their lands; Desire his jewels, and this other's house: And my more-having would be as a sauce To make me hunger more; that I should forge Quarrels unjust against the good and loyal, Destroying them for wealth. MACDUFF. This avarice Sticks deeper; grows with more pernicious root Than summer-seeming lust; and it hath been The sword of our slain kings: yet do not fear; Scotland hath foysons to fill up your will, Of your mere own: all these are portable, With other graces weigh'd. MALCOLM. But I have none: the king-becoming graces, As justice, verity, temperance, stableness, Bounty, perseverance, mercy, lowliness, Devotion, patience, courage, fortitude, I have no relish of them; but abound In the division of each several crime, Acting it many ways. Nay, had I power, I should Pour the sweet milk of concord into hell, Uproar the universal peace, confound All unity on earth. MACDUFF. O Scotland, Scotland! MALCOLM. 
If such a one be fit to govern, speak: I am as I have spoken. MACDUFF. Fit to govern! No, not to live!--O nation miserable, With an untitled tyrant bloody-scepter'd, When shalt thou see thy wholesome days again, Since that the truest issue of thy throne By his own interdiction stands accurs'd And does blaspheme his breed?--Thy royal father Was a most sainted king; the queen that bore thee, Oftener upon her knees than on her feet, Died every day she lived. Fare-thee-well! These evils thou repeat'st upon thyself Have banish'd me from Scotland.--O my breast, Thy hope ends here! MALCOLM. Macduff, this noble passion, Child of integrity, hath from my soul Wiped the black scruples, reconcil'd my thoughts To thy good truth and honour. Devilish Macbeth By many of these trains hath sought to win me Into his power; and modest wisdom plucks me From over-credulous haste: but God above Deal between thee and me! for even now I put myself to thy direction, and Unspeak mine own detraction; here abjure The taints and blames I laid upon myself, For strangers to my nature. I am yet Unknown to woman; never was forsworn; Scarcely have coveted what was mine own; At no time broke my faith; would not betray The devil to his fellow; and delight No less in truth than life: my first false speaking Was this upon myself:--what I am truly, Is thine and my poor country's to command: Whither, indeed, before thy here-approach, Old Siward, with ten thousand warlike men Already at a point, was setting forth: Now we'll together; and the chance of goodness Be like our warranted quarrel! Why are you silent? MACDUFF. Such welcome and unwelcome things at once 'Tis hard to reconcile. [Enter a Doctor.] MALCOLM. Well; more anon.--Comes the king forth, I pray you? DOCTOR. Ay, sir: there are a crew of wretched souls That stay his cure: their malady convinces The great assay of art; but, at his touch, Such sanctity hath heaven given his hand, They presently amend. MALCOLM. I thank you, doctor. [Exit Doctor.] 
MACDUFF. What's the disease he means? MALCOLM. 'Tis call'd the evil: A most miraculous work in this good king; Which often, since my here-remain in England, I have seen him do. How he solicits heaven, Himself best knows: but strangely-visited people, All swoln and ulcerous, pitiful to the eye, The mere despair of surgery, he cures; Hanging a golden stamp about their necks, Put on with holy prayers: and 'tis spoken, To the succeeding royalty he leaves The healing benediction. With this strange virtue, He hath a heavenly gift of prophecy; And sundry blessings hang about his throne, That speak him full of grace. MACDUFF. See, who comes here? MALCOLM. My countryman; but yet I know him not. [Enter Ross.] MACDUFF. My ever-gentle cousin, welcome hither. MALCOLM. I know him now. Good God, betimes remove The means that makes us strangers! ROSS. Sir, amen. MACDUFF. Stands Scotland where it did? ROSS. Alas, poor country,-- Almost afraid to know itself! It cannot Be call'd our mother, but our grave: where nothing, But who knows nothing, is once seen to smile; Where sighs, and groans, and shrieks, that rent the air, Are made, not mark'd; where violent sorrow seems A modern ecstasy; the dead man's knell Is there scarce ask'd for who; and good men's lives Expire before the flowers in their caps, Dying or ere they sicken. MACDUFF. O, relation Too nice, and yet too true! MALCOLM. What's the newest grief? ROSS. That of an hour's age doth hiss the speaker; Each minute teems a new one. MACDUFF. How does my wife? ROSS. Why, well. MACDUFF. And all my children? ROSS. Well too. MACDUFF. The tyrant has not batter'd at their peace? ROSS. No; they were well at peace when I did leave 'em. MACDUFF. Be not a niggard of your speech: how goes't? ROSS. 
When I came hither to transport the tidings, Which I have heavily borne, there ran a rumour Of many worthy fellows that were out; Which was to my belief witness'd the rather, For that I saw the tyrant's power a-foot: Now is the time of help; your eye in Scotland Would create soldiers, make our women fight, To doff their dire distresses. MALCOLM. Be't their comfort We are coming thither: gracious England hath Lent us good Siward and ten thousand men; An older and a better soldier none That Christendom gives out. ROSS. Would I could answer This comfort with the like! But I have words That would be howl'd out in the desert air, Where hearing should not latch them. MACDUFF. What concern they? The general cause? or is it a fee-grief Due to some single breast? ROSS. No mind that's honest But in it shares some woe; though the main part Pertains to you alone. MACDUFF. If it be mine, Keep it not from me, quickly let me have it. ROSS. Let not your ears despise my tongue for ever, Which shall possess them with the heaviest sound That ever yet they heard. MACDUFF. Humh! I guess at it. ROSS. Your castle is surpris'd; your wife and babes Savagely slaughter'd: to relate the manner Were, on the quarry of these murder'd deer, To add the death of you. MALCOLM. Merciful heaven!-- What, man! ne'er pull your hat upon your brows; Give sorrow words: the grief that does not speak Whispers the o'er-fraught heart, and bids it break. MACDUFF. My children too? ROSS. Wife, children, servants, all That could be found. MACDUFF. And I must be from thence! My wife kill'd too? ROSS. I have said. MALCOLM. Be comforted: Let's make us medicines of our great revenge, To cure this deadly grief. MACDUFF. He has no children.--All my pretty ones? Did you say all?--O hell-kite!--All? What, all my pretty chickens and their dam At one fell swoop? MALCOLM. Dispute it like a man. MACDUFF. 
I shall do so; But I must also feel it as a man: I cannot but remember such things were, That were most precious to me.--Did heaven look on, And would not take their part? Sinful Macduff, They were all struck for thee! naught that I am, Not for their own demerits, but for mine, Fell slaughter on their souls: heaven rest them now! MALCOLM. Be this the whetstone of your sword. Let grief Convert to anger; blunt not the heart, enrage it. MACDUFF. O, I could play the woman with mine eye, And braggart with my tongue!--But, gentle heavens, Cut short all intermission; front to front Bring thou this fiend of Scotland and myself; Within my sword's length set him; if he 'scape, Heaven forgive him too! MALCOLM. This tune goes manly. Come, go we to the king; our power is ready; Our lack is nothing but our leave: Macbeth Is ripe for shaking, and the powers above Put on their instruments. Receive what cheer you may; The night is long that never finds the day. [Exeunt.] crm114-20100106-BlameMichelson.src/INTRO.txt0000644000000000017500000014707111321154266016443 0ustar rootwsy# # INTRO.txt - INTRO to the CRM114 DISCRIMINATOR # # Copyright 2000-2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # INTRO to the CRM114 DISCRIMINATOR Copyright (c) W.S.Yerazunis, 2000-2009 Last update - 2 March 2009 --------------------------------------------------------------------------- DANGER, WILL ROBINSON!! TAKE COVER, DR. SMITH!!!!!!!!!! CRM114 IS STILL UNDER DEVELOPMENT AND EXPANSION. YOU MAY FIND THAT THE LANGUAGE CHANGES OUT FROM UNDER YOU . BUGS, MISFEATURES, OR EVEN EXPLOITS MAY LURK WITHIN THIS CODE. IT IS SUPPLIED "AS-IS", WITH NO WARRANTY! SEE THE GPL LICENSE FOR DETAILS. ---------------------------------------- This document is the programmer's introduction to CRM114 Discriminator. If you are reading this to get information on how to install CRM114 as a mailfilter, you have the _wrong_ document. But fear not, we _do_ have the document you want. 
The document you want if you want to know how to install CRM114 as a mailfilter is: CRM114_Mailfilter_HOWTO.txt which will tell you everything you need to know about how to install, activate, and train the CRM114 mailfilter. ------------------------------------------------------------------------- Before We Begin In Earnest, A Few Choice Quotes: "It's not ugly like PERL. It's a whole different _kind_ of ugly." -John Bowker, on hearing the design details. ------------------ "The CRM-114 Discriminator is designed not to receive at _all_. That is, not unless the message is preceded by the proper 3-letter code group." - George C. Scott, as General Buck Turgidson, _Dr. Strangelove_ ------------------ C views the entire world as if your only tool is a hammer. CRM114 views the world as if your only good tools are a set of scissors and a roll of sticky splicing tape. ------------------ "What is this? Some kind of grep bitten by a radioactive spider?" -me CRM114 is a language designed to write filters in. It caters to filtering email, system log streams, html, and other marginally human-readable ASCII that may occasion to grace your computer. CRM114's unique strengths are the data structure (everything is a string and a string can overlap another string), its ability to work on truly infinitely long input streams, its ability to use extremely advanced classifiers to sort text, and the ability to do approximate regular expressions (that is, regexes that don't quite match) via the TRE regex library. CRM114 also sports a very powerful subprocess control facility, and a unique syntax and program structure that puts the fun back in programming (OK, you can run away screaming now). The syntax is declensional rather than positional; the type of quote marks around an argument determine what that argument will be used for. 
The typical CRM114 program uses regex operations more often than addition (in fact, math was only added to TRE in the waning days of 2003, well after CRM114 had been in daily use for over a year and a half). In other words, crm114 is a very VERY powerful mutagenic filter that happens to be a programming language as well. The filtering style of the CRM-114 discriminator is based on the fact that most spam, normal log file messages, or other uninteresting data is easily categorized by a few characteristic patterns ( such as "Mortgage leads", "advertise on the internet", and "mail-order toner cartridges".) CRM114 may also be useful to folks who are on multiple interlocking mailing lists. In a bow to Unix-style flexibility, by default CRM114 reads its input from standard input, and by default sends its output to standard output. Note that the default action has a zero-length output. Redirection and use of other input or output files is possible, as well as the use of windowing, either delimiter-based or time-based, for real-time continuous applications. CRM114 can be used for other than mail filtering; consider it to be a version of GREP with super powers. If perl is a seventy-bladed swiss army knife, CRM114 is a razor-sharp katana that can talk. ----- How CRM114 Is Different From ... ----- CRM114 is different than procmail in that: * CRM114 code is readable by the uninitiated, while procmail code looks like modem noise. * CRM114 allows looping * CRM114 allows gotos * CRM114 allows nested statements in a useful way * CRM114 can learn, if you want. * CRM114 uses per-match control flags, rather than procmail's per-recipe control flags, and the control flags are words, not cryptocharacters. * CRM114 separates mail processing from mail delivery, rather than conflating the two. 
----- CRM114 is different from awk / gawk / perl / grep in that: * CRM114 is entity-oriented, and views the entire input as a single structured entity (structure is imposed during processing, rather than from within, as in XML); there is no concept of "lines", "words", "stanzas" or "records" unless you choose to put them there. * CRM114 tries to avoid the bizarre syntax, mind-reading, and action-at-a-distance of perl; * CRM114 can learn, if you want. CRM114 is unique in that: * CRM114 can use a swept window to manage the amount of data retained in each analysis pass; highly useful on log files and packet traces. * CRM114 can learn. Oh, just for completeness- yes, CRM114 is Turing-complete, as it can emulate (to within the limits of available memory) a single-tape Turing machine. To do this requires an interesting initialization of the input tape, which is left as an exercise to the reader (backwards hint: each symbol on the tape has two parts - the logic state, and a unique identifier; the identifier is used as a marker so that tape motion "to the left" and "to the right" can be performed. ----- Anything Else ? ----- Lastly, this guide is just an _introduction_ to CRM114. It doesn't explain all of the statements, nor does it fully explain all of the statements that it does cover. The QUICKREF quick reference card makes a much better attempt at covering every capability, at the expense of a terse format. If you want the big manual, we have that too- it's on the web page (but not part of this download; it's big). And again, CRM114 is GPLed software and a community effort - if you have an improvement, a bugfix, or even just a bug, please do report it back on the crm114 mailing list. You can get on the mailing list (a closed list, so it won't spam you) via a link on: crm114.sourceforge.net ----- Getting and Installing CRM114 --------- You should already have the source code. If you don't, you can fetch the full kit from Sourceforge. 
CRM114 is GPLed, you can use it freely without asking anyone for permission or paying any licensing fees. Open any browser, and go to: http://crm114.sourceforge.net Read the webpage- it will usually have direct clickable links to pull down both the most recent cutting-edge version of CRM114 (usually for developers and testers), and the "Recommended for Users" version. Click on the version you want, and downloading will commence. Once you have the .gz file(s), you will need to unpack them. If you have .gz files, type: tar -zxvf crm114-whateverversion.tar.gz and the full source directory will be built in your current directory Now, cd down into that source directory, become root, and type: make install to build and install the executables and utilities. If the make complains of not being able to find the TRE approximate-regex library, you can either: Plan A) install TRE libraries from your distribution. This is recommended, and how to do so varies with your OS. For Ubuntu, it is installed with: sudo apt-get install libtre-dev or you can: Plan B) install TRE libraries manually. Obtain the TRE source directory from http://www.laurikari.net/tre/, and compile it statically. zcat tre-0.7.5.tar.gz | tar xvf - cd tre-0.7.5 ./configure --enable-static make make install Then try to build CRM114 again. You can then execute the executable with: ./crm [ [ [ [....]]]] . To install crm114 as a systemwide utility, type "make install" to install it as /usr/bin/crm so anyone can use it. Now would be a _good_ time to read the CRM114 QUICK REFERENCE CARD, which is one of the files you already have. A lot of it won't make sense... yet. But it will, soon enough. ----- Getting Started ----- Crm114 is a filter, like "grep" or "wc". It reads from standard input, and outputs to standard output (mostly- these can be overridden). 
By default, crm114 runs your program in the following steps: 1) it reads your program in 2) it runs a preprocessor over your program 3) it runs an incremental microcompiler over your program 4) it reads standard input until either it hits EOF (^D on the keyboard), or until it exhausts the data window size (which you can change with the -w parameter; the default at version 2003-02-19 is sixteen megabytes). 5) Then the crm114 runtime system actually runs your program. Program execution is on a line-by-line, JIT-compiled style. To speed things up and detect some errors, CRM114 does a microcompile to convert your program into a VHL representation which is then interpreted. This is not a full compile; since many arguments can only be evaluated in the dynamic context of a partially-executed program, a full compilation is not possible in any case. Put only one statement on a line, if possible (this is the recommended style). If you can't, separate the statements with semicolons. Here's a VERY simple program. output /Hello, world! \n/ which accepts an arbitrary input (just hit ^D for now), then outputs Hello, world! Some mechanics- assuming you you want to run these programs as standalones, make sure the first line of your program is a line that looks like this: #! /usr/bin/crm If you put this at the start of each file, the shell will know your program is a CRM114 program and will automagically load CRM114 to run your program. You will also need to do a "chmod o+x yourfilename" to enable the file as an executable. If you don't want to do both of these things, you can still run a bare crm114 program as a command-line argument: crm filename If you just want to dash off a one-liner, you can put the whole program onto the command line between curly braces (the quotes are so the shell will pass on your program text without doing any substitutions.) crm '-{ output /Hello, world! \n/ ; }' Here's another version of the same "Hello World" program: crm '-{ output /Hello, world! 
:*:_nl:/ ; }' Note the ':*:_nl:' at the end of the output line. It contains two parts: the value name :_nl:, which is initialized by crm114 to a newline (to C programmers, it's a '\n' ). Putting a ":*" on the front of a value name means "put my value in here instead of my name". So, :*:_nl: turns into a newline character when the output statement is executed. (nota bene: the ':*:' does this name-to-value translation only once. So, if you had a value named :foo: with the value ":*:bar:", and :bar: had the value "FOOLED YOU", :*:foo would evaluate to ":*:bar:", not to "FOOLED YOU". If you want to do this multiple value resubstitution, you have to explicitly ask for this by using the :+: indirection operator instead of :*: evaluation operator. Why does CRM114 evaluate variables only once? It's so that you can embed any string you want and know what it will evaluate to. Notice in the README that there are : vars for several "tricky" characters. Note that I said "value name", not variable. In truth, crm114 _has_ _no_ _variables_; all data storage can be viewed as start/length pairs indicating ranges of character strings existing on a few huge strings. The default string (called the default input window buffer) is filled with stdin (until EOF) during program startup, another string is initialized with a few standard values, and is available for scratch use as needed. (well, _by default_ the input window buffer is filled from standard input; this can be overridden easily) All variables are really captured values - these are just start/length indices into these big strings. The power of this is that these captured values can overlap and so the view of the input data as a contiguous whole is not disrupted. These overlapping values retain any heirarchial structure you choose to impose. For instance, a multipart message can be easily manipulated, split, some XML file hierarchy can be manipulated, etc. 
If you need to, you _can_ create temporary, isolated variables - they are just other sections of a big string buffer that don't happen to be part of the input buffer (see ISOLATE, below). Instead of addition and subtraction, the basic operations in crm114 is the matching of one string against another, the capturing of a value, and the destructive replacement of one value with another. ----- Matching ----- Here's a simple example of a CRM114 program that does string matching. #! /usr/bin/crm { match /foo/ output /Hey, there's a foo in the input \n/ } Try this program. Give it any input you want (remember to hit ^D to signal end-of-file if you are typing input from a keyboard). The result will be that the program will either do nothing at all, or it may print out "Hey, there's a foo in the input". Note that there's no "if" statement here (or, for that matter, in _any_ crm program). The MATCH statement is itself an IF statement. If the match succeeds, execution continues with the next statement. If the match fails, then execution skips to the end of the { } block. This "skip to end of block" is called a FAIL in CRM114 slang. By the way, if you should ever want to force a fail, there is a "fail" statement just for that. Crm114 statements have a general structure that looks like this: commands (vars) [restrictions] /regexes/ You'll find crm114 uses a standardized pattern of commands, then flags in <>, then vars in (), then substr restrictions in [], then regexes in // and block structures in {}. The only required order is that the command action must come first in a statement (and even that may be relaxed in the future.) But, back to programming. We can change the program just a little, to look for input files that contain any arbitrary regex-able string. We can also change the program to either reject the entire input (and output nothing - this is the default), or to ACCEPT the entire input as it currently exists. As an example, this little program looks for zebras. 
If the input file contains at least one "zebra", it outputs the entire input file. If it doesn't contain at least one zebra, it outputs nothing. This program also uses the "accept" statement. ACCEPT means "take whatever the current data window is, and write it to standard output." Many "go/nogo" filters will use ACCEPT as an easy way to ... well, accept their input as good. #! /usr/bin/crm { match /zebra/ accept } You don't have to be limited to fixed strings in the match. You can use the full Posix Extended match syntax. (type 'man 7 regex' to see more, or look in the QUICKREF.txt file). You can use backreferences, such as accepting only files that contain a four-letter palindromic sequence: #! /usr/bin/crm { match /(.)(.)\2\1/ accept } You can even use approximate matching, such as accept any file that contains a string that can be converted to "Niagara Falls" in no more than three inserts, deletes, or substitutions: #! /usr/bin/crm { match /(Niagara Falls){~3}/ accept } CRM114 is built with the TRE REGEX library as you no doubt read above, and uses the REG_EXTENDED mode of operation exclusively. One (current) limitation of TRE is that if you use approximate regex matches, you can't use backreferences and vice versa. Instead of REG_BASIC, TRE offers the mode, where no character has special meaning. Building CRM114 with the GNU regex library is no longer supported. GNU regex doesn't support approximate regexes, nor mode, and back-references like \1 never seem to work right for me, so it is no longer included in the source code. As in most POSIX libraries, the first match possible in a string is the one found, and given that starting point, the longest match possible with that starting point is used. Sub-matches (enclosed in parenthesis) are similarly located and extended (first found, then longest with that starting point). By default, matches can span lines; the regex /.*/ with no flags will match the full input window. 
Some handy POSIX-extended regexes are: ^ as first char of a match, matches only at the start of the matchable block (that is, the first character of the string for most matches, and the first character of a line for matches). $ as last char of a match, matches at the end of the matchable block (that is, the last character of the string, and the last character of the line for matches). . (a period) matches any _single_ character (except start-of-line or end of line "virtual characters", but it does match a newline). The following are other POSIX expressions, which mostly do what you'd guess they'd do from their names. [[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] [[:lower:]] [[:upper:]] [[:graph:]] <-- any character that puts ink on paper or lights a pixel [[:print:]] <-- any character that moves the "print head" or cursor. [[:punct:]] [[:space:]] [[:xdigit:]] Additionally, a '*' means "repeat preceding zero or more times", a '+' means "repeat one or more times", and a '?' means "repeat zero or one time". *?, +?, and ?? are the same, but match the _shortest_ match that fits, rather than the longest. You can specify repeat-counts as well. {N} means match N copies, {N,M} means any number of copies between N and M inclusive, and {N,} means match at least N copies. (N and M are sadly limited to 255 or less by POSIX.) TRE extends POSIX with approximate matching - {~N} means with no more than N insertions, deletions, and substitutions, and {~} means "closest match, no matter how many errors". Note that a string of length Z can be subjected to Z deletions and therefore "match" the empty string, watch out for this quaint (but mathematically correct) behavior if you use {~} matches. You can also specify some relative costing between insertions, deletions, and substitutions; QUICKREF.txt contains some further examples. ----- Comments ----- Comments in a CRM114 program start with a '#' sign and continue until either a newline or a "\#". 
Note that a ';' (a semicolon) does NOT end a comment (the reason it doesn't is because the semicolon is too often found _in_ a comment, whereas \# is pretty rare. It's a good idea to use "block comments" throughout your CRM114 programs; even though comments can be deceiving, it's usually better to have them than not to. ----- Capturing a value from a match ----- We can capture the values matched by the extended regex or even subparts of the extended regex; any variable name(s) enclosed in parenthesis in the match statment will be attached to successive parenthesized subexpressions (note- the first variable name, if it exists, is always bound to the _entire_ matched stream). One additional bit before our next example program: crm114 lets you see the command line inputs. These are some of the special temporary values; they appear as :_arg0: through :_argN:, and "positional" arguments (those _not_ of the form "--name=value") also appear as :_pos0: through :_posN: . By looking at these arguments, we can change our program's behavior from the command line. Let's re-write a basic grep then: #! /usr/bin/crm { match (:result:) /(:*:_arg2:)/ output /:*:result:/ } which indeed does function pretty much like grep, except it outputs only the matching string. This tells us the string was indeed present in the input stream, but doesn't give us any context. We can modify the program to work just like grep, by requiring the entire match to be satisfied on a single line, and by outputting the entire line found. To do this, we use a "modifier flag" on the match statement. Here, we want the match statement to be restricted to a single line, so we use the modifier flag on the match statement. Since the match is now limited to just the line that contained the input pattern, we can put a .* both in front and in back of the actual :*:_arg2: pattern. ( the pattern ".*" matches the longest string possible without caring what it's matching. 
It's a wildcard string) Here's the modified program: #! /usr/bin/crm { match < nomultiline > (:result:) /(.*:*:_arg2:.*)/ output /:*:result:/ } This works reasonably well, except it only shows us the first match. We can fix that with two more pieces: -- the "fromend" flag, which tells the match to start looking for a match at the end of the previous match, and --the LIAF statement, which tells program execution to go back to the start of the most recent program { } block and run again. (by the way, you can redirect any particular OUTPUT command to a file, by supplying the file name (or a variable with the right value) in [square_brackets] before the /output values/. To append to a file, put the flag in the OUTPUT statement; otherwise you will overwrite the contents of the file. The 'liaf' statement is the reverse of "fail". LIAF tells the execution to skip UPWARDS in the program, back to the _start_ of the enclosing { } block. You can remember that "liaf" is "fail" spelled backwards, or you can pretend it stands for Loop Iterate Awaiting Failure; either works as a mnemonic. Here's the program with the flags and liaf in place; we also put in a newline in the output so each separate line appears on a new line: #! /usr/bin/crm { match < nomultiline fromend> (:result:) /(.*:*:_arg2:.*)/ output /:*:result:\n/ liaf } and sure enough, it acts like grep (without some of the flags that grep has), but this version of grep can now do approximate matching. As long as the MATCH succeeds, execution continues through the OUTPUT statement and hits the LIAF. The LIAF statement bounces execution up to the open '{' statement and execution continues from there, down onto the MATCH statement again. [ note: You'll find that if you use this program very much that the pattern in arg2 is used as a regex. It's not a literal match, but a match that allows wildcards. 
If you wanted to not allow wildcards, you'd need to specify <literal> as well as <nomultiline> and <fromend>, or you can use the \Q directive to specify verbatim quoting; \Q.*\E specifies the string of a dot followed by a star exactly.
We also need to initialize our search at the beginning of the input but not use up any characters; the "match //" statement does that. The program crux would now look like: ... match // { match (:a_foo:) /foo/ alter (:a_foo:) /IT'S A BAR NOW/ liaf } ... ----- ISOLATE and Isolated Variables ----- The power to surgically alter the input is fine and dandy if we know precisely what alterations we want to make, but what if we don't want to mutilate the input, just want to do some specialized searching or produce a tenative value? We can do this by ISOLATEing any variable we want to preserve as separate from the input buffer, and then putting the desired values into that variable with the ALTER command. Note that the special ISOLATEd behavior of a variable only lasts as long as it's not re-assigned by a MATCH. This is intentional but can be the source of some misunderstandings because you can ALTER an ISOLATEd value and you can use its value with :*: and it stays ISOLATEd, but if you should bind it in a match, its ISOLATEed property is lost. An ISOLATEd variable is initialized with the value of a zero-length string, in case you wondered. Try this: crm '-{ isolate (:foo:) ; output /a:*:foo:z/; }' (remember to hit ^D so your program doesn't wait for an input that will never arrive). You'll get back the result "az", showing that the value of a freshly isolated variable is a string of length zero. If you want to set an initial value on an isolated variable, put the value in /slashes/. Example: crm '-{ isolate (:foo:) / Hi there! / ; output /a:*:foo:z/; }' which results in: a Hi there! z Lastly, if you ISOLATE a variable that already has a value, the result is that you make a new copy of the variable. This is not destructive of the old copy... it's still there and intact, in case any other variables happen to be using the same strings. 
It is important to remember that setting a captured value with a MATCH statement really just changes the start and length of that variable's pointers, it doesn't change any actual strings in memory. Setting a captured value with an ALTER statement actually _does_ change the string in memory. More precisely, an ALTER leaves the start location at the same place, but the old string is deleted, and the new string is inserted. Other captured variables may well change as well during an ALTER, it depends on how they overlapped the ALTERed variable. Here's an example - this demo file expects you to give it the input string of "abcdefghijklmnop", so type that in as soon as the program starts (there is no prompt, just type it in, and then EOF (usually control-D): #! /usr/bin/crm { match <> (:big:) /.*/ output /----- Whole file -----\n/ output /:*:big:/ output /----------------------/ match <> (:1:) /abcde/ match <> (:2:) /cde+fg/ match <> (:3:) /fghij/ output /\n 1: :*:1:, 2: :*:2:, 3: :*:3: \n/ output / ---altering--- \n/ alter (:2:) /CDEEEFG/ output / 1: :*:1:, 2: :*:2:, 3: :*:3: \n/ output /----- Whole file -----\n/ output /:*:big:/ output /----------------------\n/ match <> (:big:) /.*/ output /----- Rematched Whole file -----\n/ output /:*:big:/ output /----------------------\n/ } Notice how any captured variable that overlapped the ALTERed variable also changed? That's both very powerful and rather dangerous- be careful how you ALTER anything that isn't ISOLATEd. Input is possible other than via the input window; the 'input' statement reads a line of input from stdin and puts it into a captured variable. This is equivalent to the ALTER statement. If you don't want to modify something important, you should ISOLATE this variable till you have checked the input to be something you want (if the variable hasn't been captured or ISOLATEd before use, the value is ISOLATEd). Example: #! 
/usr/bin/crm window { output /\n ------INPUT TEST ---/ input (:x:) output /\n Got: \n:*:x: \n/ match [:x:] /foo/ output /\n it had a foo/ } This little program reads one line of input, outputs the line, and then searches it for a foo. If the foo is found, the program confirms this, and then exits. Note that match uses [:x:] to specify the input being matched against, while it uses (:x:) to specify the output of the resulting match. ----- WINDOWing through an infinitely long Input ----- You can control the rate and style of input into the input window with the WINDOW statement. By default, crm114 reads input till the first EOF, and then never reads again. With WINDOW, you can read as many times as you want, controlling the input buffer size as well. (this is _very_ handy when you're writing a filter to monitor an ever-growing syslog file, or sitting on a logging port that never EOFs). The WINDOW statement takes one of three flags (see next paragraph), and two regex patterns. It deletes characters in the input window buffer up to and including the first regex, then reads standard input until it finds the second regex, appending that to the end of the input buffer. Using WINDOW in a loop lets your program inch its way through an infinitely long file (and yes, we do mean "infinitely". The program will process the infinitely long input file one window's worth at a time. ). Since regex-matching is slightly expensive in terms of CPU, WINDOW has three flags that tell it how often to check for the 'got new input completed' regex. Those flags are bychar, bychunk, and byeof. With bychar, the regex is checked on every incoming character (assuming your input tty is already set to unbuffered operation), bychunk checks on every input "block" where a "block" is a conveniently large chunk of I/O, and byeof checks only when an EOF is read. 
(don't worry if your input stream is buffered, characters after the regex are NOT thrown away but saved for the next execution of a 'window' statement.) One last bit on WINDOWing - if a WINDOW statement is the first statement in your program that can affect the input window buffer, the normal crm114 behavior of reading the entire standard input till EOF is suppressed and your window statement takes over. If your window statement doesn't have any arguments, then no input is done, and your program starts running without waiting for any input at all. Yes, this is slighty hackish, live with it or come tell me a better way. Here's an example of a WINDOW - keep reading input, even past EOF, and look for occurrences of either 'apple' or 'banana'; if either is found, print a message. Note that you can't do this with grep because grep can't re-read past the first EOF, nor can grep mutilate the output. #! /usr/bin/crm { window /\n/ /\n/ { match (:my_fruit:) /apple|banana/ output /Found fruit: :*:my_fruit: ... good! \n/ } liaf } Now, why would you ever use this? How about for parsing a syslog file for security alerts like failed root logins, or attempts to open port 421 ? :-) Note the liaf-loop above- this is the "recommended" style to write an infinite loop, or a program that's supposed to run nearly forever. ----- Matching inside variables ----- We can restrict matching to be inside a particular value (the value can be isolated). For example, here's a simple program that accepts only input files that contain 'apple' in the first string found that begins with 'START' and ends with 'END'. #! /usr/bin/crm { match (:my_string:) /START.*END/ match [:my_string:] /apple/ accept } The bracketed parameters '[:your_variable:]' tell the match statement to restrict matching to inside the variable mentioned. One issue- the above example does two things strangely- one, it's case-sensitive ( "START apple END" works, but "start apple end" doesn't). 
Secondly, after it finds the first 'START whatever END', it commits to using that one, even if a second one exists. We can fix the first problem by using the "nocase" flag on both matches, and fix the second problem with a liaf loop. But, remember that a liaf-loop runs until one of the toplevel matches fails, so we need an escape out of the inner match/accept on 'apple'. Here's the code: #! /usr/bin/crm { match (:my_string:) /START.*END/ { match [:my_string:] /apple/ accept exit } liaf } ----- Getting INPUT from other places ----- You can do explicit INPUT of information with the INPUT statement; the INPUT statement works as follows: 1) if you don't specify an input filename in square brackets like this [ myfile.txt ] then input will read from stdin (a clearerr() is done first, so if you've already hit EOF on stdin, you will be able to read past that EOF should more input be available.) 2) if you specify , only the first line of the input file is read. ----- Getting a quick hashcode ----- At some point, you may want to take a captured value and make some hashcode or digest. The HASH statement does this conveniently; HASH is like ALTER but instead of surgically altering the variable to the expanded /slashed value/, it expands the slashed value and then takes a hash of that. The hash is a 32-bit hash, expressed as an eight-character hexadecimal string. You should use HASH in cases where you need a short index to a long string (for efficiency or database access), or where you need to provide a hard-to-invert password check. (note- because this is only a 32-bit hash, it's not particularly secure and should be viewed as a "picket fence", rather than as a "bank vault door". Adding a "salt value" to the /slash pattern/ will greatly increase resistance to dictionary attacks. 
Putting a randomly chosen dictionary word and number in front of the hashed value and another randomly chosen dictionary number after the hashed value will greatly increase your security; using a pair of HASHes, with different salt values will also greatly increase security. For example: #!/usr/bin/crm hash (:_dw:) /:*:_dw:/ accept will generate a quick-and-dirty hashcode of the input file. Note that this hash is NOT cryptographically secure; it can be broken in a few minutes of CPU time on any modern computer desktop. If you need security, use MD5. ----- LEARNing and CLASSIFYing ----- The next two statements in crm114 are the hardest to understand, because they are the 'learn' and 'classify' statements. These statements attempt to identify types of inputs based on word and phrase similarity. As of build 20020501, all phrases of up to four words are weighted equally in the classifier, and as of build 20031215, a better weighting (Bayesian/Markov Modeling) is used to get improved accuracy). Builds past 20040101 use chains of five words for yet more accuracy. The details of all this are explained in the file "classify_details.txt", but you don't need to understand them to use the classifiers. The LEARN statement updates a file of hashed phrase structures with the contents of the specified [ ] variable. If you don't specify an input variable, the default data window :_dw: is used as the input buffer. You will have to specify the classname you want to learn, and a regex that defines what a "word" is. For english text, a good regex is [[:graph:]]+ , which is a string of characters that all have some nonblank, noncontrol characters. The LEARN statement creates a file with the same name as the classname to be learned, so watch out and don't clobber a file you want to keep. The CLASSIFY statement uses two or more of these classname files from LEARN to classify an input buffer into types. 
As with LEARN, the CLASSIFY statement accepts a [ ] input variable containing the text to classify. If you don't specify an input variable, the default data window :_dw: is used. You specify any number of classes (each one must have a preexisting hashed phrase file) and a regex to define a word (again, [[:graph:]]+ is a good place to start). CLASSIFY then compares the input window against each of the classes in turn. If the class that best matches the input window occurs _before_ the '|' marker in the list of hashed phrase filenames, 'classify' succeeds and execution of your program continues with the next line. If the class that best matches the input window occurs after the '|', then the classify statement fails to match, and execution skips to the end of the { } block (just like a match statement). CLASSIFY can take a second variable (in parens (:here:) like that) which will be ALTERed to contain a text-formatted set of matching statistics. This can be useful if you want to do some sort of mathematical comparison or checking. ----- IF-THEN-ELSE without IF, THEN, or ELSE ----- MATCH and CLASSIFY can act as IF-statements, but what about IF-THEN-ELSE situations? for that matter, how can we implement CASE statements, where we want one (and only one) of N different alternatives to execute? The ALIUS statement provides this functionality. "Alius" is latin for "other" or "another" (or, more literally "the other man"). An ALIUS statement looks at the most recently completed bracket-block of code - if _that_ bracket block failed (exited because a MATCH or CLASSIFY failed, or because of a FAIL statement), then ALIUS is a no-op and execution continues with the next statement. If the most recently completed bracket block completed successfully (didn't exit due to a MATCH fail, CLASSIFY fail, or FAIL statement) then ALIUS itself is a FAIL statement, and causes a skip to the end of the current (outer) bracket block. 
This is a skip, not a FAIL, and so a surrounding ALIUS on the outer bracket block won't itself FAIL. Here's an example of ALIUS used for a 3-way case statement: #! /usr/bin/crm # test the alius statement { { output /checking for a foo.../ match /foo/ output /Found a foo \n/ } alius { output /no foo... checking for bar,,,/ match /bar/ output /Found a bar. \n/ } alius { output /neither foo nor bar \n/ } } output / That's all, folks! / When you run this, you'll see that each MATCH test is applied in sequence, and as soon as a MATCH succeeds (and so has a bracket-block complete successfully) that's the end of the program's execution. You _can_ program this with a lot of goto's, but it's much easier to use ALIUS. If ALIUS still confuses you, pretend that ALIUS really means "IF THAT WORKED, SKIP THE REST OF THIS BLOCK, OTHERWISE TRY THIS NEXT BIT OF CODE AND SEE IF IT WORKS OR NOT" which is pretty much what it does. ----- Minion Processes and Syscalls ----- CRM114 has a fairly powerful mechanism for creating and communicating with subprocesses, called "minion processes". You can have an unbounded number of minion processes, and minion processes can run in parallel with CRM114, repeatedly receiving input from CRM114 and outputting to CRM114. The minion processes can also do other things besides talking to CRM114. Here's an example program that runs some minion processes; the first one runs "ls" (and gets a file listing), the second runs 'bc', and uses bc to calculate 1 + 2 + 3. We then play some games, running "ls -la", cat-ting into a file, and using asynchronous input to accomodate slow programs (or those with HUGE outputs). This program also uses the 'window' statement by itself to inhibit any reading of standard input, so this program just goes off and runs without waiting for any input. #! 
/usr/bin/crm window { isolate (:lsout:) output /\n ----- executing an ls -----\n/ syscall ( ) (:lsout:) /ls/ output /:*:lsout:/ isolate (:calcout:) output /\n ----- calculating sum of 1 + 2 + 3 using bc -----\n/ syscall ( 1 + 2 + 3 \n ) (:calcout:) /bc/ output /:*:calcout:/ isolate (:lslaout:) output /\n ----- executing an ls -la -----\n/ syscall ( ) (:lslaout:) /ls -la/ output /:*:lslaout:/ isolate (:catout:) output /\n ----- outputting to a file using cat -----\n/ syscall ( This is a cat out \n) (:catout:) /cat > e1.out/ output /:*:catout:/ # note that we expect :catout: to be null isolate (:c1: :proc:) output /\n ----- keeping a process around ---- \n/ output /\n preparing... :*:proc:/ syscall ( a one \n ) ( ) (:proc:) /cat > e2.out/ output /\n did one... :*:proc:/ syscall ( and a two \n ) () (:proc:) // output /\n did it again...:*:proc:/ syscall ( and a three \n) () (:proc:) // output /\n and done ...:*:proc: \n/ output /\n ----- doing asynchronous reads from a minion-----\n/ isolate (:lslaout:) syscall () (:lslaout:) (:proc:) /ls -la /dev / output /--- got this immediate : \n :*:lslaout: \n ---end-----/ :async_test_sleeploop: output /--- sleeping 1 seconds ---/ syscall <> () () /sleep 1/ syscall () (:lslaout:) (:proc:) // output /--- and got this async : \n :*:lslaout: \n ---end-----/ { ### if we got at least three chars, we should look for more. match [:lslaout:] /.../ goto :async_test_sleeploop: } syscall <> () (:lslaout:) (:proc:) // output /--- and synch : \n :*:lslaout: \n ---end-----/ } ----- INSERTing a file verbatim ------ At some point, you may desire to call a second crm114 program from the current program. There are two ways you can do this: either SYSCALL it (as above), or you can INSERT the program text verbatim into your current program. Either works; syscalling keeps the variables and data windows of the two programs separate, while INSERT actually makes one big program file. 
One issue on INSERT - all INSERTs happen at the very start of program setup, during preprocessing, and way before micro-compilation and execution, even before the data window gets loaded from standard input. This means that the only variable filenames you can INSERT into your program are those that are defined via command line arguments; you can't compute :filename: and then INSERT :*:filename: in your program (the compiler would get very sick if you tried!). But you _can_ SYSCALL if you really need this functionality. ----- Doing Math and EVAL ----- At some point, you may need to do math, or evaluate a mathematical expression. The EVAL statement does this. EVAL is like ALTER, but instead of evaluating its arguments left to right once, it repeatedly evaluates the arguments until they stop changing (EVAL does do a little bit of smart cacheing so that it can catch arguments that loop). EVAL actually keeps a log of the hashes of each intermediate state and checks this log on each pass of expansion. The default as of version 20040210 is 4096 states in the statelog, and if your program tries to EVAL a string that keeps changing for more than that number of passes, it's a nonfatal error. EVAL also defaults to allowing extended var-expansion; in extended var-expansion the string expansion operator :*: is retained, but two new ones are added: :#:var: - returns the number of characters in var :@:math_expr: - evaluates math_expr and returns the numeric result as a string. The mathematical expression evaluator can work either in algebraic notation (with left-to-right precedence, overridden only by parenthesis), or in RPN notation (like an HP calculator). If you use a relational mathematics operator like >, =, or <, then EVAL itself will evaluate the truth status of that operator, putting a 1 or 0 in for true or false, respectively. 
After completing the mathematical evaluation and ALTERing the result variable (if there is one), EVAL will then do one of the following: - if no relational mathematical operator was used, execution continues with the next statement. - if a relational mathematical operator was used, and the relation result was TRUE, execution continues with the next statement. - if a relational mathematical operator was used, and the relation result was FALSE, then EVAL does a FAIL to the end of the bracket-block (and an ALIUS statement will see this as a FAIL). Here's an example: #!/usr/bin/crm { window isolate (:z:) eval (:z:) / The length of 'foo' is :#:foo: letters / output /:*:z: \n/ eval (:z:) / and (2 * 3) + (4 * 5) is :@: (2 * 3) + (4 * 5):/ output /:*:z: \n/ } which gives you: The length of 'foo' is 3 letters and (2 * 3) + (4 * 5) is 26 which is as you would expect. ----- FAULT and TRAP ----- CRM114 programs can encounter errors during execution; an error can often be "fixed up" and execution continued, or at least the program can clean up and exit gracefully. Whenever an error occurs, it creates a string that describes the problem. This string is normally printed out as the error message. However, it can be used by the program itself to attempt to fix the problem before the program itself fails. The TRAP statement is how a program can catch an error before the program fails. The TRAP will "catch" almost any program error that occurs (and all of these conditions are true): - inside the bracket-block that holds the TRAP statement, - occurs above the trap statement - and the error message describing the error is matched by the TRAP statement's regex. If the TRAP statement's regex doesn't match the error message, then the next TRAP outward will be activated, and the process repeats. If no TRAP can handle the error, then your program will exit if the error was fatal, or print out the error and continue if the error was just a warning. 
If you need to create your own "errors" during a program run, such as if you find a file is missing or important data is not properly formatted, you can force an error with the error message of your choice with the FAULT statement. The FAULT statement creates the fault string you describe, which is still matched against the REGEX in each enclosing TRAP. If you have two TRAPs in series, the first TRAP gets first try at matching the FAULT regex, then the second one. Note that there is no "return from TRAP" - once a trap occurs, the trap code must GOTO or otherwise properly resume execution in an appropriate place. The reason for this is that many TRAPs really aren't "fixable" in the complete sense; the most that can be done is to issue an error message and exit gracefully. Additionally, there are some errors that simply aren't recoverable in a TRAP. For example, a fault that occurs during preprocessing or inside the microcompiler can't be caught by a TRAP, because the TRAP hasn't been compiled yet. It's also possible to create a FAULT situation where attempting to read the fault string itself causes an error. In this case, TRAP itself can't function and the error just forces a sad error message and CRM114 will terminate without grace or honor. ----- In Conclusion ----- This is the end of the Introduction to CRM114. There are quite a few statements and options in the QUICKREF that aren't discussed here in this document. Feel free to explore. If you come up with a good introduction to the use of a statement or technique, send it to me and I'll put it here! That's it.... a basic introduction to CRM114. Have fun and don't break anything. ----- Appendix 1 - Useful Idioms ----- A Few Useful Idioms: * - LIAF-looping - Use the liaf (Loop Iterate Awaiting Failure to iterate your way through the entire input window. For example: ... { match (:what_you_seek:) /a_regex/ ... # your code goes here liaf } ... 
will execute your code ONCE for each occurrence of the regex in the input window. * - null-WINDOWing: The WINDOW statement causes the data window to be updated... _except_ the "nonsense" WINDOW statement that contains no cut-to-here regex nor any fill-to-here regex, only when it's the first executable statement of your program, tells the compiler to _skip_ all data window input until you specify it later in the program with a second WINDOW statement (or skip it entirely, if there is no second WINDOW statement). Example: #!/usr/bin/crm { window output /Hello, world! \n/ } doesn't read any input at all. It just prints out "Hello, world!" * - file-CATting: to get input from a file rather than from stdin. The easiest way to read in an entire file (of reasonable length) is to "cat" the file into an isolated variable. E.g.: ... isolate (:my_data:) syscall () (:my_data) /cat < whatever_file_I_want.txt / If the file is truly huge (larger than fits in an I/O buffer), you can use the flag to get only as much as will conveniently fit, e.g.: ... isolate (:some_data: :my_proc:) :loop_here: syscall () (:some_data:) (:my_proc:) /cat /var/log/messages/ # # do something useful here. # goto :loop_here: If the result can take a long time to produce (say, because it's going out over the network to a slow server), then the flag reads only what is available and returns with that, without waiting for an EOF. ... isolate (:some_data: :my_proc:) :loop_here: syscall () (:some_data:) (:my_proc:) / cat /var/log/messages / # # do something useful here. # goto :loop_here: * - Processes that return more than 256K of text, possibly infinite amounts... Here's a way to cope with processes that return more than 256K of text (the limit for dynamically allocated heap in some kernels is 256K, so that's why this artificial limit exists). This example does an ls -la on /dev, which is usually more than 256K long (typically around 350K as of Linux kernel 2.4.18). 
Note that "do the work" here is to ACCEPT the contents of the data window; we could do anything else we wanted instead. window isolate (:p:) { syscall () (:_dw:) (:p:) /ls -la \/dev / # # do the work here... { accept } match /.+/ liaf } The important bits of code here are the syscall to launch the process (notice it's with the KEEP flag), and the subsequent MATCH /.+/ to check for more output. If there is more output, the MATCH passes and the LIAF kicks us back to the start of the { } block. If the match fails, the LIAF is skipped and the program exits. Cute, eh? Note that this program will fail if the SYSCALLed program simply is waiting for a slow network, etc. Since there's no way to determine whether a program that is just doing a long computation versus one that is truly wedged (it's a nasty version of the halting problem, proven by Alan Turing himself to be unsolvable), you'll have to use some artifice to determine that on a case-specific basis. Two good things you can try are: 1) do a SYSCALL to ps(1) with the PID and examine the returned string; 2) do a SYSCALL to sleep(1). for a few seconds and thereby do whatever timeout you desire. * - ALIUS-nesting. ALIUS checks to see if the most recently finished bracket-block completed successfully or FAILed out- but ALIUS itself isn't a FAIL. So, you can nest ALIUSed conditionals, like this: A? A1 or A2? B? B1 or B2? which would look like this: { { match /A/ { { match /A1/ ... } alius { match /A2/ ... } } } alius { match /B/ { { match /B1/ ... } alius { match /B2/ ... } } } } Note how each ALIUS looks at the most recently exited bracket-block, so nested IF statements don't get confusing (think about how you would write this in C to see the contrast) ----- Anyone else have any handy idioms they want to publish? ----- Things I'd like help on ---- 1) if anyone has strong bison-fu, and could give me a hand coming up with a real parser (not the handcarved crock that's in the current microcompiler) that would be great. 
2) a few programs (like a spamkiller) would be nice... I have one but it's tailored to *me* . Suggestions, anyone? (yes, there's one in the distro now, read the README on it! It's about 99.95 per cent accurate as it stands, on my personal spam mix (for comparison, SpamAssassin is only around 90% accurate). -Bill Yerazunis crm114-20100106-BlameMichelson.src/traptest.crm0000755000000000017500000000175311321154266017357 0ustar rootwsy#! /usr/bin/crm # # traptest.crm - Test for traps # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # test the alius statement { { { output /checking for a foo.../ match /foo/ output /Found a foo :*:_nl:/ } alius { output /no foo... checking for bar.../ match /bar/ output /Found a bar. :*:_nl:/ } alius { output /found neither... continuing :*:_nl:/ fault / plutonium / } trap (:my_fault:) /nuts/ { output / dammit! / } } { { output /checking for a baz.../ match /baz/ output /Found a baz :*:_nl:/ # the next line intentionally doesn't trap! trap { output / YOU SHOULD NEVER SEE THIS / } } alius { output /no baz... checking for wugga.../ match /wugga/ output /Found a wugga. :*:_nl:/ } alius fault / cork / output /found neither baz nor wugga :*:_nl:/ } trap /.*/ (:my_fault:) { output / thrice damned - fault was :*:my_fault: :*:_nl: / } } crm114-20100106-BlameMichelson.src/fataltraptest.crm0000755000000000017500000000170611321154266020365 0ustar rootwsy#! /usr/bin/crm # # fataltraptest.crm - test trap statement # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # test the trap statement window output / :*:_nl: CRM114 testing FAULT\/TRAP... :*:_nl: / output / --- you should not see an error yet --- :*:_nl:/ { { output / you should see this -->/ } { trap /.*/ output / BUT YOU SHOULD NOT SEE THIS / } output /<--and this, with nothing in between. 
:*:_nl: :*:_nl:/ } output / --- the next thing you see SHOULD be an error "unable to read-open"--- :*:_nl:/ { { input [zoob.zipulrlfjf] (:hi:) } trap (:my_fault:) /.*/ { output / Caught the error - fault text was :*:my_fault: / } } output / --- and again, the next thing you see SHOULD be an error "unable to write-open"--- :*:_nl:/ { { output [/No/Such/Directory/frotz.mumble] (:hi:) } trap (:my_fault:) /.*/ { output / Caught the error - fault text was :*:my_fault: / } } output /:*:_nl:/ crm114-20100106-BlameMichelson.src/aliustest.crm0000755000000000017500000000062711321154266017525 0ustar rootwsy#! /usr/bin/crm # # aliustest.crm - test alius statement for foo and bar # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. { { output /checking for a foo.../ match /foo/ output /Found a foo :*:_nl:/ } alius { output /no foo... checking for bar.../ match /bar/ output /Found a bar. :*:_nl:/ } alius { output /neither foo nor bar :*:_nl:/ } } crm114-20100106-BlameMichelson.src/maillib.crm0000755000000000017500000003413011321154266017115 0ustar rootwsy#! /usr/bin/crm # # maillib.crm - handy library for mail whacking # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. ########################################################## # # :load_cf_file: # # Calling Sequence: # call /:load_cf_file:/ [filename] # Returns: # nothing. This routine ONLY sets up variables from the .cf file # ########################################################### # # Load in the specified .cf file. The .cf file needs # to be in the format # :varname: /value/ # # Blank lines and lines starting with a # are ignored # # Note that because this happens during the run, stuff set in # a .cf file will _override_ any comand line arguments that get set. # This could (and should) probably be changed. 
# :load_cf_file: (:cf_filename:) { isolate (:option_txt:) isolate (:ev:) isolate (:verbose_startup:) # # Part 1 - read in the options/configuration file # { { match [:cf_filename:] /.+/ return } input [:*:cf_filename:] (:option_txt:) } # # # reset loop for matching to start of :option_txt: match [:option_txt:] // # and loop till there are no more options. { # find a line that looks like a parameter setting... match < fromend nomultiline > (:line: :name: :value:) \ [:option_txt:] /^[ ]*(:[[:graph:]]+:)[ \t]+\/(.*)\// { # don't execute the assign if there's # a # at the start of the line. match [:name:] /^\x23/ { # Verbose startup? match [:verbose_startup:] /SET/ output / :*:name:\n :*:value:\n/ } isolate (:*:name:) /:*:value:/ } liaf } } # All done, return. return ########################################################### # # :mail_preprocess: # # Calling Sequence: # call /:mail_preprocess:/ [text to be preprocessed] # # Returns: # the processed text # ########################################################### # Preprocess a piece of mail to whatever we've specified in the # loaded .cf file setup. :mail_preprocess: (:mutilation_input:) # # We take the input and create a mutilated, annotated copy # of the incoming text. The mutilations are defined by # whatever the .cf file has set up. # will become an annotated _copy_ of the incoming text, # with whatever changes we think will help us classify better. # # We clip m_text to be the first :decision_length: characters of # the incoming text # match (:m_text:) [:mutilation_input: 0 :*:decision_length:] /.*/ isolate (:m_text:) # # :b_text: is the text with base64's expanded. isolate (:b_text:) /:*:m_text:/ # # :i_text: is the text with Hypertextus Interruptus removed. isolate (:i_text:) /:*:m_text:/ # isolate (:commentbin:) // # # # do we do any expansions? { # expansion 1: - do we perform base64 expansions? 
{ match [:do_base64:] /yes/ { # yes, expand base64's if there are any # # Note: some spams don't even bother to use # a 'Content-Transfer-Encoding marker, # and even fewer use Content-Type: text/whatever # so we have to sort of wing it, when to expand # what _might_ be base64 and when to ignore it. # For now, if it says it's a base64, it gets # expanded, no matter what the type. Maybe # someday someone will put in a lockout for # things like .jpg files, .doc files, etc. # match [:m_text:] (:a: :h: :b:) \ /(Content-Transfer-Encoding): base64(.*)/ match (:c:) [:b:] \ /([a-zA-Z0-9+=!\/]+:*:_nl:){2,200}/ isolate (:exp_text:) // syscall (:*:c:) (:exp_text:) /:*:mime_decoder: / # and stuff the result back into m_text for # classification right in context. alter (:c:) /:*:exp_text:/ # and mark this piece of mime as "prior". alter (:h:) /Content-Transfer-Prior-Encoding/ # repeat till no more Mime base64 encodings liaf } } # Expansion 2 - fetch and insert URLs into the stream for further # analysis. BUG NOTE: as originally written, this was fully recursive # without limit, and might concieveably spider the entire web. The # EVAL statement limits total fetched length to not more than # one fetch more than :decision_length: { match [:expand_urls:] /yes/ { match [:m_text:] (:url:) /http:\/\/[[:graph:]]+/ isolate (:wget_output:) // syscall /:*:url_fetch_cmd: :*:url: | :*:url_trim_cmd: / \ () (:wget_output:) alter (:url:) /:*:url:\n :*:wget_output: \n/ eval /:@: :#:m_text: < (:*:decision_length: \/ 4) :/ liaf } } # expansion 3 : do we bust HTML comments ( a.k.a. # hypertextus interruptus) out? { match [:undo_interruptus:] /yes/ { match [:m_text:] (:comment:) // alter (:commentbin:) /:*:commentbin: :*:comment:/ alter ( :comment: ) // liaf } # if we had at least 80 characters worth of comments, then # it's worth using the decommented text, else not. 
# (this my personal judgement call) { { match [:commentbin:] /(.){80,}/ } alius { alter (:commentbin:) // } } } } # and reassemble the mucked-over text into the :m_text: var, always # with the base64's expanded, then the extacted comments # { alter (:m_text:) \ /:*:m_text: \n :*:commentbin: \n\n/ } ######################################################### # # Do we want to do any rewrites before running? # { match [:rewrites_enabled:] /yes/ # # NOTE CHANGE THIS ONE TO ISOLATE AND THE PROGRAM FAILS! isolate (:rewrites:) // isolate (:fileprefix:) // input (:rewrites:) [:*:fileprefix:rewrites.mfp] # reset matching on rewrites to start of string - if no string, no more # processing of rewrites !! match [:rewrites:] // # # { # Grab the next regex; turn the one-per-line patterns into a # regex and a replacement string. # First, do the line-spanning regexes. match (:ch: :fr1: :to:) [:rewrites:] /(.+)>-->(.*)/ # see if the "fr" regex matches anywhere match [:m_text:] // { match [:m_text:] (:place:) /:*:fr1:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ liaf } # Nope, didn't match... grab the next regex and try again, liaf } # # reset back to the start of the rewrites. # match [:rewrites:] // # # and do it again for non-line-spanners { # Go through and do it again, except this time do it for # the non-line-spanning regexes. match (:ch: :fr2: :to:) [:rewrites:] /(.+)>->(.*)/ # see if the "fr" regex matches anywhere { match [:m_text:] (:place:) /:*:fr2:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ liaf } # Nope, didn't match... grab the next regex and try again, liaf } } # done with rewrites. # all done; m_text now has the fully mutilated text. 
return /:*:m_text:/ ############################################################### # # This is Mungmail - these are the replacement routines for # formail(), to remove dependency on formail() being in every # distribution (because formail() isn't _in_ every distribution) # # # Add a new header :mungmail_add: (:new_header:) { # Grab the current headers call /:mungmail_grab_current_headers:/ alter (:current_headers:) /:*:current_headers::*:new_header:\n/ return } # # extract a header (first of them found) # :mungmail_extract: (:header_name:) { # Extract the header with the given field type, and # return that. Note that we add the colon here; don't # put it into the desired_header string. # call /:mungmail_grab_current_headers:/ { match [:current_headers:] (:: :desired_header:) \ /(?:^|\n)(:*:header_name: *: ([^\n]|\n[[:space:]])*)/ return /:*:desired_header:/ } return // } # # delete all current headers of this type # :mungmail_delete: (:new_header:) { call /:mungmail_grab_current_headers:/ { match (:new_header_type:) [:new_header:] /[[:graph:]]+/ } # # a LIAF-loop to delete any header (including continuations) that # has a type that matches the new_header_type. { match [:current_headers:] (:kill_this_line:) \ /:*:new_header_type: ([^\n]|\n[[:space:]])*\n/ alter (:kill_this_line:) // liaf } } return # delete all current headers of this type, insert ours instead. # :mungmail_unique: (:new_header:) { call /:mungmail_grab_current_headers:/ { match (:new_header_type:) [:new_header:] /[[:graph:]]+/ } call /:mungmail_delete:/ [:*:new_header:] call /:mungmail_add:/ [:*:new_header:] } return # # Helper routine to get the current mail headers of :_dw: # :mungmail_grab_current_headers: { { # Grab everything before the first \n\n match (:: :current_headers:) /(([^\n]+\n)+)\n/ # output /-A-->:*:current_headers:<---\n/ return } # if we got here, it wasn't a real message (void body, and/or no # doubled newline) but it might still have useful text anyway. 
# Is there a final newline? { match (:current_headers:) /^.*\n$/ # output /-B-->:*:current_headers:<---\n/ return } # if we got to here, then there wasn't even a final newline. # That's a violation of RFC, we'll add it. { alter (:_dw:) /:*:_dw:\n/ match (:current_headers:) /.+/ # output /-C-->:*:current_headers:<---\n/ return } fault / Couldn't manage to find the headers, though I tried really hard\n/ } return # # find header arg1, append comment arg2-n. If no # such header, create it, and add the comment. Note that # neither the header name nor the comment can contain spaces. # :mungmail_add_comment: (:ac_args:) { # parse input args to this routine match [:ac_args:] (:: :header: :comment:) /([[:graph:]]+) ([[:graph:]]+)/ { # find the header if it exists match (:found: :hd: :tail:) /^(:*:header:) (.*)/ alter (:tail:) /:*:tail: (:*:comment:)/ } alius { # no such header. make one. call /:mungmail_add:/ [:*:header: (:*:comment:)] } } return # # change_subject_line # :mungmail_mung_subject: (:new_tag:) # get the Subject: line. If none, make one. { { match (:subject_line: :subj_text:) \ /^Subject: (.*)/ } alius { match (:end_of_headers:) /\n\n/ alter (:end_of_headers:) /\nSubject: ( none supplied in original message )\n\n/ match (:subject_line: :subj_text:) /^Subject: (.*)/ } } { # # If we are re-sending this, we want to de-fang the # subject, otherwise we don't. match [:reject_address:] /[a-zA-Z0-9]/ # Paolo P. suggests this alteration to avoid subversion # by enclosing an alternate target in "marks". We always # have to do this. { match (:dq:) [:subj_text:] /\$/ alter (:dq:) /USD/ liaf } # # and translate away anything that might be a shell subterfuge translate (:subj_text:) [:subj_text:] /^-a-zA-Z0-9!., / } # # If the user asked for a spam-flagging string, put the flagging # string into the subject. # { match [:new_tag:] /./ alter (:subj_text:) \ /:*:new_tag: :*:subj_text:/ } return # # Mark a piece of mail with Reaver IDs. 
Hopefully one or the # other of these will survive your local mailer. # :mungmail_add_cache_info: (:cid:) { call /:mungmail_unique:/ [X-CRM114-CacheID: sfid-:*:cid: ] call /:mungmail_add_comment:/ [Message-Id: sfid-:*:cid:] } ############################################################### # # Reaver Cache routines # # Assumptions= the var :text_cache: contains the name of # the cache directory # # Assure that the text cache exists :reavercache_init: { match [:text_cache:] /./ { ### If the text_cache dir isn't there, create it # and it's subdirectories. # isolate (:tmp:) // syscall () (:tmp:) /ls :*:text_cache: 2>&1 / match [:tmp:] /texts/ syscall () () /mkdir :*:text_cache: / syscall () () /mkdir :*:text_cache:\/texts / syscall () () /mkdir :*:text_cache:\/prob_good / syscall () () /mkdir :*:text_cache:\/prob_spam / syscall () () /mkdir :*:text_cache:\/known_good / syscall () () /mkdir :*:text_cache:\/known_spam / syscall () () /mkdir :*:text_cache:\/empty / } } return # # Put some text into the cache; # side effect: # variable :reaver_cacheid: to the filename (no directory) # variable :long_cacheid: is set to the file alone (no directory) name # :reavercache_store: (:text:) { match [:text_cache:] /./ # Don't store it if no reavercache desired isolate (:system_time: :msg_hash:) // syscall () (:system_time:) /date +%Y%m%d_%H%M%S_%N / match [:system_time:] ( :: :cacheid: ) /([[:graph:]]+)..../ hash (:msg_hash:) /:*:text:/ isolate (:cacheid:) /:*:cacheid:/ # It's unclear if the following increases security at all. isolate (:cacheid:) /:*:cacheid:_:*:msg_hash:/ isolate (:long_cacheid:) /:*:text_cache:\/texts\/:*:cacheid:/ output [:*:long_cacheid:] /:*:text:/ } return # # And the mother of all traps... # # trap (:broken_program_message:) /.*/ { accept output /:*:_nl: Aw, crud. maillib.crm broke. Here's the error: :*:_nl:/ output /:*:broken_program_message:/ output [stderr] /:*:_nl: ERROR: maillib.crm broke. 
Here's the error\: :*:_nl:/ output [stderr] /ERROR: :*:broken_program_message:/ } exit /:*:program_fault_exit_code:/ crm114-20100106-BlameMichelson.src/GPL_License.txt0000644000000000017500000004310311321154266017623 0ustar rootwsy GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. 
And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. 
The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. 
(Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. crm114-20100106-BlameMichelson.src/alternating_example_svm.crm0000755000000000017500000000400011321154266022405 0ustar rootwsy#! /usr/bin/crm window { output /**** Alternating Example SVM Network Classifier TRAINING\n/ # load the files in isolate (:Macbeth: :Alice:) input (:Macbeth:) [ Macbeth_Act_IV.txt 0 16000] input (:Alice:) [ Alice_In_Wonderland_Chap_1_And_2.txt 0 16000] # Now loop. isolate (:loopcontrol:) // isolate (:loopcounter:) /0/ { eval (:loopcounter:) / :@: :*:loopcounter: + 1 : / # output /Top of loop at :*:loopcounter: \n/ match [:loopcontrol:] /./ { { # Grab a good chunk of Macbeth... 
match (:onep:) /(....){255}.*?\n/ [:Macbeth:] match [:onep:] /.../ learn [:onep:] < SVM unigram append> (m_test.css) learn [:onep:] < SVM unigram refute append> (a_test.css) } alius # Set done mark { alter (:loopcontrol:) /X/ } } { { # Grab a good chunk of Alice... match (:twop:) /(....){255}.*?\n/ [:Alice:] match [:twop:] /.../ learn [:twop:] < SVM unigram append> (a_test.css) learn [:twop:] < SVM unigram refute append> (m_test.css) } alius # reset to start of Macbeth file. { alter (:loopcontrol:) /X/ } } liaf } # Now run one fromstart loop on each of the files learn [:_nl:] (m_test.css ) # learn [:_nl:] (a_test.css) } output /\n**** Alternating Example SVM Network Classifier RUNNING TEST\n/ isolate (:s:) isolate (:filetxt:) // { input (:filetxt:) [ Alice_In_Wonderland_Chap_1_And_2.txt 16000 4096 ] match (:t1:) [:filetxt:] /(....){255}.*?\n\n/ { classify < SVM unigram > ( m_test.css ) (:s:)/[[:graph:]]+/ [:t1:] output / type M \n:*:s:\n/ } alius { output / type A \n:*:s:\n/ } } { isolate (:t2:) // input (:filetxt:) [ Macbeth_Act_IV.txt 16000 4096 ] match (:t2:) [:filetxt:] /(....){255}.*?\n/ { classify < SVM unigram > ( m_test.css ) (:s:) /[[:graph:]]+/ [:t2:] output / type M \n:*:s:\n/ } alius { output / type A \n:*:s:\n/ } } crm114-20100106-BlameMichelson.src/crm_util_errorhandlers.c0000644000000000017500000000174011321154266021713 0ustar rootwsy// crm_util_errorhandlers.c - Error handling routines to be used ONLY // in utilities, not in the CRM114 engine itself; these don't do // what you need for the full crm114 runtime. // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
#include #include long fatalerror ( char *str1, char *str2) { fprintf (stderr, "ERROR: %s%s \n", str1, str2); exit (-1); } long nonfatalerror ( char *str1, char *str2) { fprintf (stderr, "ERROR: %s%s \n", str1, str2); exit (-1); } long untrappableerror ( char *str1, char *str2) { fprintf (stderr, "ERROR: %s%s \n", str1, str2); exit (-1); } long fatalerror5 ( char *str1, char *str2) { fprintf (stderr, "ERROR: %s%s \n", str1, str2); exit (-1); } long nonfatalerror5 ( char *str1, char *str2) { fprintf (stderr, "ERROR: %s%s \n", str1, str2); exit (-1); } long untrappableerror5 ( char *str1, char *str2) { fprintf (stderr, "ERROR: %s%s \n", str1, str2); exit (-1); } crm114-20100106-BlameMichelson.src/statustest.crm0000755000000000017500000000141011321154266017722 0ustar rootwsy#! /usr/bin/crm # # statustest.crm - CRM114 testing outcall and status codes # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # window output /:*:_nl: CRM114 testing outcall and status codes :*:_nl:/ { isolate (:c1: :proc:) output /:*:_nl: ----- keeping a process around ---- :*:_nl:/ output /:*:_nl: preparing... :*:proc:/ syscall ( a one :*:_nl: ) ( ) (:proc:) /cat > e2.out/ output /:*:_nl: OK_IF_PID_CHANGES: one... :*:proc:/ syscall ( and a two :*:_nl: ) () (:proc:) // output /:*:_nl: OK_IF_PID_SAME_AS_ABOVE: again... :*:proc:/ syscall ( and a three :*:_nl:) () (:proc:) // output /:*:_nl: and done ...:*:proc: :*:_nl:/ } { syscall () () (:proc:) /exit 123/ output /exit value test got :*:proc:/ } crm114-20100106-BlameMichelson.src/QUICKREF.txt0000644000000000017500000021521311321154266016753 0ustar rootwsy# # QUICKREF.txt - The CRM114 Quick Reference Card # # # Copyright 2006-2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # The CRM114 Quick Reference Card. Updated 20090302 ----- THE COMMAND LINE ------------- Invoke as 'crm whatever' or use '#!/usr/bin/crm' as the first line of a script file containing the program text. 
-d N - run N cycles, then drop into debugger. If no N, debug immediately -e - no environment variables imported -E - set engine runtime exit base value -h - print help text -l N - print a listing (detail level 1 through 5) -p - generate an execution-time-spent profile on exit -P N - max program lines -q m - default mathmode (0,1 = alg/RPN in EVAL, 2,3 = alg/RPN everywhere) -s N - new feature file (.css) size is N (default 1 meg+1 featureslots) -S N - new feature file (.css) size is N rounded up to 2^I+1 featureslots -t - prettyprint source listing & give user level execution trace output -T - implementors trace output (only for the masochistic!) -u dir - chdir to directory dir before starting execution -v - print CRM114 version identification and exit. -w N - max data window (bytes, default 16 megs) -- - signals the end CRM114 flags; prior flags are not seen by the user program; subsequent args are not processed by CRM114. --foo - creates the user variable :foo: with the value SET --x=y - creates the user variable :x: with the value y -{ stmts } - execute the statements inside the {} brackets. Absent the -{ program } flag, the first arg is taken to be the name of a file containing a crm114 program, subsequent args are merely supplied as :_argN: values. Use single quotes around commandline programs '-{ like this }' to prevent the shell from doing odd things to your command-line programs. CRM114 can be directly invoked by the shell if the first line of your program file uses the shell standard, as in: #! /usr/bin/crm You can use CRM114 flags on the shell-standard invocation line, and hide them with '--' from the program itself; '--' incidentally prevents the invoking user from changing any CRM114 invocation flags. Flags should be located after any positional variables on the command line. Flags _are_ visible as :_argN: variables, so you can create your own flags for your own programs (separate CRM114 and user flags with '--'). 
Examples: ./foo.crm bar mugga < baz -t -w 150000 <--- Use this ./foo.crm -t -w 1500000 -- bar < baz mugga <--- or this ./foo.crm -t -w 150000 bar < baz mugga <--- NOT like this You can put a list of user-settable vars on the '#!/usr/bin/crm' invocation line. CRM114 will print these out when a program is invoked directly (e.g. "./myprog.crm -h", not "crm myprog.crm -h") with the -h (for help) flag. (note that this works ONLY on Linux and Darwin - FreeBSD and Solaris have a different implementations and this doesn't work. Don't use this in programs that need to be portable) Example: #!/usr/bin/crm -( var1 var2=A var2=B var2=C ) - allows only var1 and var2 be set on the command line. If a variable is not assigned a value, the user can set any value desired. If the variable is equated to a set of values, those are the _only_ values allowed. #!/usr/bin/crm -( var1 var2=foo ) -- - allows var1 to be set to any value, var2 may only be set to either "foo" or not at all, and no other variables may be set nor may invocation flags be changed (because of the trailing "--"). Since "--" also blocks '-h' for help, such programs should provide their own help facility. ----- VARIABLES ---------- Variable names and locations start with a : , end with a : , and may contain only characters that have ink (i.e. the [:graph:] class) with a few exceptions- basically, no embedded ':' characters. They are case sensitive. Examples :here: , :ThErE:, :every-where_0123+45%6789: , :this_is_a_very_very_long_var_name_that_does_not_tell_us_much: . 
Builtin variables: :_nl: - newline :_ht: - horizontal tab :_bs: - backspace :_sl: - a slash :_sc: - a semicolon :_arg0: thru :_argN: - command-line args, including _all_ flags :_argc: - how many command line arguments there were :_pos0: thru :_posN: - positional args ('-' or '--' args deleted) :_posc: - how many positional arguments there were :_pos_str: - all positional arguments concatented :_env_whatever: - environment value 'whatever' :_env_string: - all environmental arguments concatenated :_crm_version: - the version of the CRM system :_cd: - the current call depth :_cs: - the current statement number :_pgm_hash: - hash of the current program - for version verification :_pgm_text: - copy of post-processed source code - matchable :_pid: - process ID of the current process. :_ppid: - process ID of the parent of the current process. :_dw: - the current data window contents (usually the default arg) :_iso: - the current isolated data block (change at your own peril!) ---- VARIABLE EXPANSION ---- You can use the standard C char constant '\' characters, such as "\n" for newline, as well as excaped hexadecimal and octal characters like \xHH and \oOOO but these are constants, not variables, and cannot be redefined. Variables are expanded by the ':*:' var-expansion operator, e.g. :*:_nl: expands to a newline character. Uninitialized vars evaluate to their text name (and the colons stay). User variables are also expanded with the :*: operator, so :*:foo: expands to whatever value :foo: has. Variables are indirected by the :+: indirection operator; the reason for the :+: operator is that if :foo: contains the name of another variable (such as might happen in a CALL statement), then :*: would only return the name of that other variable, but :+: would return the value in that other variable. Use :+: and :*:_cd: to get proper isolation in non-tail-recursive variables, like :+:foo_:*:_cd:: to get the value of a recursively labeled foo_0, foo_1, foo_2, etc. 
Depending on the value of "math mode" (flag -q). you can also use :#:string_or_var: to get the length of a string, and :@:string_or_var: to do basic mathematics and inequality testing, either only in EVALs or for all var-expanded expressions. See "Sequence of Evaluation" below for more details. ----- PROGRAM BEHAVIOR ---- Default behavior is to read all of standard input till EOF into the default data window (named :_dw:), then execute the program (this is overridden if first executable statement is a WINDOW stmt). Variables don't get their own storage unless you ISOLATE them (see below), instead variables are start/length pairs indexing into the default data window. Thus, ALTERing an unISOLATEd variable changes the value of the default data buffer itself. This is a great power, so use it only for good, and never for evil. --- STATEMENTS AND STUFF (separate statements with a ';' or with a newline) -- \ - '\' is the string-text escape character. You only _need_ to escape the literal representation of closing delimiters inside var-expanded arguments. You can use the classic C/C++ \-escapes, such as \n, \r, \t, \a, \b, \v, \f, \0, for the ASCII-defined escape sequences, and also \xHH and \oOOO for hex and octal characters, respectively. A '\' as the _last_ character of a line means the next line is just a continuation of this one. A \-escape that isn't recognized as something special isn't an error; you may _optionally_ escape any of these delimiters: > ) ] } ; / # \ and get just that character. A '\' anywhere else is just a literal backslash, so the regex ([abc])\1 is written just that way; there is no need to double-backslash the \1 (although it will work if you do). This is because the first backslash escapes the second backslash, so only one backslash is "seen" at runtime. 
# this is a comment # and this too \# - A comment is not a piece of preprocessor sugar- it is a -statement- and ends at the newline or at "\#" insert filename - inserts the file verbatim at this line at compile time. If the file can't be INSERTed, a system-generated FAULT statement is inserted. Use a TRAP to catch this fault if you want to allow program execution to continue without the missing INSERT file. filename - the local (-u applied) file to insert [expanded_filename] - the filename is first expanded against command-line and environment variables. ; - semicolon is a statement separator - unless it's inside delimiters it must be escaped as \; or else it _will_ mark the end of the statement. { and } - start and end blocks of statements. Must always be '\' escaped or inside delimiters or these will mark the start/end of a block. noop - no-op statement :label: - define a GOTOable label :label: (:arg:) - define a CALLable label. The args in the CALL statement are concatenated and put into the freshly ISOLATEd var :arg: (:arg:) - var-expanded varname to recieve the caller's arguments (usually a MATCH is then done to put locally convenient labels on the args). accept - writes the current data window to standard output; execution continues. alius - if the last bracket-group succeeded, ALIUS skips to end of {} block (a skip, not a FAIL); if the prior group FAILed, ALIUS does nothing. Thus, ALIUS is both an ELSE clause and a CASE statement. alter (:var:) /new-val/ - surgically change value of var to newval (:var:) - var to change (var-expanded) /new-val/ - value to change to (var-expanded) call /:entrypoint_label:/ [:arg1: :arg2:... ] (:ret_arg:) do a routine call on the specified (var-expanded) entrypoint label. Note that the called routine shares all variables (including the data window :_dw:). Return is accomplished with the RETURN statement. /:entrypoint_label:/ - the location to call [:arg1: :arg2: ...] - var-expanded list of args to call. 
These are concatenated and supplied to the called routine as a single ISOLATEd var, to be used as desired (usually a MATCH parses the arglist as desired, then :*: is used for call-by-value arguments, and :+: indirection is used to retrieve call-by-name arguments). Call-by-value arguments are NOT modifiable by the callee, while call-by-name arguments are modifiable. (:ret_arg:) - this variable gets the returned value from the routine called (if it returns anything). If it had a previous value, that value is overwritten on return. classify (:c1:...|...:cN:) (:stats:) [:in:] /word-pat/ /pR_offset/ - compare the statistics of the current data window buffer with classfiles c1...cN . In general, class statistics files are NOT portable between different classifiers! - ignore case in word-pat, does not ignore case in actual text (use tr() or the TRANSLATE command to do that on :in: if you want it) - enable the microgroomer to purge less-important information automatically whenever the statistics file gets to crowded. However, this disables certain optimizations that can speed classification. - use unique features only; this improves accuracy while using less memory. Usable with Markov and OSB modes. - use single-word features only; his makes CRM114 almost exactly equivalent to most other Bayesian classifiers. Works with the OSB, Winnow and hyperspace classifiers. - use orthogonal sparse bigram (OSB) features and Markovian classification instead of Markovian SBPH features. OSB uses a subset of SBPH featuers with about 1/4 the memory and disk needs, and about 4x the speed of full Markovian, with basically the same accuracy. - use the Fidelis Confidence Factor local probability generator. This format is not compatible with the default, but with singlesided threshold training ( typically pR of 10-30 ) achieves the best accuracy yet. - use the Winnow nonstatistical classifier and the OSB frontend feature generator. 
Winnow uses .cow files, which are not compatible with the .css files for the Markovian (default) and OSB classifers. - use hyperspace matching; each learned document represents a light source in a 4-billion-dimensional hyperspace, and the set of sources that shines most brightly onto the unknown document's hyperspatial location is the matching class. - use the bit-entropy classifier. This uses compressibility of the unknown given the prior learned text as a perfect compressor model. No tokenization happens- this classifier works one bit at a time, always. - use the fast substring compression matcher. This measures the compressibility of an unknown text using the known texts as a compression dictionary. Tokenization is used as a compressibility filter (default tokenization is /./ which makes FSCM equivalent to LZ77 with an infinite window. - use the SVM classifier. This uses SVM (support vector machine) techniques. NB: for now VERY EXPERIMENTAL; OSB or unigram features (default OSB features), 2-class only, uses a single statistics file and REFUTE for negative examples. - use the PCA classifier. This classifier is two-class only, uses a single statistics file, and uses REFUTE to learn negative examples. EXPERIMENTAL. - use the String Kernel SVM. THIS FILTER IS STRONGLY DEPRECATED. Use instead. - use a three-layer neural network with stochastic back-propagation training. Use to reinitialize the network neurons to a small random state in case it gets stuck in a (rare) local minimum. Use to update weights after each document in a pass (default is to update weights only after all documents are seen in a pass). EXPERIMENTAL!!! - use the full correlative matcher. Very slow, but capable of matching stemmed words in any language and of matching binary files (:c1: ... - file or files to consider "success" files. The CLASSIFY succeeds if these files as a group match best. if not, the CLASSIFY does a FAIL. | - optional separator. Spaces on each side of the " | " are required. 
.... :cN:) - optional files to the right of " | " are considered as a group to "fail". If statement fails, execution skips to end of enclosing {..} block, which exits with a FAIL status (see ALIUS for why this is useful). (:stats:) - optional var that will be surgically changed to contain a formatted matching summary. In some versions, must pre-exist. [:in:] - restrict statistical measure to the string inside :in: [:in: n m] - take a substring of :in:, starting at n and including m characters [:in: /regex/] - take a substring of :in: that matches the regex /word-pat/ - regex to describe what a parseable word is. Default is /[[:graph:]]+/ /pR_offset/ - OSBF: change the classify threshold; with this optional parameter the success/failure decision point can be changed from the default 0 to what you specify. If given, the pR in 'stats' will be printed in the form pR/pR_offset. /svm-specific controls/ - a vector of seven parameters for SVM-classifiers clump [:text:] (clumpfile) (stat) /regex/ /params/ - does incremental parametric clustering of documents to generate document groups. No pre-judged corpus is required. [:text:] - input text; var-restriction allowed (clumpfile) - name of file to hold the clumps (all docs go into the same clumpfile) (status) - Status output, for the result of the clump. Clumping the null input text will give a status dump of all the documents in the entire clumpfile. - special control flags; unigram, unique, and refute are supported, with the same meanings as in LEARN and CLASSIFY. Default clustering is by document-to-document nearest-neighbor hyperspatial distance. If you add the bychunk flag, then the distance is to the cluster's centroid. /regex/ - optional tokenization regex; default is /[[:graph:]]+/ /params/ - control parameters: "tag=somename" label to later refer to this document. "clump=somename" forces a name onto a cluster. 
"n_clusters=N" says how many doc clusters you want; if N=0 then it will simply store the document and wait for more (much faster computationally). If N < 0 the number of clusters is determined automatically. debug - drop immediately into the interactive debugger. eval (:result:) /instring/ - repeatedly evaluates /instring/ until it ceases to change, then surgically places that result as the value of :result: . EVAL uses smart (but foolable) heuristics to avoid infinite loops, like evaluating a string that evaluates to a request to evaluate itself again. The error rate is about 1 / 2^62 and (in the default configuration) will detect looping chain groups of length 4096 or less. If the instring uses math evaluation (see section below on math operations) and the evaluation has an inequality test, (>, >=, <, <=, =, or !=) then if the test fails, the EVAL will FAIL to the end of block. Math is IEEE-compliant, so unreasonable things like divide-by-zero may yield NaN (Not A Number) or +/- INF exit /:exitcode:/ - ends program execution. If supplied, the return value is converted to an integer and returned as the exit code of the crm114 program. /:exitcode:/ - variable to be converted to an integer and returned. If no exit code is supplied, the exit code value is 0. fail - skips down to end of the current { } block and causes that block to exit with a FAIL status (see ALIUS for why this is useful) fault /faultstr/ - forces a FAULT with the given string as the reason. /faultstr/ - the val-expanded fault reason string goto /:label:/ - unconditional branch (you can use a variable as the goal, e.g. /:*:there:/ ) hash (:result:) /input/ - compute a fast 32-bit hash of the /input/, and ALTER :result: to the hexadecimal hash value. HASH is _not_ warranted to be constant across major releases of CRM114, nor across changes of platform (32 v 64 bit), nor is it cryptographically secure. (:result:) - value that gets result. 
/input/ - string to be hashed (can contain expanded :vars: , defaults to the data window :_dw: ) intersect (:out:) [:var1: :var2: ...] - makes :out: contain the part of the data window that is the intersection of :var1 :var2: ... ISOLATEd vars are ignored. This only resets the value of the captured :out: variable, and does NOT alter any text in the data window. isolate (:var:) [/initial-value]/ - puts :var: into a data area outside of the default data window buffer; subsequent changes to this var don't change the data buffer (though they may change the value of any var subsequently set inside of this var). If the var already was ISOLATED, this is will stay isolated but it will surgically alter the value if a /value/ is given. - only create and set var if it didn't exist before (ideal for setting defaults) (:var:) - name of ISOLATEd var (var-expanded) [/initial-value]/ - optional initial value for :var: (var-expanded); use either [] or // to enclose the value. If no value is supplied, the previous value is retained/copied. If there is no previous value and no value is supplied, a NULL string is used. input (:result:) [:filename:] - read in the content of filename if no filename, then read stdin - read one line only (:result:) - var that gets the input value (surgical overwrite). [:filename:] - the file to read. The first blank-delimited word is taken and var-expanded; the result is the filename, even if it includes embedded spaces. Default is to read stdin. [:filename: offset len] - optionally, move to offset in the file, and read len bytes. Offset and len are individually blank-delimited, and var-expanded with mathematics enabled. If len is unspecified, the read extends to EOF or buffer limit. learn (:class:) [:in:] /word-pat/ - learn the statistics of the :in: var (or the input window if no var) as an example of class :class: - flag this is as an anti-example of this class- unlearn it! 
- ignore case in word-pat, does not ignore case in actual text (use tr() or the TRANSLATE command to do that on :in: if you want it) - enable the microgroomer to purge less-important information automatically whenever the statistics file gets to crowded. However, this disables other optimizations that can speed up - use orthogonal sparse bigram (OSB) features and Markovian classification instead of Markovian SBPH features. OSB uses a subset of SBPH featuers with about 1/4 the memory and disk needs, and about 4x the speed of full Markovian, - use the Fidelis Confidence Factor local probability generator. This format is not compatible with the default, but with singlesided threshold training ( typically pR of 10-30 ) achieves the best accuracy yet. - use the Winnow nonstatistical classifier and the OSB frontend feature generator. Winnow uses .cow files, which are not compatible with the .css files for the Markovian (default) and OSB classifers. Remember that for Winnow to be at its best in accuracy, it has to be trained both with positive cases that failed to make a minimum threshold (typically with a per-file (not overall) match quality that was below a pR of .2 or more) as well as for "negative reinforcement" training for any "not in class" per-file match qualities that weren't at a pR of -.2 or less.) - use hyperspace matching; each learned document represents a light source in a 4-billion-dimensional hyperspace, and the set of sources that shines most brightly onto the unknown document's hyperspatial location is the matching class. Slightly experimental. - use single-word features only; using this this makes CRM114 almost exactly equivalent to most other Bayesian classifiers. Also works with the Winnow and hyperspace classifiers. - use the bit-entropy classifier. This uses compressibility of the unknown given the prior learned text as a perfect compressor model. No tokenization happens- this classifier works one bit at a time. 
The tokenizer regex is ignored; the second // argument can hold an optional "fuzz factor" for how close an approximation is allowed. - use the full correlative matcher. Very slow, but capable of matching stemmed words in any language and of matching binary files. Correlative matching does not tokenize, and so you don't need to supply it with a word-pat. (:class:) - name of file holding hashed results; nominal file extension is .css [:in:] - captured var containing the text to be learned (if omitted, the full contents of the data window is used) [:in: n m] - take a substring of :in:, starting at n and including m characters [:in: /regex/] - take a substring of :in: that matches the regex /word-pat/ - regex that defines a "word". Things that aren't "words" are ignored. Default is /[[:graph:]]+/. Ignored in correllation and bit-entropy. /entropy_fuzz/ Bit-entropy: this number is the "fuzz" factor in determining when to loop back the compression algorithm Markov chain versus allocating new nodes. You must specify an empty word-pat to use entropy fuzz. /svm-specific controls/ - a vector of seven parameters for SVM-classifiers liaf - skips UP to START of the current {} block (LIAF is FAIL spelled backwards) match (:var1: ...) [:in:] /regex/ - Attempt to match the given regex; if match succeds, variables are bound; if match fails, program skips to the closing '}' of this block - statement succeeds if match not present - ignore case when matching - No special characters in regex (only supported with newer TRE based versions) - start match at start of the [:in:] var - start match at start of previous successful match on the [:in:] var - start match at one character past the start of the previous successful match on the [:in:] var - start match at one character past the end of prev. match on this [:in:] var - require match to end after end of prev. match on this [:in:] var - search backward in the [:in:] variable from the last successful match. 
- execute the search in blocks of one line of text each, so the result will never span a line. This means that ^ and $ will match at the beginning and end of each line, rather than the beginning and end of the full text. (:var1: ...) - optional result vars. The first var gets the text matched by the full regex. The second, third, etc. vars get each subsequent parenthesized subexpression, in left-to-right order of the subexpression's left parenthesis. These are "captures", not ALTERs, so text overlapping prior :var: values is left unchanged. [:in:] - search only in the variable specified; if omitted, :_dw: (the full input data window) is used [:in: :start: :len:] - search in the :in: input var, limiting the area searched to :start: to :len: (zero-origin counted) [:in: /inregex/ ] - search in the :in: input var, limiting the searched area to whatever matches the inregex (this doesn't use or affect previous successful match values) /regex/ - POSIX regex (with \ escapes as needed) output [filename] /output-text/ - output an arbitrary string with captured values expanded. - append to the file (otherwise, the previous contents of the file is lost). [:filename:] - the file to write. The first blank-delimited word is taken and var-expanded; the result is the filename, even if it includes embedded spaces. Default output is to stdout. stderr is recognized. [:filename: offset len] - optionally, move to offset in the file, and maximum write len bytes. Offset and len are individually blank-delimited, and var-expanded with mathematics enabled. If len is unspecified, the write is the length of the expansion of /output-text/ /output-text/ - string to output (var-expanded) pmulc (clumpfile) [:text:] /regex/ - use the clumpfile as a lookup to translate documents to their appropriate clusters. The text does not get added into the clumpfile. [:text:] - input text; var-restriction allowed. 
(clumpfile) - name of file to holding the clumps /regex/ - optional tokenization regex; default is /[[:graph:]]+/ - The optional flags are bychunk, unique, and unigram, with the same functions as under clump. return /returnval/ - return from a CALL. Note that since CALL executes in shared space with the caller, all changes made in the CALLed routine are shared with the caller. /returnval/ - this (var-expanded) value is returned to the caller (or if the caller doesn't accept return values, it's discarded). syscall (:in:) (:out:) (:status:) [/command_or_label]/ - execute a shell command or fork to the specified label. This happens in a fresh copy of the environment; there is no communication with the main program except via the :in:, :out:, and :status: vars. Output over the buffer length is discarded unless you the process around for multiple readings. - don't send an EOF after feeding the full input (this will usually keep the syscalled process around). Later syscalls with the same :status: var will continue feeding to and reading from the kept proc. - don't wait for process to output an EOF; just grab what's available in the process's output pipe and proceed (default limit per syscall is 256 Kb). The process then runs to completion independently and asynchronously. (This is "fire and forget" mode, and is mutually exclusive with . ) (:in:) - var-expanded string to feed to command as input (can be null if you don't want to send the process something.) You _MUST_ specify this if you want to specify an :out: variable. (:out:) - var-expanded varname to place results into (MUST pre-exist, can be null if you don't want to read the process's output (yet, or at all). Limit per syscall is 256 Kbytes. You _MUST_ specify this if you want to use the :status: variable). This is a surgical alter. (:status:) - if you want to keep a minion proc around, or catch the exit status of the process, specify a varname here. The minion process's PID and pipes will be stored here. 
The program can access the proc again with another syscall by using this var again. When the process exits, its exit code will be surgically stored here (unless you specified ) [/command_or_label]/ - the command or entrypoint you want to run. This arg is var-expanded; if the first word is a :label:, the fork begins execution at the label. If the first word is not a :label:, then the entire string is handed off to the shell to be executed as a shell command. It can be enclosed in either [] or //. translate (:dest:) [:src] /from_charset/ /to_charset/ - do a tr()-like translation of 8-bit characters in the from_charset to the corresponding characters in the to_charset. - repeated sequential copies of the same char in from_charset are replaced by a single copy, then translated. - from_charset and to_charset are literal, no var-expansion, ranging, or inversion performed. [:src:] - source of data. Can be var-restricted. Default is the default data window :_dw: (:dest:) - destination to put result. defaults to the default data window :_dw: /from_charset/ - var-expanded charset of characters to be translated from. Use hyphens for ranges like a-e meaning abcde . Reversed ranges such as e-a meaning edcba work. (this is different than tr() !) Set inversion as in ^a-z mean all characters that aren't lower case characters works. Character duplication is not an error. To use - as a literal character, make it the first or last character. To use ^ as a literal character, make it any but the first character. ASCII \-escapes like \n and \xFF work. /to_charset/ - charset of characters to be translated to. Same rules as from_charset; excess characters are ignored; if not enough characters are available, start over using the to_charset characters from the beginning (this is different than tr().) If to_charset is not given, then all chars in from_charset are deleted. 
trap (:reason:) /trap_regex/ - traps faults from both FAULT statements and program errors occurring anywhere in the preceding bracket-block. If no fault exists, TRAP does a SKIP to end of block. If there is a fault and the fault reason string matches the trap_regex, the fault is trapped, and execution continues with the line after the TRAP, otherwise the fault is passed up to the next surrounding trapped bracket block. (:reason:) - the fault message that caused this FAULT. If it was a user fault, this is the text the user supplied in the FAULT statement. This variable is allocated as an ISOLATED variable. /trap_regex/ - the regex that determines what kind of faults this TRAP will accept. Putting a wildcard here (e.g. /.*/ means that ALL trappable faults will be trapped. union (:out:) [:var1: :var2: ...] - makes :out: contain the union of the data window segments that contains var1, var2... plus any intervening text as well. Any ISOLATEd var is ignored. This is non-surgical, and does not alter the data window window (:w-var:) (:s-var:) /cut-regex/ /add-regex/ - window slider. This deletes to and including the cut-regex from :var: (default: use the data window), then reads adds from std. input till we find add-regex (inclusive). - ignore case when matching cut- and add- regexes - (default) read one char at a time and check input for add-regex every character, so never reads "too much" from stdin.. - reads as much data as available, then checks with the regex. ( unused characters are kept around for later) - wait for EOF to check add-regex (unused characters are kept around for later) - accept an EOF as being a successful regex match ( default is only a successful add-regex matches. CAUTION: can cause rapid looping!) - keep reading past an EOF; reset the stream and wait again for more input. (default is to FAIL on EOF. CAUTION: this can cause rapid looping!) 
(:w-var:) - what var to window (:s-var:) - what var to use for source (defaults to stdin, if you use a source var you _must_ specify the windowed var.) /cut-regex/ - var-expanded cut pattern. Everything up to and including this is deleted. /add-regex/ - var-expanded add pattern, if absent reads till EOF. This pattern is a minimal match pattern, so if the pattern can match a zero-length string ( say, /.*/ ), this can yield zero characters added. Use a pattern like /.+/ to prevent this. ***** If both cut-regex and add-regex are omitted, and this window statement is an executable no-op... EXCEPT that if it's the _first_ _executable_ statement in the program, then the WINDOW statement configures CRM114 to _not_ wait to read a anything from standard input input before starting program execution. ------------ A Quick Regex Intro --------- A regex is a pattern match. Do a "man 7 regex" for details. Matches are, by default "first starting point that matches, then longest match possible that can fit". a through z A through Z - all match themselves 0 thorugh 9 most punctuation - matches itself, but check below! . - the 'period' char, matches any character * - repeat preceding 0 or more times + - repeat preceding 1 or more times ? - repeat preceding 0 or 1 time [abcde] any one of the letters a, b, c, d, or e [a-q] the letters a through q (just one of them) [a-eh-mqzt] the letters a through e, plus h through m, plus q, z, and t [^xyz] any one letter EXCEPT one of x, y, or z [^a-e] any one letter EXCEPT one of a through e {n} repetition count: match the preceding exactly n times {n,} repetition count: match the preceding at least n times {n,m} repetition count: match the preceding at least n and no more than m times (sadly, POSIX restricts this to a maximum of 255 repeats. Nested repeats like (.{255}){10} will work, but are very very slow). 
[[:<:]] no longer supported, GNU regex only \< matches at the start of a word, TRE regex only [[:>:]] no longer supported, GNU regex only \> matches at the end of a word, TRE regex only ^ As the first character in a match, it matches only at the start of a block; this usually means start of the input variable. If you use then each line is its own block and so ^ means "start of line". ^ As the last character in a match, it matches only at the end of a block; this usually means the end of the input variable. If you use then each line is its own block and so $ means "end of line". . (a period) matches any _single_ character (except start-of-line or end of line "virtual characters", but it does match a newline). (parenthesis) - the () themselves go away, the item inside the parenthesis is treated as a group (so you can (foo)* to get repeated foos), and whatever string matched inside is captured as a submatch for any match variables. Use \( and \) to match actual parenthesis. a|b match a _or_ b, such as foo|bar which will match "foo" or "bar" (multiple characters!). To get a shorter extent of ORing, use parenthesis, e.g. /f(oo|ba)r/ matches "foor" or "fbar", but not foo or bar. The following are other POSIX expressions, which mostly do what you'd guess they'd do from their names. [[:alnum:]] <-- a-z, A-Z and 0-9 [[:alpha:]] <-- a-z and A-Z [[:blank:]] <-- space and tab only [[:space:]] <-- whitespace: moves printhead, doesn't put ink / light pixel [[:cntrl:]] <-- control characters [[:digit:]] <-- 0-9 [[:lower:]] <-- lower-case letters a-z [[:upper:]] <-- upper-case letters A-Z [[:graph:]] <-- any character that puts ink on paper or lights a pixel [[:print:]] <-- any character that moves the "print head" or cursor. [[:punct:]] <-- punctuation characters [[:xdigit:]] <-- hex digits 0-9, a-f and A-F ----- The following are only available with the TRE-based versions ----- *?, +?, ??, {n,m}? 
- repeat the preceding expression 0-or-more, 1-or-more, 0-or-1, or n-to-m times, but _shortest_ match that fits, given the already-selected start point of the regex. This is an "anti-greedy" match, unlike the normal match that wants to have the longest possible resultiing match \N - where N is 1 through 9 - matches the N'th parenthesized previous subexpression. You don't have to backslash-escape the backslash (e.g. write this as \2 or as \\2, either will work) \Q - start verbatim quoting - all following characters represent exactly themselves; no repcounts or wildcards apply. This is _only_ terminated by a \E or the end of the regex. \E - end of verbatim quoting. \< - start of a word (doesn't use up a character) \> - end of a word (doesn't use up a character) \d - a digit \D - not a digit \s - a space \S - not a space \w - a word char ( a-z, A-Z, 0-9, or _ ) \W - not a word char (?:some-regex) - parenthesize a subexpression, but _don't_ capture a submatch for it. (?inr-inr:regex) - Let you turn on or off case independence, nomultiline, and right-associative (rather than the default left-associative) matching. These nest as well. i - case independent matching. examples: /(?i:abc)/ matches 'abc', 'AbC', 'ABC', etc... /(?i:ABC(?-i:de)FGH)/ matches ABCdeFGH, abcdefgh, but not ABCdEFGH or ABCDEFGH n - don't match newlines with wildcards such as .* or with anti-wildcards like [^j-z]. "-n" _allows_ matching of newlines (this is slightly counterintuitive). eg: /(?n:a.*z)/ matches 'abcxyz' but not 'abc xyz' /(?-n:a.*z)/ matches both (this does NOT override the flag; essentially "blocks" the searched text at newlines, and searches within those blocks only) r - right-associate matching. This changes only sub-matches, never whether the match itself succeeds or fails. (I haven't come up with a good example for this; any suggestions?) 
-------------- Notes on Sequence of Evaluation ------------- By default, CRM114 supports string length and mathematical evaluation only in an EVAL statement, although it can be set to allow these in any place where a var-expanded variable is allowed (see the -q flag). The default value ( zero ) allows stringlength and math evaluation only in EVAL statements, and uses non-precedence (that is, strict left-to-right unless parentheses are used) algebraic notation. -q 1 uses RPN instead of algebraic, again allowing stringlength and math evaluation only in EVAL expressions. Modes 2 and 3 allow stringlength and math evaluation in _any_ var-expanded expression, with non-precedence algebraic notation and RPN notation respectively. You can override whether to use Algebraic or RPN precedence of any math evaluation by using an A or an R as the first character of the math evaluation string. Evaluation is always left-to-right; there is no precedence of operators beyond the sequential passes noted below. The evaluation is done in five sequential passes: 1) \-constants like \n, \o377 and \x3F are substituted in. You must use three digits for octal and two digits for hex. To write something that will literally appear as one of these constants, escape the backslash with another backslash, i.e. to output '\o075' use '\\o075'. 2) :*:var: variables are substituted (note the difference between a constant like '\n' and a variable like ":*:_nl:" here - constants are substituted first, then variables are substituted.). If there is no such variable, then the 'variable name' is its own result, so :*:I_am_not_defined: yields "I_am_not_defined". 3) :+:var: indirection variables are substituted. This is equivalent to taking :*: twice immediately ( note that :*::*:foo: does not do this!) Note that if a regular variable is indirected, the result is unchanged (just as if a non-variable is :*: substituted; the result is the input) 4) :#:var: string-length operations are performed. 
(you don't have to expand a :var: first, you can take the string length directly, as in :#:_dw: to get the length of the default data window. Thus, you can take the length of a string that contains a :, which would normally "end" the :#: operator ). 5) :@:expression: mathematical expressions are performed; syntax is either RPN or non-precedenced (parens required) algebraic notation. Embedded non-evaluated strings in a mathematical expression is currently a no-no. If the first character of the math string is an A or an R, it forces Algebraic or RPN evaluation; otherwise the -q value determines which evaluator to use. Allowed operators are: + - * / % ^ v > < = >= <= != e E f F g G x X only. The '^' operator is exponentiation; A ^ B is A raised to the B power. The 'v' operator is any-base log; A v B is the log of B in logbase A ; note that the logbase is _required_ and there is no default. Only >, >=, <, <=, = and != set logical results; they also evaluate to 1 and 0 for continued chain operations - e.g. ((:*:a: > 3) + (:*:b: > 5) + (:*:c: > 9) > 2) is true IFF any of the following is true a > 3 and b > 5 a > 3 and c > 9 b > 5 and c > 9 Formatting operators: e E f F g G x X - the left side value is unchanged, but the right side value is used as a formatting precision value (note that x and X do not change precision), (i.e. the speed of light expressed in E 7.2 precision such as by 299792458 E 7.2 is 3.00E+08) The operators e, E, f, F, g, G, x, and X operators have the same meaning as in C. (beware a precision after the decimal of 10 though; and note that an x or X format is limited to 32 bits.) -------------- Notes on Approximate REGEX matching --------- The TRE regex engine (which is the default engine) supports approximate matching. The previously used GNU engine did not. Approximate matching is specified similarly to a "repetition count" in a regular regex, using brackets. 
This approximation applies to the previous parenthesized expression (again, just like repetion counts). You can specify maximum total changes, and how many inserts, deletes, and substitutions you wish to allow. The minimum-error match is found and reported, if it exists within the bounds you state. The basic syntax is: (text-to-match){~[maxerrs] [#maxsubsts] [+maxinserts] [-maxdeletes]} Note that the '~' (tilde, with an optional maxerr count) is _required_ (that's how we know it's an approximate regex rather than just a rep-count); if you don't specify a max error count, you will get the best match, if you do, the match will have at most that many errors. Remember that you specify the changes to the text in the _pattern_ necessary to make it match the text in the string being searched. You cannot use approximate regexes and backrefs (like \1) in the same regex. This is a limitation of in TRE at this point. You can also use an inequality in addition to the basic syntax above: (text-to-match){~[maxerrs] [basic-syntax] [nI + mD + oS < K] } where n, m, and o are the costs per insertion, deletion, and substitution respectively, 'I', 'D', and 'S' are indicators to tell which cost goes with which kind of error, and K is the total cost of the errors; the cost of the errors is always strictly less than K. Here are some examples. (foobar) - exactly matches "foobar" (foobar){~} - finds the closest match to "foobar", with the minimum number of inserts, deletes, and substitutions. This match always succeeds, as six substitutions or additions is always enough to turn any string into one that contains 'foobar'. (foobar){~3} - finds the closest match to "foobar", with no more than 3 inserts, deletes, or substitutions (foobar){~2 +2 -1 #1) - find the closest match to "foobar", with at most two errors total, and at most two inserts, one delete, and one substitution. 
(foobar){~4 #1 1i + 2d < 5 } - find the closest match to "foobar", with at most four errors total, at most one substitution, and with the number of insertions plus 2x the number of deletions less than 5. (foo){~1}(bar){~1} - find the closest match to "foobar", with at most one error in the "foo" and one error in the "bar". ------------ Notes on Classifier Choices ------- CRM114 allows the user a whole gamut of different classification algorithms, and various tunings on classifications. The default classifier is a Markovian classifier that attempts to model the language as a Markov Random Field with site size of 5 (in plainspeak, it looks at each word in the context of a window 5 words long; words within that window are considered "directly related" and are used to generate local probabilities. Words outside that 5-word window are not considered in relation to each word, but get considered when the window slides over to them). The Markovian classifier is quite fast; more than fast enough for a single user or even a small office. Filtering speed varies- with no optimization and overflow safeguarding (that is, with enabled) filtering speed is usually in excess of what a fractional T1 line can downlink. The Markovian filter can be sped up considerably by turning off overflow safeguarding by not using ; this optimization speeds up learning significantly, but it means that learning is unsafe. System operators must instead manually monitor the fullness of the .css files and either manually groom them or expand them as required (or a script must be used to automate this maintenance, which can be done "in flight"). [ This classifier is the original CRM114 classifier and should be considered deprecated for new work, although it is still supported. The recommended classifier right now for production work is OSB or OSBF. ] The next generation filter (and one of the two recommended for new production work) is the OSB filter, based on orthogonal sparse bigrams. 
OSB is natively about 4x faster than full Markovian, but loses some of this advantage if overflow safegaurding (no ) is used. OSB is almost as accurate as Markovian if disk space is unlimited, and more accurate than Markovian if disk space is limited. OSB is the recommended default for new users because it works very well across a broad range of inputs. OSB uses .css files as well, but (because of a coding error that was released into the wild and unnoticed until most people were already using it in the incompatible form) OSB is, by default, incompatible with Markov .css files; there is a compile-time switch to make it compatible if you want. Another related classifier is the OSBF (OSB with Fidelis mods such as the ECCF dynamic weighting) filter. The good news is that OSBF can sometimes be even more accurate than OSB or Winnow, by using an exponential weighting to determine local probabilities, giving a filter is that it works very, Very, VERY well. It's incompatible with any of the other filters (uses .cfc files). It's also a good choice for new production work. Another filter with excellent statistics is the Winnow filter. Winnow is a non-statistical method that uses the OSB front end feature generator. Winnow is different than the statistical filters, in that it absolutely requires both positive training and negative training to work, but then it works _very_ well. With Winnow, you don't just train errors into the correct class (i.e. in emulation of an SVM). Instead, you set a "thick threshold" (usually about +/- 0.2 in the pR scale), and any positive class that doesn't get a per-correct-file score of at least 0.2 pR gets trained as a positive example. Symmetrically, any negative class and negative example that doesn't get below -0.2 of pR needs to be trained as a negative example (that is, using the flags .) This means that with Winnow, on an error you train one or both files. 
Even if the classifier gives the correct overall result, if the per-file pR values are inside the -0.2 <= per_file_pR <= 0.2 thick-threshold, you may have to train one or both files as well. (these per-file pR values are in the statistics output variable). The slowest classifier is the correlative filter. This filter is based on a full NxM correlation of the unknown text against each of the known text corpora. It's very slow (perhaps 100x slower than Markovian) but is capable of classifying texts containing stemmed words, of texts composed of binary files, and texts that cannot be reasonably "tokenized". The filter should be considered perpetually an experimental feature, and it is not as well characterized as the Markovian or OSB filters. The correlative filter is not recommended for general production work. A nonlinear filter is the Hyperspace filter; this uses a variation on the K-Nearest-Neighbor method. It's usually not quite as accurate as OSB, but it can filter againts very high levels of intentional obfuscation. Hyperspace uses a different (and self-growing) file format. Hyperspace usually trains best with a small thick-threshold training, similar to Winnow; as of 20061101 the factors have been renormalized so that Hyperspace values within +/- 0.1 pR units give a good thick-threshold for training. The bit-entropy filter is a different *kind* of filter; instead of using tokens, it constructs an optimized compression system out of the known texts at the bit by bit level (yes, bits! This filter doesn't even care about byte alignment). It then tries to predict the unknown text as much as possible, using the known texts as prior probabilities. Better prediction implies closer match. The amazing thing about this is that it works at all- and it actually works very well. Because there's no tokenizer, the entropy filter can work against languages that don't use spaces to delimit words, such as some Asian languages. It works quite well against spam. 
Because this filter works at the bit-at-a-time level, the tokenizing regex is ignored. This filter is still experimental and noncompatible upgrades may occur - keep your training data if you use this filter! The FSCM filter is a realization of an infinite-window LZ-77 classifier. The known-example texts are used as a position+runlength dictionary and used to compress the unknown. The better the compression, the better the match (unlike plain LZ77, there is an additional bonus for very long run matches). By default, the quantum of match in FSCM is one byte, and the "minimum match length" is three bytes, but you can change this with the tokenizing regex. Note that a minumum match length much smaller than three bytes will cause this filter to run _very_ slowly as the number of possible dictionary checks needed grows very quickly. The SVM support vector machine (as of 200909) is one of the "new breed" of CRM114 filter - it uses only ONE statistics file rather than two or more. Like Winnow, the SVM needs both positive and negative examples; the SVM tries to find the maximum error margin decision weighting between the two classes, as a linear weighting. Positive examples are trained as usual; negative examples are trained with the REFUTE flag. Erasures are done with the ERASE flag. By default, the SVM does a complete retraining using Thorsten Joachim's public algorithm; if you are going to train a large set of examples (very fast, but still takes linear time), use the APPEND flag to prevent training on individual cases. Training happens only when something is LEARNed but APPEND is not set (it's OK to train a NULL text to trigger the solver). The solver will take about 30 seconds per megabyte of example text. This algorithm is not capable of handling more than two simultaneous classes. The SVM is one of the few classifiers that works well in a TEFT (Train Every Thing) training environment. 
The PCA principal component analysis classifier is like the SVM classifier - it uses only one statistics file, and requires both positive and negative examples, with negative examples set with REFUTE and removals set with ERASE flag. The PCA classifier uses the Von Mises (1929) algorithm to solve for the first eigenvector - that is, the linear weighting vector that best describes the variation across all of the known inputs, with a heavy weight placed on known class membership. The dividing threshold is then optimally placed. The PCA algorithm is much faster than the SVM algorithm, but not quite as accurate. The PCA works equally well in TEFT and TOE (train on error) situations. ------------ Overall Language Notes ------------ Here's how to remember what goes where in the CRM114 language. Unlike most computer languages, CRM114 uses inflection (or declension) rather than position to describe what role each part of a statement plays. The declensions are marked by the delimiters- the /, ( and ), < and >, and [ and ]. By and large, you can mix up the arguments to each kind of statement without changing their meaning. Only the ACTION needs to be first. Other parts of the statement can occur in any order, save that multiple (paren_args) and /pattern_args/ must stay in their nominal order but can go anywhere in the statement. They do not need to be consecutive. The parts of a CRM114 statement are: ACTION - the verb. This is at the start of the statement. /pattern/ - the overall pattern the verb should use, analogous to the "subject" of the statement. - modifies how the ACTION does the work. You'd call these "adverbs" in human languages. (vars) - what variables to use as adjuncts in the action (what would be called the "direct objects"). These can get changed when the action happens. [limited-to] - where the action is allowed to take place (think of it as the "indirect object"). Generally these are not directly changed by the action. 
These may contain "adjectival phrases" - var restrictions, either by subscript or by regex or both. crm114-20100106-BlameMichelson.src/crm_expr_classify.c0000644000000000017500000001227311321154266020662 0ustar rootwsy// crm_expr_classify.c - learn and classify functions for different schema
// Copyright 2001-2009 William S. Yerazunis.
// This file is under GPLv3, as described in COPYING.

//  include some standard files
#include "crm114_sysincludes.h"
//  include any local crm114 configuration file
#include "crm114_config.h"
//  include the crm114 data structures file
#include "crm114_structs.h"
//  include the routine declarations file
#include "crm114.h"
//  OSBF declarations
#include "crm114_osbf.h"

//    Dispatch a LEARN statement
//
//    Decodes the classifier-selection bits in apb->sflags and forwards
//    the LEARN to the matching classifier's learn routine.  If no
//    classifier bit is set, Markovian is used (and CRM_MARKOVIAN is
//    written back into apb->sflags).  Returns the classifier's status.
int crm_expr_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb)
{
  char *txt;                        //  text region to learn (set below)
  long start;                       //  offset of the restricted region
  long len;                         //  length of the restricted region
  int retval;                       //  status from the chosen classifier
  long long classifier_flags = 0;   //  which-classifier bits from sflags

  //  Resolve the [ ... ] box restriction into txt/start/len; a nonzero
  //  return means the restriction failed, so learn nothing.
  if (crm_exec_box_restriction(csl, apb, &txt, &start, &len) != 0)
    return 0;
  //      get our flags... the only ones we're interested in here
  //      are the ones that specify _which_ algorithm to use.
  classifier_flags = apb->sflags;

  //     Joe thinks that this should be a table or a loop.
  classifier_flags = classifier_flags &
    ( CRM_OSB_BAYES | CRM_CORRELATE | CRM_OSB_WINNOW | CRM_OSBF
      | CRM_HYPERSPACE | CRM_ENTROPY | CRM_SVM | CRM_SKS | CRM_FSCM
      | CRM_NEURAL_NET | CRM_PCA);
  if (classifier_flags & CRM_OSB_BAYES)
    {
      retval = crm_expr_osb_bayes_learn (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_CORRELATE)
    {
      retval = crm_expr_correlate_learn (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_OSB_WINNOW)
    {
      retval = crm_expr_osb_winnow_learn (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_OSBF )
    {
      retval = crm_expr_osbf_bayes_learn (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_HYPERSPACE)
    {
      retval = crm_expr_osb_hyperspace_learn(csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_ENTROPY)
    {
      retval = crm_expr_bit_entropy_learn(csl, apb, txt, start, len);
    }
  else
  //  The experimental classifiers are compiled out in
  //  PRODUCTION_CLASSIFIERS_ONLY builds; the trailing "else" above then
  //  binds directly to the Markov default block below.
#ifndef PRODUCTION_CLASSIFIERS_ONLY
  if (classifier_flags & CRM_SVM)
    {
      //retval = crm_expr_svm_learn(csl, apb, txt, start, len);
      retval = crm_svm_learn(csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_SKS)
    {
      retval = crm_expr_sks_learn(csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_FSCM)
    {
      retval = crm_fast_substring_learn(csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_NEURAL_NET)
    {
      retval = crm_neural_net_learn (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_PCA)
    {
      retval = crm_pca_learn(csl, apb, txt, start, len);
    }
  else
#endif	// !PRODUCTION_CLASSIFIERS_ONLY
    {
      //    Default with no classifier specified is Markov
      apb->sflags = apb->sflags | CRM_MARKOVIAN;
      retval = crm_expr_markov_learn (csl, apb, txt, start, len);
    };
  return (retval);
}

//   Dispatch a CLASSIFY statement
//
//   Same flag-decoding dispatch as crm_expr_learn, but forwarding to
//   each classifier's classify routine.
int crm_expr_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb)
{
  char *txt;                        //  text region to classify (set below)
  long start;                       //  offset of the restricted region
  long len;                         //  length of the restricted region
  long retval;                      //  classifier status; NOTE(review):
                                    //  declared long here but int in
                                    //  crm_expr_learn — confirm intended
  long long classifier_flags = 0;   //  which-classifier bits from sflags

  //  Resolve the [ ... ] box restriction into txt/start/len; a nonzero
  //  return means the restriction failed, so classify nothing.
  if (crm_exec_box_restriction(csl, apb, &txt, &start, &len) != 0)
    return 0;
  //      get our flags... the only ones we're interested in here
  //      are the ones that specify _which_ algorithm to use.
  classifier_flags = apb->sflags;
  classifier_flags = classifier_flags &
    ( CRM_OSB_BAYES | CRM_CORRELATE | CRM_OSB_WINNOW | CRM_OSBF
      | CRM_HYPERSPACE | CRM_ENTROPY | CRM_SVM | CRM_SKS | CRM_FSCM
      | CRM_NEURAL_NET | CRM_PCA);
  if (classifier_flags & CRM_OSB_BAYES)
    {
      retval = crm_expr_osb_bayes_classify (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_CORRELATE)
    {
      retval = crm_expr_correlate_classify (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_OSB_WINNOW)
    {
      retval = crm_expr_osb_winnow_classify (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_OSBF )
    {
      retval = crm_expr_osbf_bayes_classify (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_HYPERSPACE)
    {
      retval = crm_expr_osb_hyperspace_classify (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_ENTROPY)
    {
      retval = crm_expr_bit_entropy_classify (csl, apb, txt, start, len);
    }
  else
#ifndef PRODUCTION_CLASSIFIERS_ONLY
  if (classifier_flags & CRM_SVM)
    {
      //retval = crm_expr_svm_classify (csl, apb, txt, start, len);
      retval = crm_svm_classify (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_SKS)
    {
      retval = crm_expr_sks_classify (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_FSCM)
    {
      retval = crm_fast_substring_classify (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_NEURAL_NET)
    {
      retval = crm_neural_net_classify (csl, apb, txt, start, len);
    }
  else if (classifier_flags & CRM_PCA)
    {
      retval = crm_pca_classify(csl, apb, txt, start, len);
    }
  else
#endif	// !PRODUCTION_CLASSIFIERS_ONLY
    {
      //   Default with no classifier specified is Markov
      apb->sflags = apb->sflags | CRM_MARKOVIAN;
      retval = crm_expr_markov_classify (csl, apb, txt, start, len);
    };
  //  NOTE(review): unlike crm_expr_learn, the per-classifier status in
  //  retval is discarded and 0 is always returned — confirm intended.
  return (0);
}
crm114-20100106-BlameMichelson.src/exectest.crm0000755000000000017500000000454011321154266017332 0ustar rootwsy#!
/usr/bin/crm # # exectest.crm - test system execution # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window { isolate (:lsout:) output /\n ----- executing an ls -----\n/ syscall ( ) (:lsout:) /ls/ output /:*:lsout:/ isolate (:calcout:) output /\n ----- calculating 1 + 2 + 3 using bc -----\n/ syscall ( 1 + 2 + 3 \n) (:calcout:) /bc/ output /:*:calcout:/ isolate (:lslaout:) output /\n ----- executing an ls -la -----\n/ syscall ( ) (:lslaout:) /ls -la / output /:*:lslaout:/ isolate (:catout:) output /\n ----- outputting to a file using cat -----\n/ syscall ( This is a cat out \n) (:catout:) /cat > e1.out/ output /:*:catout:/ isolate (:c1:) isolate ( :proc:) output /\n ----- keeping a process around ---- \n/ output /\n preparing... :*:proc:/ syscall ( a one \n ) ( ) (:proc:) /cat > e2.out/ output /\n did one... :*:proc:/ syscall ( and a two \n ) () (:proc:) // output /\n did it again...:*:proc:/ syscall ( and a three \n) () (:proc:) // output /\n and done ...:*:proc: \n/ output /\n ----- testing keep reads from proc -----\n/ isolate (:lslaout:) / / # syscall < keep > ( ) (:lslaout:) (:proc:) /ls -la \/dev / syscall < keep > ( ) (:lslaout:) (:proc:) /ls -la / output /--- got immediate : \n:*:lslaout: \n ---end-----/ :async_test_sleeploop: syscall () (:lslaout:) (:proc:) // output /--- got this later : \n-:*:lslaout:-\n/ { # try it first sleeping only .1 second { syscall /sleep .1/ syscall () (:lslaout:) (:proc:) output /--- .1 sleep: got : -:*:lslaout:-/ match [:lslaout:] /....../ goto /:async_test_sleeploop:/ } # try again, sleeping 1 second { syscall <> /sleep 1/ syscall () (:lslaout:) (:proc:) output /--- 1.0 sleep: got : \n-:*:lslaout:-\n/ match [:lslaout:] /......./ goto /:async_test_sleeploop:/ } # try onece again, sleeping 3 second { syscall <> /sleep 3/ syscall () (:lslaout:) (:proc:) output /--- 3.0 sleep: got : \n-:*:lslaout:\n/ match [:lslaout:] /......./ goto /:async_test_sleeploop:/ } } syscall <> () (:lslaout:) 
(:proc:) // output /--- and finally synch\/eof : \n:*:lslaout: \n ---end-----/ output /Testing fire-and-forget asynch \n/ syscall < async > ( ) (:lslaout:) (:proc:) /ls -la \/dev / output /fire and forget ls got: :*:lslaout: \n/ } crm114-20100106-BlameMichelson.src/crm_fast_substring_compression.c0000644000000000017500000021134111321154266023462 0ustar rootwsy// crm_fast_substring_compression.c - fast substring compression tools // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *tempbuf; ///////////////////////////////////////////////////////////////// // // Compression-match classification // // This classifier is based on the use of the Lempel-Ziv LZ77 // (published in 1977) algorithm for fast compression; more // compression implies better match. // // The basic idea of LZ77 is to encode strings of N characters // as a small doublet (Nstart, Nlen), where Nstart and Nlen are // backward references into previously seen text. If there's // no previous correct back-reference string, then don't compress // those characters. // // Thus, LZ77 is a form of _universal_ compressor; it starts out knowing // nothing of what it's to compress, and develops compression // tables "on the fly". // // It is well known that one way of doing text matching is to // compare relative compressibility - that is, given known // texts K1 and K2, the unknown text U is in the same class as K1 // iff the LZ77 compression of K1|U is smaller (fewer bytes) than // the LZ77 compression of K2|U . 
(remember you need to subtract // the compressed lengths of K1 and K2 respectively). // // There are several ways to generate LZ compression fast; one // way is by forward pointers on N-letter prefixes. Another // way is to decide on a maximum string depth and build transfer // tables. // // One problem with LZ77 is that finding the best possible compression // is NP-hard. Consider this example: // // ABCDEFGHI DEFGHIJLMN BCDEFGHIJ JKLMNOPQ ABCDEFGHIJKLMNOP // // Is it better to code the final part with the A-J segment // followed by the J-P segment, or with a literal ABC, then D-N, // then the literal string OP? Without doing the actual math, you // can't easily decide which is the better compression. In the // worst case, the problem becomes the "knapsack" problem and // is thus NP-hard. // // To avoid this, we take the following heuristic for our first // coding attempt: // // Look for the longest match of characters that // match the unknown at this point and use that. // // In the worst case, this algorithm will traverse the entire // known text for each possible starting character, and possibly // do a local search if that starting character matches the // character in the unknown text. Thus, the time to run is // bounded by |U| * |K|. // // Depending on the degree of overlap between strings, this // heuristic will be at best no worse than half as good as the // best possible heuristic, and (handwave not-proven) not worse // than one quarter as good as the best possible heuristic. // // As a speedup, we can use a prefix lookforward based on N // (the number of characters we reqire to match before switching // from literal characters to start/length ranges). Each character // also carries an index, saying "for the N-character lookforward // prefix I am at the start of, you can find another example of this // at the following index." 
// // For example, the string "ABC DEF ABC FGH ABC XYZ" would have // these entries inserted sequentially into the lookforward table: // // ABC --> 0 // BC --> 1 // C D --> 2 // DE --> 3 // DEF --> 4 // EF --> 5 // F A --> 6 // AB --> 7 // // At this point, note that "ABC" recurs again. Since we want to // retain both references to "ABC" strings, we place the index of // the second ABC (== 8) into the "next occurrence" tag of the // first "ABC". (or, more efficiently, set the second ABC to point // to the first ABC, and then have the lookforward table point to the // second ABC (thus, the chain of ABCs is actually in the reverse order // of their encounters). // // For prefix lengths of 1, 2, and 3, the easiest method is to // direct-map the prefixes into a table. The table lengths would // be 256, 65536, and 16 megawords (64 megabytes). The first two are // eminently tractable; the third marginally so. The situation // can be improved by looking only at the low order six bits of // the characters as addresses into the direct map table. For normal // ASCII, this means that the control characters are mapped over the // capital letters, and the digits and punctuation are mapped over // lowercase letters, and uses up only 16 megabytes for the table entries. // // Of course, a direct-mapped, 1:1 table is not the only possible // table. It is also possible to create a hash table with overflow // chains. For example, an initial two-character table (256Kbytes) yields // the start of a linear-search chain; this chain points to a linked list of // all of the third characters yet encountered. 
// // Here's some empirical data to get an idea of the size of the // table actually required: // // for SA's hard_ham // lines words three-byte sequences // 114K 464K 121K // 50K 210K 61K // 25K 100K 47K // 10K 47K 31K // 5K 24K 21K // // For SA's easy_ham_2: // lines words three-byte sequences // 134K 675K 97K // 25K 130K 28K // // For SA's spam_2: // lines words three-byte sequences // 197K 832K 211K // 100K 435K 116K // // So, it looks like in the long term (and in English) there is // an expectation of roughly as many 3-byte sequences as there are // lines of text, probably going asymptotic at around // a quarter of a million unique 3-byte sequences. Note that // the real maximum is 2^24 or about 16 million three-byte // sequences; however some of them would never occur except in // binary encodings. // // // ----- Embedded Limited-Length Markov Table Option ----- // // Another method of accomplishing this (suitable for N of 3 and larger) // is to use transfer tables allocated only on an as-needed basis, // and to store the text in the tables directly (thus forming a sort of // Markov chain of successive characters in memory). // // The root table contains pointers to the inital occurrence in the known // text of all 256 possible first bytes; each subsequent byte then // allocates another transfer table up to some predetermined limit on // length. // // A disadvantage of this method is that it uses up more memory for // storage than the index chaining method; further it must (by necessity) // "cut off" at some depth thereby limiting the longest string that // we want to allocate another table for. In the worst case, this // system generates a doubly diagonal tree with |K|^2 / 4 tables. // On the other hand, if there is a cutoff length L beyond which we // don't expand the tables, then only the tables that are needed get // allocated. 
As a nice side effect, the example text becomes less // trivial to extract (although it's there, you have to write a // program to extract it rather than just using "strings", unlike the // bit entropy classifier where a well-populated array contains a lot // of uncertainty and it's very difficult to even get a single // byte unambiguously. // // Some empirical data on this method: // // N = 10 // Text length (bytes) Tables allocated // 1211 7198 // 69K 232K // 96K 411K // 204K 791K // // N=5 // Text length (bytes) Tables allocated // 1210 2368 // 42K 47K // 87K 79K // 183K 114K // 386K 177K // 841K 245K // 1800K 342K // 3566K 488K // 6070K 954K // 8806K 1220K // // N=4 // Text length (bytes) Tables allocated // 87K 40K // 183K 59K // 338K 89K // 840K 121K // 1800K 167K // 3568K 233K // 6070K 438K // // N=3 // Text length (bytes) Tables allocated // 87K 14K // 183K 22K // 386K 31K // 840K 42K // 1800K 58K // 3568K 78K // 6070K 132K // // // Let's play with the numbers a little bit. Note that // the 95 printable ASCII characters could have a theoretical // maximum of 857K sequences, and the full 128-character ASCII // (low bit off) is 2.09 mega-sequences. If we casefold A-Z onto // a-z and all control characters to "space", then the resulting // 69 characters is only 328K possible unique sequences. // // A simpler method is to fold 0x7F-0x8F down onto 0x00-0x7F, and // then 0x00-0x3F onto 0x40-0x7F (yielding nothing but printable // characters- however, this does not preserve whitespace in any sense). // When folded this way, the SA hard_ham corpus (6 mbytes, 454 words, 114 // K lines) yields 89,715 unique triplets. // // Of course, for other languages, the statistical asymptote is // probably present, but casefolding in a non-Roman font is probably // going to give us weak results. 
// // --- Other Considerations --- // // Because it's unpleasant and CPU-consuming to read and write // non-binary-format statistics files (memory mapping is far // faster) it's slightly preferable to have statistics files // that don't have internal structures that change sizes (appending is // more pleasant than rewriting with new offsets). Secondarily, // a lot of users are much more comfortable with the knowledge // that their statistics files won't grow without bound. Therefore, // fixed-size structures are often preferred. // // // --- The text storage --- // // In all but the direct-mapped table method, the text itself needs // to be stored because the indexing method only says where to look // for the first copy of any particular string head, not where all // of the copies are. Thus, each byte of the text needs an index // (usually four bytes) of "next match" information. This index // points to the start of the next string that starts with the // current N letters. // // Note that it's not necessary for the string to be unique; the // next match chains can contain more than one prefix. As long // as the final matching code knows that the first N bytes need // to be checked, there's no requirement that chains cannot be // shared among multiple N-byte prefixes. Indeed, in the limit, // a simple sequential search can be emulated by a shared-chain // system with just ONE chain (each byte's "try next" pointer // points to the next byte in line). These nonunique "try next" // chains may be a good way to keep the initial hash table // manageabley small. However, how to efficiently do this // "in line" is unclear (the goal of in-line is to hide the // known text so that "strings" can't trivially extract it; // the obvious solution is to have two data structures (one by // bytes, one by uint32's, but the byte structure is then easily // perusable). // // Another method would be to have the initial table point not // to text directly, but to a lookforward chain. 
Within the chain, // cells are allocated only when the offset backward exceeds the // offset range allowed by the in-line offset size. For one-byte // text and three-byte offsets, this can only happen if the text // grows beyond 16 megabytes of characters (64 megabyte footprint) // // --- Hash Tables Revisited --- // // Another method is to have multiple hash entries for every string // starting point. For example, we might hash "ABC DEF ABC", "ABC DEF", // and "ABC" and put each of these into the hash table. // // We might even consider that we can _discard_ the original texts // if our hash space is large enough that accidental clashes are // sufficiently rare. For example, with a 64-bit hash, the risk of // any two particular strings colliding is 1E-19, and the risk of // any collision whatsoever does not approach 50% with less than // 1 E 9 strings in the storage. // // ------- Hashes and Hash Collisions ----- // // To see how the hashes would collide with the CRM114 function strnhash, // we ran the Spamassasin hard-ham corpus into three-byte groups, and // then hashed the groups. Before hashing, there were 125,434 unique // three-byte sequences; after hashing, there were 124,616 unique hashes; // this is 818 hash collisions (a rate of 0.65%). This is a bit higher // than predicted for truly randomly chosen inputs, but then again, the // SA corpus has very few bytes with the high order bit set. // // ------ Opportunistic chain sharing ----- // // (Note- this is NOT being built just yet; it's just an idea) - the // big problem with 24-bit chain offsets is that it might not have // enough "reach" for the less common trigrams; in the ideal case any // matching substring is good enough and losing substrings is anathema. // However, if we have a number of sparse chains that are at risk for // not being reachable, we can merge those chains either together or // onto another chain that's in no danger of running out of offsets. 
//
//  Note that it's not absolutely necessary for the two chains to be
//  sorted together; as long as the chains are connected end-to-end,
//  the result will still be effective.
//
//  ----- Another way to deal with broken chains -----
//
//  (also NOT being built yet; this is just another possibility)
//  Another option: for systems where there are chains that are about
//  to break because the text is exceeding 16 megabytes (the reach of
//  a 24-bit offset), at the end of each sample document we can insert
//  a "dummy" forwarding cell that merely serves to preserve continuity
//  of any chain that might be otherwise broken because the N-letter prefix
//  string has not occurred even once in the preceding 16 megacells.
//  (worst case analysis: there are 16 million three-byte prefixes, so
//  if all but ONE prefix was actually ever seen in a 16-meg block, we'd
//  have a minor edge-case problem for systems that did not do chain
//  folding.  With chain-folding down to 18 bits (256K different chains)
//  we'd have no problem at all, even in the worst corner case.)
//
//  However, we still need to insert these chain-fixers preemptively
//  if we want to use "small" lookforward cells, because small (24-bit)
//  cells don't have the reach to be able to index to the first occurrence
//  of a chain that's never been seen in the first 16 megacharacters.
//  This means that at roughly every 16-megacell boundary we would
//  insert a forwarding dummy block (worst case size of 256K cells, on
//  the average a lot fewer because some will actually get used in real
//  chains.)  That sounds like a reasonable tradeoff in size, but the
//  bookkeeping to keep it all straight is going to be painful to code and
//  test rigorously.
//
//
//  ------- Hashes-only storage ------
//
//  In this method, we don't bother to store the actual text _at all_,
//  but we do store chains of places where it occurred in the original
//  text.
In this case, we LEARN by sliding our window of strnhash(N) // characters over the text. Each position generates a four-byte // backward index (which may be NULL) to the most recent previous // encounter of this prefix; this chain grows basically without limit. // // To CLASSIFY, we again slide our strnhash(N) window over the text; // and for each offset position we gather the (possibly empty) list of // places where that hash occurred. Because the indices are pre-sorted // (always in descending order) it is O(n) in the length of the chains // to find out any commonality because the chains can be traversed by the // "two finger method" (same as in the hyperspace classifier). The // result for any specific starting point is the list of N longest matches // for the leading character position as seen so far. If we choose to // "commit on N characters matching, then longest that starts in that // chain" then the set of possible matches is the tree of indices and // we want the longest branch. // // This is perhaps most easily done by an N-finger method where we keep // a list of "fingers" to the jth, j+1, j+2... window positions; at each // position j+k we merely insure that there is an unbroken path from j+0 // to j+k. (we could speed this up significantly by creating a lookaside // list or array that contains ONLY the valid positions at j+k; moving the // window to k+1 means only having to check through that array to find at // least one member equal to the k+1 th window chain. In this case, the // "two-finger" method suffices, and the calculation can be done "in place". // When the path breaks (that is, no feasible matches remain), we take // N + k - 1 to be the length of the matched substring and begin again // at j = N + k + j. 
// // Another advantage of this method is that there is no stored text to // recover directly; a brute-force attack, one byte at a time, will // recover texts but not with absolute certainty as hash collisions // might lead to unexpected forks in the chain. // // ------- Design Decision ----- // // Unless something better comes up, if we just take the strnhash() of the // N characters in the prefix, we will likely get a fairly reasonable // distribution of hash values which we can then modulo down to whatever // size table we're actually using. Thus, the size of the prefix and the // size of the hah table are both freely variable in this design. // // We will use the "hash chains only" method to store the statistics // information (thus providing at least moderate obscuration of the // text, as well as moderately efficient storage. // // As a research extension, we will allow an arbitrary regex to determine // the meaning of the character window; repeated regexing with k+1 starting // positions yield what we will define as "legitimately adjacent window // positions". We specifically define that we do not care if these are // genuinely adjacent positions; we can define these things as we wish (and // thereby become space-invariant, if we so choose. // // A question: should the regex define the next "character", or should // it define the entire next window? The former allows more flexibility // (and true invariance over charactersets); the latter is easier to // implement and faster at runtime. Decision: implement as "defines the // whole window". Then we use the count of subexpressions to define our // window length; this would allow skipping arbitrary text - with all the // programming power and danger of abuse that entails. Under this paradigm, // the character regex is /(.)(.)(.)/ for an N=3 minimum chain. // // A quick test shows that strnhash on [a-zA-Z0-9 .,-] shows no // duplications, nor any hash clashes when taken mod 256. 
Thus, // using a Godel coding scheme (that is, where different offsets are // each multiplied by a unique prime number and then those products // are added together ) will *probably* give good hash results. // Because we're moduloing (taking only the low order bits) the // prime number "2" is a bit problematic and we may choose to skip it. // Note that a superincreasing property is not useful here. // // Note that the entire SA corpus is only about 16 megabytes of // text, so a full index set of the SA corpus would be on the // order of 68 megabytes ( 4 megs of index, then another 64 megs // of index chains) // // Note also that there is really no constraint that the chains start // at the low indices and move higher. It is equally valid for the chains // to start at the most recent indices and point lower in memory; this // actually has some advantage in speed of indexing; each chain element // points to the _previous_ element and we do the two-finger merge // toward lower indices. // // Note also that there is no place the actual text or even the actual // hashes of the text are stored. All hashes that map to the same place // in the "seen at" table are deemed identical text (and no record is kept); // similarly each cell of "saved text" is really only a pointer to the // most recent previous location where something that mapped to the // same hash table bucket was seen). Reconstruction of the prior text is // hence marginal in confidence. This ambiguity can be increased by // making the hash table smaller (and thus forcing unreconstructable // collisions). // /////////////////////////////////////////////////////////// #ifdef NEED_PRIME_NUMBERS /////////////////////////////////////////////////////////////// // // Some prime numbers to use as weights. // // GROT GROT GROT note that we have a 1 here instead of a 2 as the // first prime number! That's strictly an artifice to use all of the // hash bits and is not an indication that we don't know that 2 is prime. 
static unsigned long primes [ 260 ] = { 1, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657 } ; #endif // NEED_PRIME_NUMBERS //////////////////////////////////////////////////////////////// // // Headers and self-identifying files are a good idea; we'll // have it happen here. // typedef struct { int prefix_hash_table_length; // # buckets in prefix hash table } FSCM_HEADER; // The prefix array maps a hash to the most recent occurrence of // that hash in the text. 
//   One cell per prefix-hash bucket; .index is the chain-cell offset of
//   the most recent occurrence of any hash mapping here (0 = never seen).
typedef struct {
  unsigned int index;
} FSCM_PREFIX_TABLE_CELL;

//   One cell per learned feature position; .next links to the previous
//   occurrence of the same prefix hash (0 terminates the chain).
typedef struct {
  unsigned int next;
} FSCM_HASH_CHAIN_CELL;

////////////////////////////////////////////////////////////////////////
//
//    How to learn in FSCM - two parts:
//    1) append our structures to the statistics file in
//       the FSCM_CHAR_STRUCT format;
//    2) index the new structures; if no prior exists on the chain,
//       let previous link be 0, otherwise it's the prior value in the
//       hash table.
//    3) Nota bene: Originally this code grew the structures downward;
//       this turned out to be a bad idea because some types of documents
//       of interest contained long runs (1000+) of identical characters and
//       the downward-growing structures took geometrically long
//       periods of time to traverse repeatedly.
//

//   LEARN entry point for the FSCM classifier.
//     csl     - control block (not referenced in this body)
//     apb     - argparse block: supplies the statistics-file name (p1)
//               and behavior flags (sflags)
//     txtptr / txtstart / txtlen - the text window to learn
//   Returns 0.  Side effects: creates (if absent) and appends to the
//   statistics file named in p1, via stdio writes plus an mmap cycle;
//   the final munmap-by-name forces the msync.
int crm_fast_substring_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb,
                              char *txtptr, long txtstart, long txtlen)
{
  //     learn the compression version of this input window as
  //     belonging to a particular type.  Note that the word regex
  //     is ignored in this classifier.
  //
  //     learn (classname)
  //
  long i, j;
  char htext[MAX_PATTERN];          //  the hash name buffer
  char hashfilename [MAX_PATTERN];  //  the hashfile name
  FILE *hashf;                      //  stream of the hashfile
  unsigned long textoffset, textmaxoffset;
  long hlen;
  struct stat statbuf;              //  for statting the statistics file
  long fscm_file_length = 0;        //  # prefix buckets; stays 0 unless we
                                    //  create the file in this call
  char *file_pointer;
  STATISTICS_FILE_HEADER_STRUCT *file_header;
  FSCM_PREFIX_TABLE_CELL *prefix_table;      //  the prefix indexing table,
  unsigned long prefix_table_size;
  FSCM_HASH_CHAIN_CELL *chains, *newchains;  //  the chain area
  unsigned int newchainstart;       //  offset in cells of new chains
  long sense;
  long microgroom;            //  NOTE(review): set from sflags, unused below
  long unique;                //  NOTE(review): set from sflags, unused below
  long use_unigram_features;  //  NOTE(review): set from sflags, unused below
  long fev;

  //      Dummies used for the vector tokenizer
  char ptext [MAX_PATTERN];   //  regex pattern
  long plen = 0;
  int *ca = NULL;             //  Coefficient Array (we'll take the default)
  long pipelen = 0;
  long pipe_iters = 0;
  long next_offset = 0;

  //   unk_hashes is tempbuf, but casting-aliased to FSCM chains
  long unk_hashcount;
  unsigned *unk_hashes;
  unk_hashes = (unsigned *) tempbuf;

  statbuf.st_size = 0;
  fev = 0;

  if (user_trace)
    fprintf (stderr, "executing an FSCM LEARN\n");

  //           extract the hash file name
  crm_get_pgm_arg ((char *)htext, MAX_PATTERN, apb->p1start, apb->p1len);
  hlen = apb->p1len;
  hlen = crm_nexpandvar ((char *)htext, hlen, MAX_PATTERN);

  //     set flags
  sense = +1;
  if (apb->sflags & CRM_NOCASE)
    {
      //  NOTE(review): trace message only - no case folding happens here.
      if (user_trace)
        fprintf (stderr, "turning on case-insensitive match\n");
    };
  if (apb->sflags & CRM_REFUTE)
    {
      /////////////////////////////////////
      //    Take this out when we finally support refutation
      ////////////////////////////////////
      fatalerror5 ("FSCM Refute is NOT SUPPORTED YET\n",
                   "If you want refutation, this is a good time to"
                   "learn to code.", CRM_ENGINE_HERE);
      return (0);
      //  Deliberately unreachable until refutation is supported:
      sense = -sense;
      if (user_trace)
        fprintf (stderr, " refuting learning\n");
    };
  microgroom = 0;
  if (apb->sflags & CRM_MICROGROOM)
    {
      microgroom = 1;
      if (user_trace)
        fprintf (stderr, " enabling microgrooming.\n");
    };
  unique = 0;
  if (apb->sflags & CRM_UNIQUE)
    {
      unique = 1;
      if (user_trace)
        fprintf (stderr, " enabling uniqueifying features.\n");
    };
  use_unigram_features = 0;
  if (apb->sflags & CRM_UNIGRAM)
    {
      use_unigram_features = 1;
      if (user_trace)
        fprintf (stderr, " using only unigram features.\n");
    };

  //
  //             grab the filename, and stat the file
  //      note that neither "stat", "fopen", nor "open" are
  //      fully 8-bit or wchar clean...
  //  i = 0;
  //  while (htext[i] < 0x021) i++;
  //  j = i;
  //  while (htext[j] >= 0x021) j++;
  crm_nextword ( (char *) htext, hlen, 0, (long *) &i, (long *) &j);

  //             filename starts at i,  ends at j. null terminate it.
  htext[j] = '\000';
  strcpy (hashfilename, &htext[i]);
  if (user_trace)
    fprintf (stderr, "Target file file %s\n", hashfilename);

  textoffset = txtstart;
  textmaxoffset = txtstart + txtlen;

  {
    long nosuchfile;
    //   Check to see if we need to create the file; if we need
    //   it, we create it here.
    //
    nosuchfile = stat ( hashfilename, &statbuf);
    if (nosuchfile)
      {
        //  Note that "my_header" is a local buffer.
        STATISTICS_FILE_HEADER_STRUCT my_header;
        FSCM_HEADER my_fscm_header;
        if (user_trace)
          fprintf (stderr, "No such statistics file %s; must create it.\n",
                   hashfilename);

        //  Set the size of the hash table.
        fscm_file_length = sparse_spectrum_file_length;
        if (fscm_file_length == 0)
          fscm_file_length =
            FSCM_DEFAULT_HASH_TABLE_SIZE;  //  choose well for speed/accuracy

        /////////////////////////////////////////////////
        //   START OF STANDARD HEADER SETUP
        memset(&my_header, '\0', sizeof(my_header));
        memset(&my_fscm_header, '\0', sizeof(my_fscm_header));
        strcpy ((char *)my_header.file_ident_string ,
                "CRM114 Classdata FSCM V2 (hashed) ");

        //  offset of a member of my_header from the beginning of the structure
#define OFF_M(member) ((char *)&my_header.member - (char *)&my_header)
        //   header info, chunk 0 - the ident string
        my_header.chunks[0].start = OFF_M(file_ident_string);
        my_header.chunks[0].length = sizeof(my_header.file_ident_string);
        my_header.chunks[0].tag = 1;
        //
        //   header info chunk 1 - the header chunking info itself
        my_header.chunks[1].start = OFF_M(chunks);
        my_header.chunks[1].length = sizeof (my_header.chunks);
        my_header.chunks[1].tag = 0;
        //
        //   header info, chunk 2 - our specific header
        my_header.chunks[2].start = sizeof(my_header);
        my_header.chunks[2].length = sizeof(my_fscm_header);
        my_header.chunks[2].tag = 2;
#undef OFF_M
        //   END OF STANDARD HEADER SETUP
        //////////////////////////////////////////////////

        //   header info chunk 3 - the prefix hash table, fixed size
        my_header.chunks[3].start = sizeof(STATISTICS_FILE_HEADER_STRUCT);
        my_header.chunks[3].length = fscm_file_length
          * sizeof(FSCM_PREFIX_TABLE_CELL);
        my_header.chunks[3].tag = 3;
        //  ... and the length of that hash table will be, in cells:
        my_fscm_header.prefix_hash_table_length = fscm_file_length;
        //
        //   header info chunk 4 - the previous-seen pointers, growing.
        my_header.chunks[4].start =
          my_header.chunks[3].start + my_header.chunks[3].length;
        //  Although the starting length is really zero, zero is a sentinel
        //  so we start at 1 bucket further in...
        my_header.chunks[4].length = sizeof (FSCM_HASH_CHAIN_CELL);
        my_header.chunks[4].tag = 4;

        //  Write out the initial file header..
        //  NOTE(review): "dontcare" is a project global that absorbs
        //  fwrite's return; fopen result is unchecked - confirm policy.
        hashf = fopen (hashfilename, "wb+");
        dontcare = fwrite (&my_header,
                           sizeof (STATISTICS_FILE_HEADER_STRUCT), 1, hashf);
        dontcare = fwrite (&my_fscm_header, sizeof(FSCM_HEADER), 1, hashf);
        fclose (hashf);
      };
  };

  /////////////////////////////////////////////////////////////
  //
  //    Grow-the-file code.
  //
  //    This happens whether or not this is a new file.
  //
  ///////////////////////////////////////////////////////

  if (sense > 0)
    {
      /////////////////
      //    THIS PATH TO LEARN A TEXT
      //    1) Make room!  Append enough unsigned int zeroes that
      //       we will have space for our hashes.
      //    2) MMAP the file
      //    3) actually write the hashes
      //    4) adjust the initial-look table to point to those hashes;
      //       while modifying those hashes to point to the most recent
      //       previous hashes;
      //    5) MSYNC the output file.  As we already did a file system
      //       write it should not be necessary to do an mtime-fixup write.
      //
      /////////////////

      //  Write out the initial previous-seen hash table (all zeroes).
      //  NOTE(review): if the file already existed, fscm_file_length is
      //  still 0 here, so this loop writes nothing and only the single
      //  index-zero cell below is appended.
      {
        FSCM_PREFIX_TABLE_CELL my_zero_table;
        my_zero_table.index = 0;
        hashf = fopen (hashfilename, "ab+");
        for (i = 0; i < fscm_file_length; i++)
          dontcare = fwrite (&my_zero_table,
                             sizeof (FSCM_PREFIX_TABLE_CELL), 1, hashf);
        //  ... and write a single 32-bit zero to cover index zero.
        dontcare = fwrite (& (my_zero_table),
                           sizeof (FSCM_PREFIX_TABLE_CELL), 1, hashf);
        //  All written; the file now exists with the right setup.
        fclose (hashf);
      };

      //  We need one 32-bit zero for each character in the to-be-learned
      //  text; we'll soon clobber the ones that are in previously
      //  seen chains to chain members (the others can stay as zeroes).
      {
        FSCM_HASH_CHAIN_CELL my_zero_chain;
        //  Now it's time to generate the actual string hashes.
        //  By default (no regex) it's a string kernel, length 6,
        //  but it can be any prefix one desires.
        //
        //  Generate the hashes.
        crm_vector_tokenize_selector
          (apb,                // the APB
           txtptr,             // input string
           txtstart,           // starting offset
           txtlen,             // how many bytes
           ptext,              // parser regex
           plen,               // parser regex len
           ca,                 // tokenizer coeff array
           pipelen,            // tokenizer pipeline len
           pipe_iters,         // tokenizer pipeline iterations
           unk_hashes,         // where to put the hashed results
           data_window_size / sizeof (*unk_hashes), // max number of hashes
           &unk_hashcount,     // how many hashes we actually got
           &next_offset);      // where to start again for more hashes

        if (internal_trace)
          {
            fprintf (stderr,
                     "L.Total %ld hashes - first 16 values:\n"
                     "%u %u %u %u %u %u %u %u\n",
                     unk_hashcount,
                     unk_hashes[0], unk_hashes[1], unk_hashes[2],
                     unk_hashes[3], unk_hashes[4], unk_hashes[5],
                     unk_hashes[6], unk_hashes[7]);
            fprintf (stderr,
                     "%u %u %u %u %u %u %u %u\n",
                     unk_hashes[8], unk_hashes[9], unk_hashes[10],
                     unk_hashes[11], unk_hashes[12], unk_hashes[13],
                     unk_hashes[14], unk_hashes[15]);
          };

        //  Now a nasty bit.  Because there might be retained hashes of the
        //  file, we need to force an unmap-by-name which will allow a remap
        //  with the new file length later on.
        if (internal_trace)
          fprintf (stderr, "mmapping file %s for known state\n",
                   hashfilename);
        crm_mmap_file (hashfilename,
                       0, 1,
                       PROT_READ | PROT_WRITE,
                       MAP_SHARED,
                       NULL);
        crm_force_munmap_filename (hashfilename);
        if (internal_trace)
          fprintf (stderr, "UNmmapped file %s for known state\n",
                   hashfilename);

        if (user_trace)
          fprintf (stderr, "Opening FSCM file %s for append.\n",
                   hashfilename);
        hashf = fopen ( hashfilename , "ab+");
        if (user_trace)
          fprintf (stderr, "Writing to hash file %s\n", hashfilename);
        my_zero_chain.next = 0;
        //  Note the "+ 3" here - to put in a pair of sentinels in
        //  the output file: one at each end of a text segment.
        for (i = 0; i < unk_hashcount + 3; i++)
          dontcare = fwrite (& my_zero_chain,
                             sizeof (FSCM_HASH_CHAIN_CELL), 1, hashf);
        fclose (hashf);
      };

      //  Now the file has the space; we can now mmap it and set up our
      //  working pointers.
      stat (hashfilename, &statbuf);
      if (internal_trace)
        fprintf (stderr, "mmapping_2 file %s\n", hashfilename);
      file_pointer = crm_mmap_file (hashfilename,
                                    0, statbuf.st_size,
                                    PROT_READ | PROT_WRITE,
                                    MAP_SHARED,
                                    NULL);
      if (internal_trace)
        fprintf (stderr, "mmapped_2 file %s\n", hashfilename);

      //  set up our pointers for the prefix table and the chains
      file_header = (STATISTICS_FILE_HEADER_STRUCT *) file_pointer;
#if 0
      {
        FSCM_HEADER *f = (FSCM_HEADER *)(file_header + 1);
        prefix_table_size = f->prefix_hash_table_length;
      }
#else
      //  Derive the bucket count from the chunk-3 byte length rather than
      //  the FSCM_HEADER field (the #if 0 alternative above).
      prefix_table_size = file_header->chunks[3].length
        / sizeof (FSCM_PREFIX_TABLE_CELL);
#endif
      prefix_table = (FSCM_PREFIX_TABLE_CELL *)
        &file_pointer[file_header->chunks[3].start];
      chains = (FSCM_HASH_CHAIN_CELL *)
        &file_pointer[file_header->chunks[4].start];

      //  Note the little two-step dance to recover the starting location
      //  of the new chain space.
      //
      newchainstart = 1 +
        file_header->chunks[4].length / sizeof (FSCM_HASH_CHAIN_CELL);
      if (internal_trace)
        fprintf (stderr,
                 "Chain field: %lu (entries %lu) new chainstart offset %u\n",
                 (unsigned long)file_header->chunks[4].start
                 / sizeof (FSCM_HASH_CHAIN_CELL),
                 (unsigned long)file_header->chunks[4].length
                 / sizeof (FSCM_HASH_CHAIN_CELL),
                 newchainstart );
      newchains = (FSCM_HASH_CHAIN_CELL *) &chains [newchainstart];
      //  ... and this is the new updated length.
      file_header->chunks[4].length +=
        (unk_hashcount + 3) * sizeof (FSCM_HASH_CHAIN_CELL);

      //  For each hash, insert it into the prefix table
      //  at the right place (that is, at hash mod prefix_table_size).
      //  If the table had a zero, it becomes nonzero.  If the table
      //  is nonzero, we walk the chain and modify the first zero
      //  to point to our new hash.
      if (internal_trace)
        {
          fprintf (stderr,
                   "\n\nPrefix table size: %lu, starting at offset %u\n",
                   prefix_table_size, newchainstart);
        };
      for (i = 0; i < unk_hashcount; i++)
        {
          unsigned int pti, cind;
          pti = unk_hashes[i] % prefix_table_size;
          if (internal_trace)
            {
              fprintf (stderr,
                       "offset %ld icn: %lu hash %u tableslot %u"
                       " (prev offset %u)\n",
                       i, i + newchainstart, unk_hashes[i], pti,
                       prefix_table [pti].index );
              cind = prefix_table[pti].index;
              while ( cind != 0)
                {
                  fprintf (stderr,
                           " ... now location %u forwards to %u \n",
                           cind, chains[cind].next);
                  cind = chains[cind].next;
                };
            };
          //  Desired State:
          //  chains [old] points to chains [new]
          //  prefix_table [pti] = chains [old]
          if (prefix_table[pti].index == 0)
            {   //  first entry in this chain, so fill in the table.
              prefix_table[pti].index = i + newchainstart;
              chains [i + newchainstart].next = 0;
            }
          else
            {   //  not first entry-- chase the chain, we go at the end
              cind = prefix_table[pti].index;
              while (chains[cind].next != 0)
                cind = chains [cind].next;     //  cdr down to end of chain
              chains[cind].next = i + newchainstart;  //  point at our cell.
              chains [i + newchainstart].next = 0;
            };
        };

      //  forcibly let go of the mmap to force an msync
      if (internal_trace)
        fprintf (stderr, "UNmmapping file %s\n", hashfilename);
      crm_force_munmap_filename (hashfilename);
    };
  return (0);
};

//     A helper (but crucial) function - given an array of hashes
//     and a prefix_table / chain array pair, calculate the compressibility
//     of the hashes; this is munged (reports say best accuracy if
//     raised to the 1.5 power) and that is the returned value.
//
//
//     The algorithm here is actually suboptimal.
//     We take the first match presented (i.e. where unk_hashes[i] maps
//     to a nonzero cell in the prefix table) then for each additional
//     hash (i.e. unk_hashes[i+j] where there is a consecutive chain
//     in the chains[q, q+1, q+2]; we sum the J raised to the q_exponent
//     power for each such chain and report that result back.
//
// The trick we employ here is that for each starting position q
// all possible solutions are on q's chain, but also on q+1's
// chain, on q+2's chain, on q+3's chain, and so on.
//
// At this point, we can go two ways: we can use a single index (q)
// chain and search forward through the entire chain, or we can use
// multiple indices and an n-way merge of n chains to cut the number of
// comparisons down significantly.
//
// Which is optimal here?  Let's assume the texts obey something like
// Zipf's law (Nth term is 1/Nth as likely as the 1st term).  Then the
// probable number of comparisons to find a string of length Q in
// a text of length |T| by using the first method is
// (1/N) + ( 1/ N) + ... = Q * (1/N) and we
// can stop as soon as we find a string of Q elements (but since we
// want the longest one, we check all |T| / N occurrences and that takes
// |T| * Q / N^2 comparisons, and we need roughly |U| comparisons
// overall, it's |T| * |U| * Q / N^2 .
//
// In the second method (find all chains of length Q or longer) we
// touch each of the Q chain members once.  The number of members of
// each chain is roughly |T| / N and we need Q such chains, so the
// time is |T| * Q / N.  However, at the next position
// we can simply drop the first chain's constraint; all of the other
// calculations have already been done once; essentially this search
// can be carried out *in parallel*; this cuts the work by a factor of
// the length of the unknown string.  However, dropping the constraint
// is very tricky programming and so we aren't doing that right now.
//
// We might form the sets where chain 1 and chain 2 are sequential in the
// memory.  We then find where chains 2 and 3 are sequential in the
// memory; where chains 3 and 4 are sequential, etc.  This is essentially
// a relational database "join" operation, but with "in same record"
// being replaced with "in sequential slots".
//
// Assume we have a vector "V" of indices carrying each chain's current
// putative pointer for a sequential match.  (assume the V vector is
// as long as the input string).
//
// 0) We initialize the indexing vector "V" to the first element of each
//    chain (or NULL for never-seen chains), and "start", "end", and
//    "total" to zero.
// 1) We set the chain index-index "C" to 0 (the leftmost member
//    of the index vector V).
// 2.0) We do a two-finger merge between the C'th and C + 1 chains,
//    moving one chain link further on the lesser index in each cycle.
//    (NB the current build method causes link indices to be descending in
//    numerical value; so we step to the next link on the _greater_ of the
//    two chains.
// 2a) we stop the two-finger merge when:
//       V[C] == V[C+1] + 1
//    in which case
//      >> C++,
//      >> if C > end then end = C
//      >> go to 2.0 (repeat the 2-finger merge);
// 2b) if the two-finger merge runs out of chain on either the
//     C chain or the C++ chain (that is, a NULL):
//      >> set the "out of chain" V element back to the initial state;
//      >> go back one chain pair ( "C = C--")
//         If V[C] == NULL
//          >> report out (end-start) as a maximal match (incrementing
//             total by some amount),
//          >> move C over to "end" in the input stream
//          >> reset V[end+1] back to the chain starts.  Anything further
//             hasn't been touched and so can be left alone.
//          >> go to 2.0
//
// This algorithm still has the flaw that for the input string
// ABCDE, the subchain BCDE is searched when the input string is at A,
// and then again at B.  However, any local matches BC... are
// guaranteed to be captured in the AB... match (we would look at
// only the B's that followed A's, not all of the B's even), so perhaps
// this isn't much extra work after all.
//
// Note: the reason for passing array of hashes rather than the original
// string is that the calculation of the hashes is necessary and it's
// more efficient to do it once and reuse.
Also, it means that the // hashes can be computed with a non-orthodox (i.e. not a string kernel) // method and that might take serious computes and many regexecs. //////////////////////////////////////////////////////////////////////// // // Helper functions for the fast match. // //////////////////////////////////////////////////////////////////////// // given a starting point, does it exist on a chain? static unsigned int chain_search_one_chain_link ( FSCM_HASH_CHAIN_CELL *chains, unsigned int chain_start, unsigned int must_match, int init_cache ) { int i, cachedex; typedef struct { unsigned int chstart; unsigned int cval0; unsigned int cval1; } FSCM_CHAIN_CACHE_CELL; static FSCM_CHAIN_CACHE_CELL cache [FSCM_CHAIN_CACHE_SIZE]; // zero the cache if requested if ( init_cache ) { if (internal_trace) fprintf (stderr, "initializing the chain cache.\n"); for (i = 0; i < FSCM_CHAIN_CACHE_SIZE; i++) { cache[i].chstart = cache[i].cval0 = cache[i].cval1 = 0; }; return (0); }; if (internal_trace) { unsigned int j; fprintf (stderr, " ... chain_search_one_chain chain %u mustmatch %u\n", chain_start, must_match); j = chain_start; fprintf (stderr, "...chaintrace from %u: (next: %u)", j, chains[j].next); while (j != 0) { fprintf (stderr, " %u", j); j = chains[j].next; }; fprintf (stderr, "\n"); }; // Does either or both of our cache elements have a tighter bound // on the mustmatch than the initial chainstart? 
cachedex = chain_start % FSCM_CHAIN_CACHE_SIZE; if (chain_start == cache[cachedex].chstart) { if ( cache[cachedex].cval0 < must_match && cache[cachedex].cval0 > chain_start) chain_start = cache[cachedex].cval0; if ( cache[cachedex].cval1 < must_match && cache[cachedex].cval1 > chain_start) chain_start = cache[cachedex].cval1; } else // forcibly update the cache to the new chain_start { cache[cachedex].chstart = chain_start; cache[cachedex].cval0 = chain_start; cache[cachedex].cval1 = chain_start; } while ( chain_start < must_match && chain_start > 0) { if (internal_trace) fprintf (stderr, " .... from %u to %u\n", chain_start, chains[chain_start].next); chain_start = chains[chain_start].next; cache[cachedex].cval1 = cache[cachedex].cval0; cache[cachedex].cval0 = chain_start; } if ( chain_start == must_match ) { if (internal_trace) fprintf (stderr, " ...Success at chainindex %u\n", chain_start ); return (chain_start); }; if (internal_trace) fprintf (stderr, " ...Failed\n"); return 0; } // From this point in chainspace, how long does this chain run? // // Do NOT implement this recursively, as a document matched against // itself will recurse for each character, so unless your compiler // can fix tail recursion, you'll blow the stack on long documents. // static unsigned int this_chain_run_length ( FSCM_HASH_CHAIN_CELL *chains, // the known-text chains unsigned int *unk_indexes, // index vector to head of each chain unsigned int unk_len, // length of index vctor unsigned int starting_symbol, // symbol where we start unsigned int starting_chain_index // where it has to match (in chainspace) ) { unsigned int offset; unsigned int chain_start; unsigned int in_a_chain; if (internal_trace) fprintf (stderr, "Looking for a chain member at symbol %u chainindex %u\n", starting_symbol, starting_chain_index); offset = 0; // The "offset" applies to both the unk_hashes _and_ the // offset in the known chainspace. 
in_a_chain = unk_indexes[starting_symbol + offset]; while ( (starting_symbol + offset < unk_len) && in_a_chain ) { chain_start = unk_indexes[starting_symbol + offset]; if (internal_trace) fprintf (stderr, "..searching at [symbol %u offset %u] chainindex %u\n", starting_symbol, offset, chain_start); in_a_chain = chain_search_one_chain_link ( chains, chain_start, starting_chain_index + offset, 0); if (in_a_chain) offset++; }; if (internal_trace) fprintf (stderr, "chain_run_length finished at chain index %u (offset %u)\n", starting_chain_index + offset, offset); return (offset); } // Note- the two-finger algorithm works- but it's actually kind of // hard to program in terms of it's asymmetry. So instead, we use a // simpler repeated search algorithm with a cache at the bottom // level so we don't repeatedly search the same (failing) elements // of the chain). // // // NB: if this looks a little like how the genomics BLAST // match algorithm runs, yeah... I get that feeling too, although // I have not yet found a good description of how BLAST actually works // inside, and so can't say if this would be an improvement. However, // it does beg the question of whether a BLAST-like algorithm might // work even _better_ for text matching. Future note: use additional // flag to allow short interruptions of match stream. // // longest_run_starting_here returns the length of the longest match // found that starts at exactly index[starting_symbol] // static unsigned int longest_run_starting_here ( FSCM_HASH_CHAIN_CELL *chains, // array of interlaced chain cells unsigned int *unk_indexes, // index vector to head of each chain unsigned int unk_len, // length of index vector unsigned int starting_symbol // index of starting symbol ) { unsigned int chain_index_start; // Where in the primary chain we are. 
unsigned int this_run, max_run; if (internal_trace) fprintf (stderr, "\n*** longest_run: starting at symbol %u\n", starting_symbol); chain_index_start = unk_indexes[starting_symbol]; this_run = max_run = 0; if (chain_index_start == 0) { if (internal_trace) fprintf (stderr, "longest_run: no starting chain here; returning\n"); return 0; // edge case - no match }; // If we got here, we had at +least+ a max run of one match found // (that being chain_index_start) this_run = max_run = 1; if (internal_trace) fprintf (stderr, "longest_run: found a first entry (chain %u)\n", chain_index_start); while (chain_index_start != 0) { unsigned int chain_index_old; if (internal_trace) fprintf (stderr, "Scanning chain starting at %u\n", chain_index_start); this_run = this_chain_run_length (chains, unk_indexes, unk_len, starting_symbol+1, chain_index_start+1); // if (internal_trace) fprintf (stderr, "longest_run: chainindex run at %u is length %u\n", chain_index_start, this_run); if (this_run > max_run) { if (internal_trace) fprintf (stderr, "longest_run: new maximum\n"); max_run = this_run; } else { if (internal_trace) fprintf (stderr, "longest_run: not an improvement\n"); }; // And go around again till we hit a zero chain index chain_index_start = chains[chain_index_start].next; // skip forward till end of currently found best (Boyer-Moore opt) chain_index_old = chain_index_start; while (chain_index_start > 0 && chain_index_start < chain_index_old + this_run) chain_index_start = chains [chain_index_start].next; }; if (internal_trace) fprintf (stderr, "Final result at symbol %u run length is %u\n", starting_symbol, max_run); if (max_run > 0) return ( max_run + FSCM_DEFAULT_CODE_PREFIX_LEN); else return (0); } // compress_me is the top-level calculating routine which calls // all of the prior routines in the right way. 
static double compress_me ( unsigned int *unk_indexes, // prefix chain-entry table unsigned int unk_len, // length of the entry table FSCM_HASH_CHAIN_CELL *chains, // array of interlaced chain cells double q_exponent // exponent of match ) { unsigned int current_symbol, this_run_length; double total_score, incr_score; int blast_lookback; // Only use if BLAST is desired. total_score = 0.0; current_symbol = 0; blast_lookback = 0; chain_search_one_chain_link (0, 0, 0, 1); // init the chain-cache while (current_symbol < unk_len) { this_run_length = longest_run_starting_here (chains, unk_indexes, unk_len, current_symbol); incr_score = 0; if (this_run_length > 0) { //this_run_length += blast_lookback; incr_score = pow (this_run_length, q_exponent); //blast_lookback = this_run_length; }; //blast_lookback --; //if (blast_lookback < 0) blast_lookback = 0; //if (this_run_length > 2) // fprintf (stderr, " %ld", this_run_length); //else // fprintf (stderr, "_"); total_score = total_score + incr_score; if (internal_trace) fprintf (stderr, "Offset %u compresses %u score %lf\n", current_symbol, this_run_length, incr_score); if (this_run_length > 0) current_symbol = current_symbol + this_run_length; else current_symbol++; }; return (total_score); } // How to do an Improved FSCM CLASSIFY of some text. // int crm_fast_substring_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) { // classify the compressed version of this text // as belonging to a particular type. 
// // Much of this code should look very familiar- it's cribbed from // the code for LEARN // long i, k; char ptext[MAX_PATTERN]; // the regex pattern long plen; // the hash file names long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN; // the match statistics variable char stext [MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)]; long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100); long slen; char svrbl[MAX_PATTERN]; // the match statistics text buffer long svlen; long fnameoffset; long use_unique; long not_microgroom = 1; long use_unigram_features; long next_offset; // UNUSED for now! struct stat statbuf; // for statting the hash file regex_t regcb; // Total hits per statistics file - one hit is nominally equivalent to // compressing away one byte // long totalhits[MAX_CLASSIFIERS]; // // long totalfeatures; // total features double tprob; // total probability in the "success" domain. double ptc[MAX_CLASSIFIERS]; // current running probability of this class // Classifier Coding Clarification- we'll do one file at a time, so // these variables are moved to point to different statistics files // in a loop. char *file_pointer; STATISTICS_FILE_HEADER_STRUCT *file_header; // the FSCM_PREFIX_TABLE_CELL *prefix_table; // the prefix indexing table, unsigned long prefix_table_size; FSCM_HASH_CHAIN_CELL *chains; // the chain area unsigned int *unk_indexes; long fn_start_here; char htext [MAX_PATTERN]; // the text of the names (unparsed) long htextlen; char hfname [MAX_PATTERN]; // the current file name long fnstart, fnlen; char hashfilenames [MAX_CLASSIFIERS][MAX_FILE_NAME_LEN]; // names (parsed) long hashfilebytelens [MAX_CLASSIFIERS]; long hashfilechainentries [MAX_CLASSIFIERS]; long succhash; // how many hashfilenames are "success" files? long vbar_seen; // did we see '|' in classify's args? long maxhash; long bestseen; double scores [MAX_CLASSIFIERS]; // per-classifier raw score. 
int *ca = NULL; long pipelen = 0; long pipe_iters = 0; // We'll generate our unknown string's hashes directly into tempbuf. long unk_hashcount; unsigned *unk_hashes; unk_hashes = (unsigned *) tempbuf; if (internal_trace) fprintf (stderr, "executing a Fast Substring Compression CLASSIFY\n"); // extract the hash file names crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len); htextlen = apb->p1len; htextlen = crm_nexpandvar (htext, htextlen, htext_maxlen); // extract the "this is a compressible character" regex. // Note that by and large this is not used! // crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len); plen = apb->s1len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); // extract the optional "match statistics" variable // crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len); svlen = apb->p2len; svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN); { long vstart, vlen; crm_nextword (svrbl, svlen, 0, &vstart, &vlen); memmove (svrbl, &svrbl[vstart], vlen); svlen = vlen; svrbl[vlen] = '\000'; }; if (user_trace) fprintf (stderr, "Status out var %s (len %ld)\n", svrbl, svlen); // status variable's text (used for output stats) // stext[0] = '\000'; slen = 0; // set flags not_microgroom = 1; if (apb->sflags & CRM_MICROGROOM) { not_microgroom = 0; if (user_trace) fprintf (stderr, " disabling fast-skip optimization.\n"); }; use_unique = 0; if (apb->sflags & CRM_UNIQUE) { use_unique = 1; if (user_trace) fprintf (stderr, " unique engaged - repeated features are ignored \n"); }; use_unigram_features = 0; if (apb->sflags & CRM_UNIGRAM) { use_unigram_features = 1; if (user_trace) fprintf (stderr, " using only unigram features. \n"); }; // Create our hashes; we do this once outside the loop and // thus save time inside the loop. 
unk_hashcount = 0; next_offset = 0; crm_vector_tokenize_selector (apb, // the APB txtptr, // intput string txtstart, // starting offset txtlen, // how many bytes ptext, // parser regex plen, // parser regex len ca, // tokenizer coeff array pipelen, // tokenizer pipeline len pipe_iters, // tokenizer pipeline iterations unk_hashes, // where to put the hashed results data_window_size / sizeof(unsigned), // max number of hashes &unk_hashcount, // how many hashes we actually got &next_offset); // where to start again for more hashes if (internal_trace) { fprintf (stderr, "C.Total %ld hashes - first 16 values:\n" "%u %u %u %u %u %u %u %u\n", unk_hashcount, unk_hashes[0], unk_hashes[1], unk_hashes[2], unk_hashes[3], unk_hashes[4], unk_hashes[5], unk_hashes[6], unk_hashes[7]); fprintf (stderr, "%u %u %u %u %u %u %u %u\n", unk_hashes[8], unk_hashes[9], unk_hashes[10], unk_hashes[11], unk_hashes[12], unk_hashes[13], unk_hashes[14], unk_hashes[15]); }; if (user_trace) fprintf (stderr, "Total of %lu initial features.\n", unk_hashcount); unk_indexes = (unsigned int *) calloc (unk_hashcount+1, sizeof (unsigned int)); // Now, we parse the filenames and do a mmap/match/munmap loop // on each file. The resulting number of hits is stored in the // the loop to open the files. vbar_seen = 0; maxhash = 0; succhash = 0; fnameoffset = 0; // now, get the file names and mmap each file // get the file name (grody and non-8-bit-safe, but doesn't matter // because the result is used for open() and nothing else. // GROT GROT GROT this isn't NULL-clean on filenames. But then // again, stdio.h itself isn't NULL-clean on filenames. 
if (user_trace) fprintf (stderr, "Classify list: -%s- \n", htext); fn_start_here = 0; fnlen = 1; while ( fnlen > 0 && ((maxhash < MAX_CLASSIFIERS-1))) { crm_nextword (htext, htextlen, fn_start_here, &fnstart, &fnlen); if (fnlen > 0) { strncpy (hfname, &htext[fnstart], fnlen); fn_start_here = fnstart + fnlen + 1; hfname[fnlen] = '\000'; strncpy (hashfilenames[maxhash], hfname, fnlen); hashfilenames[maxhash][fnlen] = '\000'; if (user_trace) fprintf (stderr, "Classifying with file -%s- succhash=%ld, maxhash=%ld\n", hashfilenames[maxhash], succhash, maxhash); if ( hfname[0] == '|' && hfname[1] == '\000') { if (vbar_seen) { nonfatalerror5 ("Only one ' | ' allowed in a CLASSIFY. \n" , "We'll ignore it for now.", CRM_ENGINE_HERE); } else { succhash = maxhash; }; vbar_seen ++; } else { // be sure the file exists // stat the file to get it's length k = stat (hfname, &statbuf); // quick check- does the file even exist? if (k != 0) { nonfatalerror5 ("Nonexistent Classify table named: ", hfname, CRM_ENGINE_HERE); } else { // file exists - do the open/process/close // hashfilebytelens[maxhash] = statbuf.st_size; // mmap the hash file into memory so we can bitwhack it file_pointer = crm_mmap_file (hfname, 0, hashfilebytelens[maxhash], PROT_READ, MAP_SHARED, NULL); if (file_pointer == MAP_FAILED ) { nonfatalerror5 ("Couldn't memory-map the table file :", hfname, CRM_ENGINE_HERE); } else { // GROT GROT GROT // GROT Actually implement this someday!!! 
// Check to see if this file is the right version // GROT GROT GROT // set up our pointers for the prefix table and // the chains file_header = (STATISTICS_FILE_HEADER_STRUCT *) file_pointer; if (internal_trace) fprintf (stderr, "Prefix table at %lu, chains at %lu\n", (long unsigned) file_header->chunks[3].start, (long unsigned) file_header->chunks[4].start); prefix_table = (FSCM_PREFIX_TABLE_CELL *) &file_pointer[file_header->chunks[3].start]; #if 0 { FSCM_HEADER *f = (FSCM_HEADER *)(file_header + 1); prefix_table_size = f->prefix_hash_table_length; } #else prefix_table_size = file_header->chunks[3].length / sizeof (FSCM_PREFIX_TABLE_CELL); #endif chains = (FSCM_HASH_CHAIN_CELL *) &file_pointer[file_header->chunks[4].start]; // GROT GROT GROT pointer arithmetic is gross!!! hashfilechainentries[maxhash] = file_header->chunks[4].length / sizeof (FSCM_HASH_CHAIN_CELL); if (internal_trace) fprintf (stderr, " Prefix table size = %ld\n", prefix_table_size); // initialize the index vector to the chain starts // (some of which are NULL). for (i = 0; i < unk_hashcount; i++) { unsigned int uhmpts; uhmpts = unk_hashes[i] % prefix_table_size; unk_indexes[i] = (unsigned int) prefix_table [uhmpts].index; if (internal_trace) fprintf (stderr, "unk_hashes[%ld] = %u, index = %u, " " prefix_table[%u] = %u \n", i, unk_hashes[i], uhmpts, uhmpts, prefix_table[uhmpts].index); }; // Now for the nitty-gritty - run the compression // of the unknown versus tis statistics file. // For thk=0.1, power of 1.2 --> 36 errs, // 1.5--> 49 errs, 1.7-->52, and 1.0 bogged down // At thk=0.0 exponent 1.0-->191 and 18 min // thk 0.1 exp 1.35 --> 34 in 12min. and exp 1.1 -> 43 // thk 0.05 exp 1.1--> 50. scores [maxhash] = compress_me (unk_indexes, unk_hashcount, chains, (double) 1.35); }; maxhash++; }; }; if (maxhash > MAX_CLASSIFIERS-1) nonfatalerror5 ("Too many classifier files.", "Some may have been disregarded", CRM_ENGINE_HERE); }; }; // // If there is no '|', then all files are "success" files. 
if (succhash == 0) succhash = maxhash; if (user_trace) fprintf (stderr, "Running with %ld files for success out of %ld files\n", succhash, maxhash ); // sanity checks... Uncomment for super-strict CLASSIFY. // // do we have at least 1 valid .css files? if (maxhash == 0) { nonfatalerror5 ("Couldn't open at least one .css files for classify().", "", CRM_ENGINE_HERE); }; // do we have at least 1 valid .css file at both sides of '|'? // if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2)) // { // nonfatalerror ( // "Couldn't open at least 1 .css file per SUCC | FAIL category " // " for classify().\n","Hope you know what are you doing."); // }; /////////////////////////////////////////////////////////// // // To translate score (which is exponentiated compression) we // just normalize to a sum of 1.000 . Note that we start off // with a minimum per-class score of "tiny" to avoid divide-by-zero // problems (zero scores on everything => divide by zero) tprob = 0.0; for (i = 0; i < MAX_CLASSIFIERS; i++) ptc[i] = 0.0; for (i = 0; i < maxhash; i++) { ptc[i] = scores [i] ; if (ptc[i] < 0.0001) ptc[i] = 0.0001; tprob = tprob + ptc[i]; }; // Renormalize probabilities for (i = 0; i < maxhash; i++) ptc[i] = ptc[i] / tprob; if (user_trace) { for (k = 0; k < maxhash; k++) fprintf (stderr, "Match for file %ld: compress: %f prob: %f\n", k, scores[k], ptc[k]); }; bestseen = 0; for (i = 0; i < maxhash; i++) if (ptc[i] > ptc[bestseen]) bestseen = i; // Reset tprob to contain sum of probabilities of success classes. 
tprob = 0.0; for (k = 0; k < succhash; k++) tprob = tprob + ptc[k]; if (svlen > 0) { char buf[1024]; double accumulator; double remainder; double overall_pR; long m; buf [0] = '\000'; accumulator = 1000 * DBL_MIN; for (m = 0; m < succhash; m++) { accumulator = accumulator + ptc[m]; }; remainder = 1000 * DBL_MIN; for (m = succhash; m < maxhash; m++) { remainder = remainder + ptc[m]; }; if (internal_trace) fprintf (stderr, "succ: %ld, max: %ld, acc: %lf, rem: %lf\n", succhash, maxhash, accumulator, remainder); // constant "200" below determined empirically for SSTTT at 10 pR's // (used to be 10) overall_pR = 200 * (log10 (accumulator) - log10(remainder)); // note also that strcat _accumulates_ in stext. // There would be a possible buffer overflow except that _we_ control // what gets written here. So it's no biggie. if (tprob > 0.5000) { sprintf (buf, "CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); } else { sprintf (buf, "CLASSIFY fails; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); }; if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); remainder = 1000 * DBL_MIN; for (m = 0; m < maxhash; m++) if (bestseen != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "Best match to file #%ld (%s) prob: %6.4f pR: %6.4f \n", bestseen, hashfilenames[bestseen], ptc [bestseen ], // "200" is for SSTTT, was 10 200 * (log10 (ptc [bestseen]) - log10 ( remainder ) ) ); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); sprintf (buf, "Total features in input file: %ld\n", unk_hashcount); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); for (k = 0; k < maxhash; k++) { long m; remainder = 1000 * DBL_MIN; for (m = 0; m < maxhash; m++) if (k != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "#%ld (%s):" " features: %ld, chcs: %6.2f, prob: %3.2e, pR: %6.2f \n", k, hashfilenames[k], hashfilechainentries[k], scores[k], ptc[k], 200 * (log10 (ptc[k]) - log10 (remainder) ) ); 
// strcat (stext, buf); if (strlen(stext)+strlen(buf) <= stext_maxlen) strcat (stext, buf); }; // check here if we got enough room in stext to stuff everything // perhaps we'd better rise a nonfatalerror, instead of just // whining on stderr if (strcmp(&(stext[strlen(stext)-strlen(buf)]), buf) != 0) { nonfatalerror5( "WARNING: not enough room in the buffer to create " "the statistics text. Perhaps you could try bigger " "values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?", " ", CRM_ENGINE_HERE); }; crm_destructive_alter_nvariable (svrbl, svlen, stext, strlen (stext)); }; // cleanup time! // and let go of the regex buffery if (ptext[0] != '\0') crm_regfree (®cb); if (tprob > 0.5000) { // all done... if we got here, we should just continue execution if (user_trace) fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n"); } else { if (user_trace) fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n"); // and do what we do for a FAIL here csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1; return (0); }; // // regcomp_failed: return (0); }; crm114-20100106-BlameMichelson.src/approxtest.crm0000755000000000017500000000153711321154266017722 0ustar rootwsy#! /usr/bin/crm # # approxtest.crm - test approximate regex matching, # usually reading from approxtest-args.txt # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window output / Start of approximate match testing. :*:_nl:/ isolate (:words:) /molasses anaconda foo bar baz agentsmith mranderson / output /:*:_nl:Input string to match against: ":*:words:":*:_nl::*:_nl:/ isolate (:patterns:) input (:patterns:) isolate (:p:) /bleah/ { window (:p:) (:patterns:) /.*/ /:*:_nl:/ match (:z: :pa:) [:p:] /(.+)./ output /:*:pa:/ { { { match (:out:) [:words:] /:*:pa:/ output / ---> :*:out:/ } alius { output / / } } trap (:zeta:) /.*/ { output / / } } output /:*:_nl:/ liaf } output / end of approximate match testing. 
:*:_nl:/ crm114-20100106-BlameMichelson.src/crm114-mode.el0000644000000000017500000001741511321154266017260 0ustar rootwsy;;; crm114-mode.el --- major mode for editing crm114 scripts ;; Copyright (C) 2005 Haavard Kvaalen ;; This file is under GPLv3, as described in COPYING. ;; ;; Keywords: languages ;; ;; To automatically invoke this mode whenever you edit a .crm file, ;; make sure crm114-mode.el is in your site's default .el directory, ;; and add the following to your .emacs file in your home directory ;; without the ';;' commenting in front, of course! ;; ;; (load "crm114-mode.el" ) ;; (add-to-list 'auto-mode-alist '("\\.crm\\'" . crm114-mode)) ;; ;; $Revision: 1.7 $ (defvar crm114-indent-level 4 "Indentation of blocks") (defvar crm114-mode-syntax-table (let ((table (make-syntax-table))) (modify-syntax-entry ?# "< 4" table) (modify-syntax-entry ?\n ">" table) (modify-syntax-entry ?/ "\"" table) (modify-syntax-entry ?\\ "\\ 3" table) (modify-syntax-entry ?< "(>" table) (modify-syntax-entry ?> ")<" table) (modify-syntax-entry ?: "." table) (modify-syntax-entry ?! "_" table) (modify-syntax-entry ?\" "_" table) (modify-syntax-entry ?$ "_" table) (modify-syntax-entry ?% "_" table) (modify-syntax-entry ?& "_" table) (modify-syntax-entry ?' "_" table) (modify-syntax-entry ?* "_" table) (modify-syntax-entry ?+ "_" table) (modify-syntax-entry ?, "_" table) (modify-syntax-entry ?- "_" table) (modify-syntax-entry ?. "_" table) (modify-syntax-entry ?= "_" table) (modify-syntax-entry ?? 
"_" table) (modify-syntax-entry ?@ "_" table) (modify-syntax-entry ?^ "_" table) (modify-syntax-entry ?_ "_" table) (modify-syntax-entry ?` "_" table) (modify-syntax-entry ?| "_" table) (modify-syntax-entry ?~ "_" table) table)) (defvar crm114-mode-map (let ((map (make-sparse-keymap))) (define-key map "\t" 'crm114-indent-line) (define-key map "}" 'crm114-electric-brace) map)) (defvar crm114-font-lock-keywords (list ;; goto labels '("\\(^\\|;\\)\\s-*:\\(\\(:?\\sw\\|\\s_\\)+\\):\\s-*\\($\\|;\\|#\\)" 2 'font-lock-constant-face nil) ;; functions '("\\(^\\|;\\)\\s-*:\\(\\(:?\\sw\\|\\s_\\)+\\):\\s-*(" 2 'font-lock-function-name-face nil) ;; variables '("\\(:\\*\\)?:\\([^: \t\n]+\\):" 2 'font-lock-variable-name-face nil) ;; statements (list (concat "\\(^\\|;\\)\\s-*" (regexp-opt '("accept" "alius" "alter" "call" "classify" "eval" "exit" "fail" "fault" "goto" "hash" "input" "insert" "intersect" "isolate" "learn" "liaf" "match" "noop" "output" "return" "syscall" "trap" "union" "window") 'words)) 2 'font-lock-keyword-face nil) )) (defvar crm114-font-lock-syntactic-keywords (list ;; '#' and '/' are allowed within variable names so we need to ;; change their syntax at those places. '(":[#@]:[^ \t\n]*:\\|:[^: \t\n]*:" ("#\\|/" (progn (goto-char (match-beginning 0)) (setq crm114-end-syntactic (match-end 0))) (goto-char crm114-end-syntactic) (0 "_"))))) (defun crm114-end-of-line () (save-excursion (end-of-line) (point))) (defun crm114-calculate-indent () "Calculate and return indentation for the current line." 
(save-excursion (beginning-of-line) (let (ret cont (n 0) (bol (point))) (while (looking-at "[ \t]*}") (goto-char (match-end 0)) (setq n (1- n))) ;; Find last line that is not empty and is not all comment (while (and (or (= (forward-line -1) 0) (progn (setq ret 0) nil)) (looking-at "[ \t]*\\(#\\|$\\)"))) (crm114-beginning-of-syntax) (or ret (let ((indent (current-indentation)) (end (crm114-end-of-line)) (search-string (concat "\\(#\\)\\|\\(/\\)\\|\\(:\\)\\|" "\\({\\)\\|\\(}\\)\\|\\(\\\\\\)?$"))) ;; Leading closing brace affects previous line (while (looking-at "[ \t]*}") (goto-char (match-end 0))) (while (and (re-search-forward search-string end t) (cond ((match-beginning 1) ; comment (re-search-forward "\\\\#" end t)) ((match-beginning 2) ; string ;; The extra paranthesis are here to work ;; around what seems to be a bug seen on ;; xemacs 21.4.6 (on debian) (while (and (not (and (looking-at "/\\|\\(.*?[^\\\n]\\)/") (goto-char (match-end 0)))) (forward-line 1) (setq end (crm114-end-of-line)) (or (< (point) bol) (not (setq ret 0))))) t) ((match-beginning 3) ; variable (goto-char (match-beginning 3)) (crm114-skip-variable end)) ((match-beginning 4) ; opening brace (setq n (1+ n))) ((match-beginning 5) ; closing brace (setq n (1- n))) (t ; eol (setq cont (match-beginning 6)) (forward-line 1) (setq end (crm114-end-of-line)))) (< (point) bol) (not ret))) (or ret (progn (when cont (setq n (+ n 2))) (+ indent (* n crm114-indent-level))))))))) (defun crm114-indent-line () (interactive) (let (beg (pos (- (point-max) (point))) (indent (crm114-calculate-indent))) (beginning-of-line) (setq beg (point)) (skip-chars-forward " \t") (delete-region beg (point)) (indent-to (crm114-calculate-indent)) (if (> (- (point-max) pos) (point)) (goto-char (- (point-max) pos))))) (defun crm114-electric-brace (arg) (interactive "p") (if (> arg 0) (progn (insert-char last-command-char arg) (crm114-indent-line) (delete-char (- arg)) (self-insert-command arg)))) (defun crm114-skip-variable (max) 
(unless (looking-at ":") (error "Not at variable start")) (if (looking-at ":[#@]:\\([^ \t\n]*\\)\\(:\\)") (progn (goto-char (match-beginning 1)) (let ((last (match-beginning 2)) (end (match-end 2))) (when (re-search-forward ":" last t) (goto-char (match-beginning 0)) (crm114-skip-variable last)) (goto-char end))) (if (looking-at ":\\*:[^ \t\n:]*:") (goto-char (match-end 0)) (goto-char (1+ (point))) (re-search-forward ":" max 'to-end)))) (defun crm114-beginning-of-syntax () "Go backwards until the start of the current statement." (beginning-of-line) (when (> (- (point) (point-min)) 1) (let ((pos (point))) (goto-char (- (point) 2)) (if (looking-at "\\\\$") (progn (beginning-of-line) (while (and (or (re-search-forward "\\(#\\)\\|\\(/\\)\\|\\(:\\)" pos t) (progn (crm114-beginning-of-syntax) nil)) (cond ((match-beginning 1) (if (re-search-forward "\\\\#" pos t) t (goto-char pos) nil)) ((match-beginning 2) (if (looking-at "/\\|.*?[^\\\n]/") (goto-char (match-end 0)) (crm114-beginning-of-syntax) nil)) ((match-beginning 3) (goto-char (match-beginning 3)) (crm114-skip-variable pos) t))))) (goto-char pos))))) (defun crm114-mode () "Major mode for editing crm scripts. CRM114, also known as The Controllable Regex Mutilator is a language designed for implementation of contextual filters. Turning on crm114 mode runs `crm114-mode-hook'." (interactive) (kill-all-local-variables) (setq mode-name "crm114") (setq major-mode 'crm114-mode) (use-local-map crm114-mode-map) (set-syntax-table crm114-mode-syntax-table) (make-local-variable 'indent-line-function) (setq indent-line-function 'crm114-indent-line) (setq font-lock-defaults '(crm114-font-lock-keywords nil nil nil crm114-beginning-of-syntax (font-lock-syntactic-keywords . 
crm114-font-lock-syntactic-keywords))) (make-local-variable 'comment-start) (setq comment-start "# ") (make-local-variable 'comment-end) (setq comment-end "") (make-local-variable 'comment-start-skip) (setq comment-start-skip "#+[ \t]*") (run-hooks 'crm114-mode-hook)) (provide 'crm114-mode) crm114-20100106-BlameMichelson.src/crm_expr_translate.c0000644000000000017500000001334311321154266021041 0ustar rootwsy// crm_expr_translate.c - translate routine // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *tempbuf; // And the translate routine. We use strntrn to do the hard work; // this code here is just glue code. // int crm_expr_translate (CSL_CELL *csl, ARGPARSE_BLOCK *apb) { long strntrn_flags; char destination[MAX_VARNAME]; long destination_len, dst_nstart; // for source, we use tempbuf long vmidx; char *mdwptr; long offset; long len, retlen; char errstr[MAX_PATTERN]; long i; // the "from" charset char fromset[MAX_PATTERN]; long fromset_len; // the "to" charset char toset[MAX_PATTERN]; long toset_len; // strntrn_flags = 0; // Go through the flags // // UNIQUE flag set? 
// if (apb->sflags & CRM_UNIQUE) { if (user_trace) fprintf (stderr, " uniquing flag turned on...\n"); strntrn_flags = strntrn_flags | CRM_UNIQUE; }; // // How about the LITERAL flag if (apb->sflags & CRM_LITERAL) { if (user_trace) fprintf (stderr, " literal (no invert or ranges) turned on...\n"); strntrn_flags = strntrn_flags | CRM_LITERAL ; }; // Get the destination for the translation // crm_get_pgm_arg (destination, MAX_VARNAME, apb->p1start, apb->p1len); destination_len = crm_nexpandvar (destination, apb->p1len, MAX_VARNAME); //if (destination_len == 0) // { // strcpy (destination, ":_dw:"); // destination_len = 5; // }; if (internal_trace) fprintf (stderr, " destination: ***%s*** len=%ld\n", destination, destination_len); crm_nextword (destination, destination_len, 0, &dst_nstart, &destination_len); if (destination_len < 3) { strcpy (destination, ":_dw:"); destination_len = 5; } // here's where we look for a [] var-restriction source // // Experimentally, we're adding [ :foo: 123 456 ] to // allow an externally specified start and length. crm_get_pgm_arg (tempbuf, data_window_size, apb->b1start, apb->b1len); // Use crm_restrictvar to get start & length to look at. i = crm_restrictvar(tempbuf, apb->b1len, &vmidx, &mdwptr, &offset, &len, errstr); if (internal_trace) fprintf (stderr, "restriction out: vmidx: %ld mdw: %ld start: %ld len: %ld\n", vmidx, (long) mdwptr, offset, len); if ( i < 0) { long curstmt; curstmt = csl->cstmt; if (i == -1) nonfatalerror5 (errstr, "", CRM_ENGINE_HERE); if (i == -2) fatalerror5 (errstr, "", CRM_ENGINE_HERE); // // did the FAULT handler change the next statement to execute? // If so, continue from there, otherwise, we FAIL. if (curstmt == csl->cstmt) { csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = -1; }; goto nonfatal_route_outwards; }; // No problems then. 
We can just memmove the result into tempbuf memmove (tempbuf, &mdwptr[offset], len); // get the FROM charset out of the first // slashes crm_get_pgm_arg (fromset, MAX_PATTERN, apb->s1start, apb->s1len); if (internal_trace) fprintf (stderr, " FROM-charset: =%s=\n", fromset ); // if not LITERAL, then expand them as well fromset_len = apb->s1len; if ( ! (strntrn_flags & CRM_LITERAL)) fromset_len = crm_nexpandvar (fromset, apb->s1len, MAX_PATTERN); if (user_trace) fprintf (stderr, " from-charset expands to =%s= len %ld \n", fromset, fromset_len); // get the TO charset out of the second // slashes crm_get_pgm_arg (toset, MAX_PATTERN, apb->s2start, apb->s2len); if (internal_trace) fprintf (stderr, " TO-charset: =%s=\n", toset ); // if not LITERAL, then expand them as well toset_len = apb->s2len; if ( ! (strntrn_flags & CRM_LITERAL)) toset_len = crm_nexpandvar (toset, apb->s2len, MAX_PATTERN); if (user_trace) fprintf (stderr, " to-charset expands to =%s= len %ld\n", toset, toset_len); // We have it all now - the [expanded] input in tempbuf, the // from-charset, the to-charset, and the flags. We can now // make the big call to strntrn and get the new (in-place) string. retlen = strntrn ((unsigned char *)tempbuf, &len, data_window_size, (unsigned char *)fromset, fromset_len, (unsigned char *)toset, toset_len, strntrn_flags); if (retlen < 0) { nonfatalerror5 ("Messy problem in TRANSLATE.", "Try again with -t tracing maybe?", CRM_ENGINE_HERE); goto nonfatal_route_outwards; }; // // OK, we have final result and a valid length. Now push that // back into the destination. //tempbuf[retlen] = '\0'; //if (user_trace) // fprintf (stderr, "Result of TRANSLATE: %s len %ld\n", // tempbuf, retlen); if (user_trace) { long i; fprintf (stderr, "Result of TRANSLATE: -"); for(i=0;i<--and this, with nothing in between. 
--- the next thing you see SHOULD be an error "unable to read-open"--- Caught the error - fault text was ./crm114: *ERROR* For some reason, I was unable to read-open the file named zoob.zipulrlfjf Sorry, but this program is very sick and probably should be killed off. This happened at line 25 of file fataltraptest.crm (runtime system location: crm_expr_file_io.c(135) in routine: crm_expr_input) --- and again, the next thing you see SHOULD be an error "unable to write-open"--- Caught the error - fault text was ./crm114: *ERROR* For some reason, I was unable to write-open the file named /No/Such/Directory/frotz.mumble Sorry, but this program is very sick and probably should be killed off. This happened at line 35 of file fataltraptest.crm (runtime system location: crm_expr_file_io.c(329) in routine: crm_expr_output) Start of insert processor testing start here ... the first middle bit... the really middle-middle bit ... the last middle bit... and the last bit of normal text now we test nonexistent file overrides this should not cause a FATAL ERROR, because we trap it. We caught the nonexistent file fault here. Message was: <> Couldn't insert the file named 'there_is_no_such_file_as_this.txt' that you asked for. This is probably a bad thing. <> via the TRAP End of insert testing. CRM114 testing match functionality Testing exact match on foo... found exact match on 'foo' Testing exact match on foo... Testing absent match on foo... match says "no foo found". Testing absent match on foo... Testing nocase match on foo... found a nocase match on 'fOo' Testing nocase match on foo... Testing nocase absent match on foo... match says "no foo found". Testing nocase absent match on foo... Testing multiline match on foo... found an allowed multiline match on ' multiline: this is a multiline test of foo multiline-- should see both lines ZZZ' Testing multiline match on foo... Testing nomultiline match on foo... 
found a nomultiline match on 'nomultiline: this is a nomultiline test of foo' Testing nomultiline match on foo... Testing fromendchar match for foo then bar...... found the foo...then bar Testing fromendchar match for foo then bar...... found the foo... Testing fromnext match for f+oo+.. first 'foo'.. found one on 'ffoooo' Testing fromnext match for f+oo+.. first 'foo'.. Testing newend match for f+oo+.. first 'foo'.. found one on 'ffooo' Testing newend match for f+oo+.. first 'foo'.. Testing indirect goto ":twist:" and ":shout:".... got :twist:... got to TWIST. Testing indirect goto ":twist:" and ":shout:".... got :shout2:... got to SHOUT. Testing self-supplied-match ... found '123' ... found 'smith 123 anderson ZZZ' Testing self-supplied-match ... found '123' ... Testing indep start/end ... found 'foo bar 1 2 foo' ... found '1' ... found 'bar 1 2 foo bar'... found '2' Testing indep start/end ... found 'foo bar foo' ... Testing indep start/end ... found 'foo 1 foo' ... found '1' ... found 'bar 2 bar'... found '2' Testing indep start/end ... found 'foo 2 bar 1 bar foo' ... found '1' ... found 'bar 1 bar'... 
Testing box region control got 10-17 as this: correct (should be 'correct', no angles) Testing box region control, part 2 got 10-17 as this: rrec (should be 'rrec') Testing box region control, isolated variable got 10-17 as this: rrec (should be 'rrec') Fromend match: esult (should be 'esult') Testing box region indexed control got this: 'correct' (should be 'correct', no angles) Testing box region regex control got this: '>correct<' (should be '>correct<', with angles) Testing box region failing regex control got this: '' (should be '', the empty string) Testing box region regex/index control got this: 'correct' (should be 'correct', no angles) Testing box region index/regex control got this: '>correct<' (should be '>correct<', with angles) Testing box region index/index control got this: 'correct' (should be 'correct', no angles) Testing box region regex with spaces control got this: '> correct <' (should be '> correct <', with angles) Testing box region regex with spaces control got this: 'correct' (should be 'correct', no angles) testing versus binding (the '67' bug) The next stmt --- SHOULD--- get an error message! Caught the error, fault text was: ./crm114: *WARNING* This program specifies an 'absent' match, and also tries to bind variables that, by the above, aren't matched! We'll ignore these variable bindings for now. I'll try to keep working. This happened at line 298 of file matchtest.crm (runtime system location: crm_expr_match.c(413) in routine: crm_expr_match) ------------------END OF TESTING---------------------- searching for a foo...found a foo, searching for bar.ooo... no bar.ooo in front of the foo. searching for a foo...found a foo, searching for bar.ooo... no bar.ooo in front of the foo. 
CRM114 testing ALTERation and copying ----- Whole file ----- 01abcdefghijkl89---------------------- 1: abcde, 2: cdefg, 3: fghij z: cdefg ---altering the z copy ----- 1: abcde, 2: cdefg, 3: fghij z: CDEEEFG ----- Whole file ----- 01abcdefghijkl89 ---altering the original ----- 1: abCDEEE, 2: CDEEEFG, 3: FGhij z: CDEEEFG ----- Whole file ----- 01abCDEEEFGhijkl89 ---altering the original again----- 1: abCD, 2: CDFG, 3: FGhij z: CDEEEFG ----- Whole file ----- 01abCDFGhijkl89---------------------- ----- Rematched Whole file ----- 01abCDFGhijkl89 ---------------------- ------putting things back----- ----- Whole file ----- 01abcdefghijkl89 ---------------------- ---------------------- :q: = cdefg ISOLATEing :q: - this should copy :q: = cdefg ALTERing :q: to ZZZZZ :q: = ZZZZZ ----- Whole file ----- 01abcdefghijkl89 ---------------------- Checking initialization of isolation AAAAA --- CRM114 testing string rewrites ------------------------ abc frobnitz_singleline this should trigger def frobnitz_multiline zebra giraffe and so should this. testpattern mno ------------------------ abc single-line rewrite ran OK. def multi-line rewrite ran OK. mno CRM114 testing that start / length works in matches DEBUG: whitelist==<> DEBUG: matched==<> DEBUG: matched==<> DEBUG: matched==<> DEBUG: matched==<> DEBUG: matched==<> DEBUG: matched==<> DEBUG: matched==<> DEBUG: [TheEnd] CRM114 testing outcall and status codes ----- keeping a process around ---- preparing... OK_IF_PID_CHANGES: one... MINION PROC PID: 20682 from-pipe: 6 to-pipe: 5 OK_IF_PID_SAME_AS_ABOVE: again... 
MINION PROC PID: 20682 from-pipe: 6 to-pipe: 5 and done ...DEAD MINION, EXIT CODE: 0 exit value test got DEAD MINION, EXIT CODE: 123 CRM114: Testing union and intersection We start with this: ' a b c d e f g h i j k l m n o p q r s t u v w x y z ' intersection of abc and cde is t1: 'c' union of l thru s is t2: 'l m n o p q r s' intersection of abc and t2 is t3: '' union of zulu and t1 is t4: 'c d e f g h i j k l m n o p q r s t u v w x y z' CRM114: test syscall 'printf beep' CRM114: testing default command line args blah 1 = :blah: blah 2 = new value CRM114: testing default command line args blah 1 = command override blah 2 = command override CRM114: testing windowing on windows and variables test one- input by chars, delimited by 'A' Got: This is the test one result A test two- input by EOFs, delimited by 'A' Got: this is the test two result A test three- window an isolated var by chars, delimited by 'A' Got: this is the test three result A test four- isolated var, input by EOFs, delimited by 'A' Got: this is the test four result A and lastly- did the data window stay constant? this is the test two result A CRM114: testing windowing on windows from a variable Input stuff is : This is the test one result A this is the test two result A this is the test three result A this is the test four result A this is the test 5 result A this is the test six result A this is extra stuff and should trigger exit from the loop since it doesn't have the proper delimiter. testing delimited by 'A' Data Window: data window test text Windowed value = This is the test one result A Remaining stuff = this is the test two result A this is the test three result A this is the test four result A this is the test 5 result A this is the test six result A this is extra stuff and should trigger exit from the loop since it doesn't have the proper delimiter. 
Data Window: data window test text Windowed value = this is the test two result A Remaining stuff = this is the test three result A this is the test four result A this is the test 5 result A this is the test six result A this is extra stuff and should trigger exit from the loop since it doesn't have the proper delimiter. Data Window: data window test text Windowed value = this is the test three result A Remaining stuff = this is the test four result A this is the test 5 result A this is the test six result A this is extra stuff and should trigger exit from the loop since it doesn't have the proper delimiter. Data Window: data window test text Windowed value = this is the test four result A Remaining stuff = this is the test 5 result A this is the test six result A this is extra stuff and should trigger exit from the loop since it doesn't have the proper delimiter. Data Window: data window test text Windowed value = this is the test 5 result A Remaining stuff = this is the test six result A this is extra stuff and should trigger exit from the loop since it doesn't have the proper delimiter. Data Window: data window test text Windowed value = this is the test six result A Remaining stuff = this is extra stuff and should trigger exit from the loop since it doesn't have the proper delimiter. Bounced out of the WINDOW loop -- no further stuff End of window-from-variable testing Start of approximate match testing. 
Input string to match against: "molasses anaconda foo bar baz agentsmith mranderson " (foo) {1} ---> foo (fou){~} ---> foo (foo) {1} ---> foo (fou){~0} (foo) {1} ---> foo (fou){~1} ---> foo (foo) {1} ---> foo (fou){~2} ---> foo (fou){~3} ---> foo (fuu){~} ---> foo (fuu){~0} (fuu){~1} (fuu){~2} ---> foo (fuu){~3} ---> foo (fou){#} ---> foo (fou){#0} (fou){#1} ---> foo (fou){#2} ---> foo (fou){#3} ---> foo (fou){# ~1} ---> foo (fou){#0 ~1} (fou){#1 ~1} ---> foo (fou){#2 ~1} ---> foo (fou){#3 ~1} ---> foo (fuu){#} ---> foo (fuu){#0} (fuu){#1} (fuu){#2} ---> foo (fuu){#3} ---> foo (fuu){# ~1} (fuu){#0 ~1} (fuu){#1 ~1} (fuu){#2 ~1} (fuu){#3 ~1} (fuu){# ~2} ---> foo (fuu){#0 ~2} (fuu){#1 ~2} (fuu){#2 ~2} ---> foo (fuu){#3 ~2} ---> foo (fuu){# ~3} ---> foo (fuu){#0 ~3} (fuu){#1 ~3} (fuu){#2 ~3} ---> foo (fuu){#3 ~3} ---> foo (fuu){# ~} ---> foo (fuu){#0 ~} (fuu){#1 ~} (fuu){#2 ~} ---> foo (fuu){#3 ~} ---> foo (fou){#} ---> foo (fou){#0} (fou){+1 -1} ---> fo (fou){+2 -2} ---> fo (fou){+3 -3} ---> fo (fou){# ~1} ---> foo (fou){#0 ~1} (fou){+1 -1 ~1} ---> fo (fou){+2 -2 ~1} ---> fo (fou){+3 -3 ~1} ---> fo (fou){# ~2} ---> foo (fou){#0 ~2} (fou){+1 -1 ~2} ---> fo (fou){+2 -2 ~2} ---> fo (fou){+3 -3 ~2} ---> fo (fou){# ~3} ---> foo (fou){#0 ~3} (fou){+1 -1 ~3} ---> fo (fou){+2 -2 ~3} ---> fo (fou){+3 -3 ~3} ---> fo (fou){# ~} ---> foo (fou){#0 ~} (fou){+1 -1 ~} ---> fo (fou){+2 -2 ~} ---> fo (fou){+3 -3 ~} ---> fo (fuu){#} ---> foo (fuu){#0} (fuu){+1 -1} (fuu){+2 -2} ---> f (fuu){+3 -3} ---> f (fuu){# ~1} (fuu){#0 ~1} (fuu){+1 -1 ~1} (fuu){+2 -2 ~1} (fuu){+3 -3 ~1} (fuu){# ~2} ---> foo (fuu){#0 ~2} (fuu){+1 -1 ~2} (fuu){+2 -2 ~2} ---> f (fuu){+3 -3 ~2} ---> f (fuu){# ~3} ---> foo (fuu){#0 ~3} (fuu){+1 -1 ~3} (fuu){+2 -2 ~3} ---> f (fuu){+3 -3 ~3} ---> f (fuu){# ~} ---> foo (fuu){#0 ~} (fuu){+1 -1 ~} (fuu){+2 -2 ~} ---> f (fuu){+3 -3 ~} ---> f (anaconda){~} ---> anaconda (anaonda){ 1i + 1d < 1 } (anaonda){ 1i + 1d < 2 } ---> anaconda (ananda){ 1i + 1d < 1 } (ananda){ 1i + 1d < 
2 } (ananda){ 1i + 1d < 3 } ---> anaconda (ana123conda){ 1i + 1d < 2 } (ana123conda){ 1i + 1d < 3 } (ana123conda){ 1i + 1d < 4 } ---> anaconda (ana123cona){ 1i + 1d < 4 } (ana123cona){ 1i + 1d < 5 } ---> anaconda (ana123coa){ 1i + 1d < 4 } (ana123coa){ 1i + 1d < 5 } ---> anaco (ana123coa){ 1i + 1d < 6 } ---> anaco (ana123ca){ 1i + 1d < 4 } (ana123a){ 1i + 1d < 4 } (ana123a){ 1i + 1d < 3 } (anukeonda){~} ---> anaconda (anaconda){ 1i + 1d < 1} ---> anaconda (anaconda){ 1i + 1d < 1, #1} ---> anaconda (anaconda){ 1i + 1d < 1 #1 ~10 } ---> anaconda (anaconda){ #1, ~1, 1i + 1d < 1 } ---> anaconda (anaconda){ #1 ~1 1i + 1d < 1 } ---> anaconda (anacnda){ #1 ~1 1i + 1d < 1 } (agentsmith){~} ---> agentsmith (annndersen){~} ---> anderson (anentemstn){~} ---> agentsmith (anacda){~} ---> anaconda (anacda){ #1 ~1 1i + 1d < 1 } (znacnda){ #1 ~1 1i + 1d < 1 } (znacnda){ #1 ~2 1i + 1d < 1 } (znacnda){ #1 ~3 1i + 1d < 1 } (znacnda){ #1 ~3 1i + 1d < 2 } ---> anaconda (anac){~1}(onda){~1} ---> anaconda (aac){~1}(onda){~1} ---> naconda (ac){~1}(onda){~1} ---> aconda (anac){~1}(oda){~1} ---> anaconda (aac){~1}(oa){~1} ---> nacon (ac){~1}(oa){~1} ---> acon (anac){~1}(onda){~1} ---> anaconda (anZac){~1}(onZda){~1} ---> anaconda (anZZac){~1}(onZda){~1} (anZac){~1}(onZZda){~1} ([a-n]){3,100} ---> anac ([a-n]){3,100}? ---> ana end of approximate match testing. 
Algebraic math test string value of x is 12345, string value of pi is 3.14159 length of x is 5 , length of pi is 7 string rep of X is shorter than pi matheval of x + pi is 12348.14159 Algebraic matheval of (2*3)+(4*5) is 26 and adding 3.14159 to that is 29.14159 Testing RPN mode math string value of x is 12345, string value of pi is 3.14159 length of x is 5 , length of pi is 7 string rep of X is shorter than pi matheval of x + pi is 12348.14159 RPN matheval of (2*3)+(4*5) is 26 and adding 3.14159 to that is 29.14159 This test checks to see that EVAL properly detects blowups You should see an error message as the next thing, with a large string of Z's. We should get an error here, due to the (intentional!) loop ./crm114: *WARNING* The variable you're attempting to EVAL seems to eval infinitely, and hence I cannot compute it. I did try a lot, though. I got this far before I gave up: :*:a:ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ(...truncated) I'll try to keep working. 
This happened at line 20 of file eval_infiniteloop.crm (runtime system location: crm_expr_alter.c(125) function crm_expr_eval) Original: The quick brown fox jumped over the lazy dog's back delete file then append: alpha bravo charlie prior file, no append: The quick brown fox jumped over the lazy dog's back prior file, and append: The quick brown fox jumped over the lazy dog's back alpha bravo charlie Grab chars 10 thru 25: brown fox jumpe Grab chars 17 and length 12: ox jumped ov Now, change the fox to cat: uick brown cat jumped over the Final check- did it really get written? The quick brown cat jumped over the lazy dog's back alpha bravo charlie isolate :a: as 'hdgdgb aaa hdgdb', match b as /aaa/ a=hdgdgb aaa hdgdb - b=aaa alter :a: as 'x' a=x - b= re-isolate :a: as 'hdgdgb bbb hdgdb' a=hdgdgb bbb hdgdb - b= now match :b: to :a:'s 'bbb' section a=hdgdgb bbb hdgdb - b=bbb now alter :a: to 'x' again a=x - b= re-re-isolate :a: as 'hdgdgb ccc hdgdb' a=hdgdgb ccc hdgdb - b= now match :b: to :a:'s 'ccc' section a=hdgdgb ccc hdgdb - b=ccc now alter :a: to 'x' again a=x - b= Content-Type: text/html; charset=us-ascii Content-Transfer-Encoding: 7bit Content-Type: image/gif; name="clonic.GIF" Content-Transfer-Encoding: base64 Content-ID: Content-Disposition: inline; filename="clonic.GIF" :boundary:=------------000708090009050006030006 :c:=1 Content-Type: text/html; charset=us-ascii Content-Transfer-Encoding: 7bit :c:=2 Content-Type: image/gif; name="clonic.GIF" Content-Transfer-Encoding: base64 Content-ID: Content-Disposition: inline; filename="clonic.GIF" :boundary:=--0123456789 :c:=1 Content-Type: text Content-Transfer-Encoding: 7bit :c:=2 Content-Type: image name="clonic.GIF" Content-Transfer-Encoding: base64 iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii :boundary:=--0123 :c:=1 C-T: txt C-T-E: 7 :c:=2 C-T: img name="clonic.GIF" C-T-E: b64 iiii Match-Isolate reclamation test 1. If this program doesn't error out, the test is passed. 
(we allocate about 80 megs, well past the window size, but in small (400K) chunks that alternately are allocated and become reclaimable) OK_IF_SIZE_CHANGES: Size of isolation at start: 933509 Bytes used per pass: 931467 Passes done: 100 Total flux through isolation: 186293400 OK_IF_SIZE_CHANGES: Final isolation size: 933525 OK_IF_LESS_THAN_100: Total growth: 16 Match-Isolate reclamation test. If this program doesn't error out, the test is passed. (we allocate about 80 megs, well past the window size, but in small (400K) chunks that alternately are allocated and become reclaimable) OK_IF_SIZE_CHANGES: Size of isolation at start: 935934 Bytes used per pass: 931467 Passes done: 10 Total flux through isolation: 18629340 OK_IF_SIZE_CHANGES: Final isolation size: 935948 OK_IF_LESS_THAN_10: Total growth: 14 Bytes used per pass: 931467 Passes done: 20 Total flux through isolation: 37258680 OK_IF_SIZE_CHANGES: Final isolation size: 935953 OK_IF_LESS_THAN_20: Total growth: 19 Bytes used per pass: 931467 Passes done: 30 Total flux through isolation: 55888020 OK_IF_SIZE_CHANGES: Final isolation size: 935953 OK_IF_LESS_THAN_30: Total growth: 19 Bytes used per pass: 931467 Passes done: 40 Total flux through isolation: 74517360 OK_IF_SIZE_CHANGES: Final isolation size: 935953 OK_IF_LESS_THAN_40: Total growth: 19 Starting Testing local and forking call and returns. 
(level 0) The foo (level 1) Middle (level 0) The bar was > a b c d e < (level 1) Got back >z y x w v< End (level 0) Doing factorial with a mutating argument arglist return factorial call entry, args = 5 1 (level 1) arglist return factorial call entry, args = 4 5 (level 2) arglist return factorial call entry, args = 3 20 (level 3) arglist return factorial call entry, args = 2 60 (level 4) arglist return factorial call entry, args = 1 120 (level 5) Recursion bottomed out, returning :out: = 120 Call return, :out: = 120 ( level 4) Call return, :out: = 120 ( level 3) Call return, :out: = 120 ( level 2) Call return, :out: = 120 ( level 1) 5 factorial is 120 (level 0) Doing factorial with inplace args. downward recurse call entry, args = 5 1 (level 1) downward recurse call entry, args = 4 5 (level 2) downward recurse call entry, args = 3 20 (level 3) downward recurse call entry, args = 2 60 (level 4) downward recurse call entry, args = 1 120 (level 5) Recursion bottomed out, returning :out: = 120 Call return, :out: = 120 (level 4) Call return, :out: = 120 (level 3) Call return, :out: = 120 (level 2) Call return, :out: = 120 (level 1) 5 factorial_inplace is 120 (level 0) Doing factorial with return args Call factorial_returnarg entry, arg = 5 (level 1) N is 5 NM1 is 4 Call factorial_returnarg entry, arg = 4 5 (level 2) N is 4 NM1 is 3 Call factorial_returnarg entry, arg = 3 4 5 (level 3) N is 3 NM1 is 2 Call factorial_returnarg entry, arg = 2 3 4 5 (level 4) N is 2 NM1 is 1 Call factorial_returnarg entry, arg = 1 2 3 4 5 (level 5) Calling bottomed out with N-1 <= 0.00 (level 5) Call returned, return value was 1 2 3 4 5 (level 4) p1: 1, p2: 2, rest: 3 4 5 multiply p1 and p2, put that on the front, and return :out: = 2 3 4 5 (level 4) Call returned, return value was 2 3 4 5 (level 3) p1: 2, p2: 3, rest: 4 5 multiply p1 and p2, put that on the front, and return :out: = 6 4 5 (level 3) Call returned, return value was 6 4 5 (level 2) p1: 6, p2: 4, rest: 5 multiply p1 and p2, 
put that on the front, and return :out: = 24 5 (level 2) Call returned, return value was 24 5 (level 1) p1: 24, p2: 5, rest: multiply p1 and p2, put that on the front, and return :out: = 120 (level 1) 5 factorial_returnarg is 120 (level 0) and now do some fully isolated forking calls Returned output = In the fork... data window is This is the original data window. It shouldn't change (level 0) Inputted; now the data window is: this string is your input Changing the data window in this fork. Now the data window is: This is a _changed_ data window, local to this SYSCALLed fork. (level 0) Returned status: DEAD MINION, EXIT CODE: 123 And the data window is now: This is the original data window. It shouldn't change TRANSLATE basic functionality test original: foo bar zappa 0123456789.,?`~!@#$%^&*()_-+= XYZZY QUICK BROWN FOX a-e rot13: sbb one mnccn 0123456789.,?`~!@#$%^&*()_-+= XYZZY QUICK BROWN FOX n-r again: foo bar zappa 0123456789.,?`~!@#$%^&*()_-+= XYZZY QUICK BROWN FOX a-e invrange: ull yzi azkkz 9876543210.,?`~!@#$%^&*()_-+= XYZZY QUICK BROWN FOX z-v unique: fo bar zapa 0123456789.,?`~!@#$%^&*()_-+= XYZZY QUICK BROWN FOX a-e invrange: ull yzi azkkz 0123456789.,?`~!@#$%^&*()_-+= XYZZY QUICK BROWN FOX z-v literal: foo bzr azppz 0123456789.,?`~!@#$%^&*()_-+= XYZZY QUICK BROWN FOX z-e lit+uniq: foo bzr azppz 0123456789.,?`~!@#$%^&*()_-+= XYZZY QUICK BROWN FOX z-e deletion: 02468.,?`~!@#$%^&*()_-+= XYZZY QUICK BROWN FOX - del-inv: foobarzappa13579ae del-inv2: xyywxwxwxwzzww0y2z4w6x8yywxwwxyzwxyyywxzxzzwXYZZYwQUICKwBROWNwFOXwwxw srcvar: FOO BAR ZAPPA 0123456789.,?`~!@#$%^&*()_-+= xyzzy quick brown fox A-E src-dest: ULL YZI AZKKZ 0123456789.,?`~!@#$%^&*()_-+= cbaab jfrxp yildm ulc Z-V version: 20100106-BlameMichelson ( TRE 0.7.5 (LGPL) ) noSpaces before: Are The Russian Involved ? noSpaces after : AreTheRussianInvolved? withSpaces before: Nuclear Combat Toe To Toe With The Rooskies ! withSpaces after : NuclearCombatToeToToeWithTheRooskies! #! 
/usr/bin/crm # # quine.crm - produce program's own source code as output # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. { window output /:*:_pgm_text:/ } one two three ***** checking return and exit codes Status: DEAD MINION, EXIT CODE: 123 ***** check that failed syscalls will code right Status: DEAD MINION, EXIT CODE: 127 result -->you hit the jackpot<--->and that's good<--->:frobotz:<--- **** Default (SBPH Markovian) classifier type A CLASSIFY fails; success probability: 0.0000 pR: -11.4360 Best match to file #1 (a_test.css) prob: 1.0000 pR: 11.4360 Total features in input file: 192 #0 (m_test.css): features: 63522, hits: 129, prob: 3.66e-12, pR: -11.44 #1 (a_test.css): features: 68546, hits: 4319, prob: 1.00e+00, pR: 11.44 type M CLASSIFY succeeds; success probability: 0.9875 pR: 1.8989 Best match to file #0 (m_test.css) prob: 0.9875 pR: 1.8989 Total features in input file: 130 #0 (m_test.css): features: 63522, hits: 749, prob: 9.88e-01, pR: 1.90 #1 (a_test.css): features: 68546, hits: 248, prob: 1.25e-02, pR: -1.90 **** OSB Markovian classifier type A CLASSIFY fails; success probability: 0.0007 pR: -3.1804 Best match to file #1 (a_test.css) prob: 0.9993 pR: 3.1804 Total features in input file: 48 #0 (m_test.css): features: 15881, hits: 7, prob: 6.60e-04, pR: -3.18 #1 (a_test.css): features: 17137, hits: 488, prob: 9.99e-01, pR: 3.18 type M CLASSIFY succeeds; success probability: 0.8832 pR: 0.8787 Best match to file #0 (m_test.css) prob: 0.8832 pR: 0.8787 Total features in input file: 40 #0 (m_test.css): features: 15881, hits: 340, prob: 8.83e-01, pR: 0.88 #1 (a_test.css): features: 17137, hits: 8, prob: 1.17e-01, pR: -0.88 **** OSB Markov Unique classifier type A CLASSIFY fails; success probability: 0.0007 pR: -3.1365 Best match to file #1 (a_test.css) prob: 0.9993 pR: 3.1365 Total features in input file: 48 #0 (m_test.css): features: 15047, hits: 7, prob: 7.30e-04, pR: -3.14 #1 (a_test.css): features: 14993, hits: 
426, prob: 9.99e-01, pR: 3.14 type M CLASSIFY succeeds; success probability: 0.8612 pR: 0.7928 Best match to file #0 (m_test.css) prob: 0.8612 pR: 0.7928 Total features in input file: 40 #0 (m_test.css): features: 15047, hits: 112, prob: 8.61e-01, pR: 0.79 #1 (a_test.css): features: 14993, hits: 4, prob: 1.39e-01, pR: -0.79 **** OSB Markov Chisquared Unique classifier type A CLASSIFY fails; success probability: 0.0000 pR: -6.0741 Best match to file #1 (a_test.css) prob: 1.0000 pR: 6.0741 Total features in input file: 48 #0 (m_test.css): features: 15047, hits: 1, chi2: 3.20e+01, pR: -6.07 #1 (a_test.css): features: 14993, hits: 33, chi2: 2.94e-02, pR: 6.07 type M CLASSIFY succeeds; success probability: 0.8238 pR: 0.6700 Best match to file #0 (m_test.css) prob: 0.8238 pR: 0.6700 Total features in input file: 40 #0 (m_test.css): features: 15047, hits: 9, chi2: 1.11e+01, pR: 0.67 #1 (a_test.css): features: 14993, hits: 1, chi2: 2.40e+01, pR: -0.67 **** OSBF Local Confidence (Fidelis) classifier type A CLASSIFY fails; success probability: 0.0538 pR: -1.2453 Best match to file #1 (a_test.css) prob: 0.9462 pR: 1.2453 Total features in input file: 48 #0 (m_test.css): hits: 1, ufeats: 1, prob: 5.38e-02, pR: -1.25 #1 (a_test.css): hits: 33, ufeats: 33, prob: 9.46e-01, pR: 1.25 type M CLASSIFY succeeds; success probability: 0.6823 pR: 0.3319 Best match to file #0 (m_test.css) prob: 0.6823 pR: 0.3319 Total features in input file: 40 #0 (m_test.css): hits: 9, ufeats: 9, prob: 6.82e-01, pR: 0.33 #1 (a_test.css): hits: 1, ufeats: 1, prob: 3.18e-01, pR: -0.33 **** OSB Winnow classifier type A CLASSIFY fails; success probability: 0.4347 pR: -1.1415 Best match to file #1 (a_test.css) weight: 55.3809 pR: 2.2485 Total features in input file: 48 #0 (m_test.css): features: 30529.10, unseen: 1.50e+01, weight: 4.26e+01, pR: 1.11 #1 (a_test.css): features: 30507.50, unseen: 1.50e+01, weight: 5.54e+01, pR: 2.25 type M CLASSIFY succeeds; success probability: 0.5199 pR: 0.3454 Best match to 
file #0 (m_test.css) weight: 41.8609 pR: 6.6757 Total features in input file: 40 #0 (m_test.css): features: 30529.10, unseen: 3.10e+01, weight: 4.19e+01, pR: 6.68 #1 (a_test.css): features: 30507.50, unseen: 3.10e+01, weight: 3.87e+01, pR: 6.33 **** Now verify that winnow learns affect only the named file (m_test.css) type M CLASSIFY succeeds; success probability: 0.5213 pR: 0.3697 Best match to file #0 (m_test.css) weight: 42.0957 pR: 6.7000 Total features in input file: 40 #0 (m_test.css): features: 47122.80, unseen: 3.10e+01, weight: 4.21e+01, pR: 6.70 #1 (a_test.css): features: 30507.50, unseen: 3.10e+01, weight: 3.87e+01, pR: 6.33 and now refute-learn into a_test.css type M CLASSIFY succeeds; success probability: 0.5224 pR: 0.3892 Best match to file #0 (m_test.css) weight: 42.0957 pR: 6.7000 Total features in input file: 40 #0 (m_test.css): features: 47122.80, unseen: 3.10e+01, weight: 4.21e+01, pR: 6.70 #1 (a_test.css): features: 42181.02, unseen: 3.10e+01, weight: 3.85e+01, pR: 6.31 **** Unigram Bayesian classifier type A CLASSIFY fails; success probability: 0.1570 pR: -0.7300 Best match to file #1 (a_test.css) prob: 0.8430 pR: 0.7300 Total features in input file: 12 #0 (m_test.css): features: 3972, hits: 125, prob: 1.57e-01, pR: -0.73 #1 (a_test.css): features: 4286, hits: 283, prob: 8.43e-01, pR: 0.73 type M CLASSIFY succeeds; success probability: 0.7226 pR: 0.4157 Best match to file #0 (m_test.css) prob: 0.7226 pR: 0.4157 Total features in input file: 10 #0 (m_test.css): features: 3972, hits: 205, prob: 7.23e-01, pR: 0.42 #1 (a_test.css): features: 4286, hits: 240, prob: 2.77e-01, pR: -0.42 **** unigram Winnow classifier type A CLASSIFY fails; success probability: 0.4432 pR: -0.9918 Best match to file #1 (a_test.css) weight: 13.7145 pR: 0.5800 Total features in input file: 12 #0 (m_test.css): features: 3056.52, unseen: 0.00e+00, weight: 1.09e+01, pR: -0.41 #1 (a_test.css): features: 2901.72, unseen: 0.00e+00, weight: 1.37e+01, pR: 0.58 type M CLASSIFY 
succeeds; success probability: 0.5652 pR: 1.1393 Best match to file #0 (m_test.css) weight: 10.4009 pR: 1.7197 Total features in input file: 9 #0 (m_test.css): features: 3056.52, unseen: 2.00e+00, weight: 1.04e+01, pR: 1.72 #1 (a_test.css): features: 2901.72, unseen: 2.00e+00, weight: 8.00e+00, pR: 0.58 **** OSB Hyperspace classifier type A CLASSIFY fails; success probability: 0.0010 pR: -30.1365 Best match to file #1 (a_test.css) prob: 0.9990 pR: 30.1365 Total features in input file: 47 #0 (m_test.css): features: 15048, hits: 1, radiance: 6.63e-05, prob: 9.68e-04, pR: -30.14 #1 (a_test.css): features: 14994, hits: 32, radiance: 6.84e-02, prob: 9.99e-01, pR: 30.14 type M CLASSIFY succeeds; success probability: 1.0000 pR: 216.2810 Best match to file #0 (m_test.css) prob: 1.0000 pR: 216.2810 Total features in input file: 39 #0 (m_test.css): features: 15048, hits: 8, radiance: 4.25e-03, prob: 1.00e+00, pR: 216.28 #1 (a_test.css): features: 14994, hits: 0, radiance: 0.00e+00, prob: 2.35e-22, pR: -216.28 **** OSB three-letter Hyperspace classifier type A CLASSIFY fails; success probability: 0.1947 pR: -6.1669 Best match to file #1 (a_test.css) prob: 0.8053 pR: 6.1669 Total features in input file: 11 #0 (m_test.css): features: 4348, hits: 5, radiance: 5.75e-03, prob: 1.95e-01, pR: -6.17 #1 (a_test.css): features: 4214, hits: 10, radiance: 2.38e-02, prob: 8.05e-01, pR: 6.17 type M CLASSIFY succeeds; success probability: 0.6695 pR: 3.0659 Best match to file #0 (m_test.css) prob: 0.6695 pR: 3.0659 Total features in input file: 14 #0 (m_test.css): features: 4348, hits: 13, radiance: 3.90e-02, prob: 6.70e-01, pR: 3.07 #1 (a_test.css): features: 4214, hits: 9, radiance: 1.92e-02, prob: 3.30e-01, pR: -3.07 **** Unigram Hyperspace classifier type A CLASSIFY fails; success probability: 0.1628 pR: -7.1107 Best match to file #1 (a_test.css) prob: 0.8372 pR: 7.1107 Total features in input file: 11 #0 (m_test.css): features: 1794, hits: 5, radiance: 1.39e-02, prob: 1.63e-01, pR: 
-7.11 #1 (a_test.css): features: 1406, hits: 10, radiance: 7.17e-02, prob: 8.37e-01, pR: 7.11 type M CLASSIFY succeeds; success probability: 0.9660 pR: 14.5322 Best match to file #0 (m_test.css) prob: 0.9660 pR: 14.5322 Total features in input file: 9 #0 (m_test.css): features: 1794, hits: 6, radiance: 2.01e-02, prob: 9.66e-01, pR: 14.53 #1 (a_test.css): features: 1406, hits: 1, radiance: 7.09e-04, prob: 3.40e-02, pR: -14.53 **** String Hyperspace classifier type A CLASSIFY fails; success probability: 0.0955 pR: -9.7666 Best match to file #1 (a_test.css) prob: 0.9045 pR: 9.7666 Total features in input file: 61 #0 (m_test.css): features: 22670, hits: 17, radiance: 1.27e-02, prob: 9.55e-02, pR: -9.77 #1 (a_test.css): features: 22452, hits: 52, radiance: 1.21e-01, prob: 9.05e-01, pR: 9.77 type M CLASSIFY succeeds; success probability: 0.8340 pR: 7.0113 Best match to file #0 (m_test.css) prob: 0.8340 pR: 7.0113 Total features in input file: 72 #0 (m_test.css): features: 22670, hits: 45, radiance: 8.94e-02, prob: 8.34e-01, pR: 7.01 #1 (a_test.css): features: 22452, hits: 20, radiance: 1.78e-02, prob: 1.66e-01, pR: -7.01 **** String Unigram Hyperspace classifier type A CLASSIFY fails; success probability: 0.2121 pR: -5.7002 Best match to file #1 (a_test.css) prob: 0.7879 pR: 5.7002 Total features in input file: 11 #0 (m_test.css): features: 3971, hits: 5, radiance: 6.30e-03, prob: 2.12e-01, pR: -5.70 #1 (a_test.css): features: 4285, hits: 10, radiance: 2.34e-02, prob: 7.88e-01, pR: 5.70 type M CLASSIFY succeeds; success probability: 0.9750 pR: 15.9041 Best match to file #0 (m_test.css) prob: 0.9750 pR: 15.9041 Total features in input file: 9 #0 (m_test.css): features: 3971, hits: 6, radiance: 9.08e-03, prob: 9.75e-01, pR: 15.90 #1 (a_test.css): features: 4285, hits: 1, radiance: 2.33e-04, prob: 2.50e-02, pR: -15.90 **** Vector 3-word-bag Hyperspace classifier type A CLASSIFY fails; success probability: 0.0000 pR: -220.5875 Best match to file #1 (a_test.css) prob: 1.0000 
pR: 220.5875 Total features in input file: 11 #0 (m_test.css): features: 3971, hits: 0, radiance: 0.00e+00, prob: 8.73e-23, pR: -220.59 #1 (a_test.css): features: 4285, hits: 7, radiance: 1.14e-02, prob: 1.00e+00, pR: 220.59 type M CLASSIFY succeeds; success probability: 1.0000 pR: 204.0055 Best match to file #0 (m_test.css) prob: 1.0000 pR: 204.0055 Total features in input file: 9 #0 (m_test.css): features: 3971, hits: 1, radiance: 2.52e-04, prob: 1.00e+00, pR: 204.01 #1 (a_test.css): features: 4285, hits: 0, radiance: 0.00e+00, prob: 3.98e-21, pR: -204.01 **** Bit-Entropy classifier type A CLASSIFY fails; success probability: 0.0000 pR: -12.0014 Best match to file #1 (a_test.css) prob: 1.0000 pR: 12.0014 Total features in input file: 496 #0 (m_test.css): features: 181352 (3%), entropy: 434.778519, jumps: 31, prob: 9.97e-13, pR: -12.00 #1 (a_test.css): features: 179608 (3%), entropy: 144.839378, jumps: 5, prob: 1.00e+00, pR: 12.00 type M CLASSIFY succeeds; success probability: 1.0000 pR: 6.3285 Best match to file #0 (m_test.css) prob: 1.0000 pR: 6.3285 Total features in input file: 584 #0 (m_test.css): features: 181352 (3%), entropy: 385.057958, jumps: 29, prob: 1.00e+00, pR: 6.33 #1 (a_test.css): features: 179608 (3%), entropy: 537.946666, jumps: 39, prob: 4.69e-07, pR: -6.33 **** Bit-Entropy Toroid classifier type A CLASSIFY fails; success probability: 0.0000 pR: -7.5206 Best match to file #1 (a_test.css) prob: 1.0000 pR: 7.5206 Total features in input file: 496 #0 (m_test.css): features: 181352 (100%), entropy: 464.920710, jumps: 33, prob: 3.02e-08, pR: -7.52 #1 (a_test.css): features: 179608 (100%), entropy: 283.231656, jumps: 13, prob: 1.00e+00, pR: 7.52 type M CLASSIFY succeeds; success probability: 1.0000 pR: 5.1811 Best match to file #0 (m_test.css) prob: 1.0000 pR: 5.1811 Total features in input file: 584 #0 (m_test.css): features: 181352 (100%), entropy: 550.166118, jumps: 41, prob: 1.00e+00, pR: 5.18 #1 (a_test.css): features: 179608 (100%), entropy: 
675.335081, jumps: 53, prob: 6.59e-06, pR: -5.18 **** Fast Substring Compression Match Classifier type A CLASSIFY fails; success probability: 0.1272 pR: -167.2656 Best match to file #1 (a_test.css) prob: 0.8728 pR: 167.2656 Total features in input file: 62 #0 (m_test.css): features: 22673, chcs: 30.40, prob: 1.27e-01, pR: -167.27 #1 (a_test.css): features: 22455, chcs: 208.52, prob: 8.73e-01, pR: 167.27 type M CLASSIFY succeeds; success probability: 0.7143 pR: 79.5911 Best match to file #0 (m_test.css) prob: 0.7143 pR: 79.5911 Total features in input file: 73 #0 (m_test.css): features: 22673, chcs: 138.32, prob: 7.14e-01, pR: 79.59 #1 (a_test.css): features: 22455, chcs: 55.33, prob: 2.86e-01, pR: -79.59 **** Neural Network Classifier type A CLASSIFY fails; success probability: 0.495362 pR: -1.7894 Best match to file #1 (a_test.css) prob: 0.5046 pR: 95.1750 Total features in input file: 220 #0 (m_test.css): feats: 33029 ic: 0.95 oc: 0.04 prob: 4.95e-01, pR: 90.08 #1 (a_test.css): feats: 33029 ic: 0.98 oc: 0.03 prob: 5.05e-01, pR: 95.17 type A CLASSIFY fails; success probability: 0.448415 pR: -17.9280 Best match to file #1 (a_test.css) prob: 0.5516 pR: 93.7239 Total features in input file: 188 #0 (m_test.css): feats: 33029 ic: 0.77 oc: 0.20 prob: 4.48e-01, pR: 55.70 #1 (a_test.css): feats: 33029 ic: 0.97 oc: 0.04 prob: 5.52e-01, pR: 93.72 **** Alternating Example Neural Network Classifier TRAINING **** Alternating Example Neural Network Classifier RUNNING TEST type A CLASSIFY fails; success probability: 0.093212 pR: -81.8632 Best match to file #1 (a_test.css) prob: 0.9068 pR: 83.5828 Total features in input file: 220 #0 (m_test.css): feats: 5826 ic: 0.09 oc: 0.90 prob: 9.32e-02, pR: -80.16 #1 (a_test.css): feats: 5826 ic: 0.92 oc: 0.09 prob: 9.07e-01, pR: 83.58 type M CLASSIFY succeeds; success probability: 0.671733 pR: 39.6054 Best match to file #0 (m_test.css) prob: 0.6717 pR: 51.5804 Total features in input file: 175 #0 (m_test.css): feats: 5826 ic: 0.76 oc: 0.26 
prob: 6.72e-01, pR: 51.58 #1 (a_test.css): feats: 5826 ic: 0.39 oc: 0.65 prob: 3.28e-01, pR: -27.42 **** Support Vector Machine (SVM) unigram classifier type M CLASSIFY succeeds success probability: 0.688203 pR: 1.5838 Best match to class #0 prob: 0.6882 pR: 1.5838 Total features in input file: 12 #0 (label +1): documents: 193, features: 3567, prob: 6.88e-01, pR: 1.58 #1 (label -1): documents: 57, features: 3174, prob: 3.12e-01, pR: -1.58 type M CLASSIFY succeeds success probability: 0.855189 pR: 7.4081 Best match to class #0 prob: 0.8552 pR: 7.4081 Total features in input file: 9 #0 (label +1): documents: 193, features: 3567, prob: 8.55e-01, pR: 7.41 #1 (label -1): documents: 57, features: 3174, prob: 1.45e-01, pR: -7.41 **** Support Vector Machine (SVM) classifier type M CLASSIFY succeeds success probability: 0.677904 pR: 1.4405 Best match to class #0 prob: 0.6779 pR: 1.4405 Total features in input file: 48 #0 (label +1): documents: 193, features: 15809, prob: 6.78e-01, pR: 1.44 #1 (label -1): documents: 57, features: 16604, prob: 3.22e-01, pR: -1.44 type M CLASSIFY succeeds success probability: 0.801854 pR: 4.3443 Best match to class #0 prob: 0.8019 pR: 4.3443 Total features in input file: 40 #0 (label +1): documents: 193, features: 15809, prob: 8.02e-01, pR: 4.34 #1 (label -1): documents: 57, features: 16604, prob: 1.98e-01, pR: -4.34 **** Alternating Example SVM Network Classifier TRAINING **** Alternating Example SVM Network Classifier RUNNING TEST type A CLASSIFY fails success probability: 0.075773 pR: -19.0619 Best match to class #1 prob: 0.9242 pR: 19.0619 Total features in input file: 150 #0 (label +1): documents: 16, features: 2025, prob: 7.58e-02, pR: -19.06 #1 (label -1): documents: 15, features: 2026, prob: 9.24e-01, pR: 19.06 type M CLASSIFY succeeds success probability: 0.784357 pR: 3.7026 Best match to class #0 prob: 0.7844 pR: 3.7026 Total features in input file: 143 #0 (label +1): documents: 16, features: 2025, prob: 7.84e-01, pR: 3.70 #1 (label 
-1): documents: 15, features: 2026, prob: 2.16e-01, pR: -3.70 **** String Kernel SVM (SKS) classifier type M CLASSIFY succeeds; success probability: 0.9749 pR: 15.8927 Best match to file #0 (m_test.css) prob: 0.9749 pR: 15.8927 Total features in input file: 62 #0 (m_test.css):documents: 193, features: 22281, prob: 9.75e-01, pR: 15.89 #1 (a_test.css):documents: 57, features: 22045, prob: 2.51e-02, pR: -15.89 type M CLASSIFY succeeds; success probability: 0.9782 pR: 16.5262 Best match to file #0 (m_test.css) prob: 0.9782 pR: 16.5262 Total features in input file: 73 #0 (m_test.css):documents: 193, features: 22281, prob: 9.78e-01, pR: 16.53 #1 (a_test.css):documents: 57, features: 22045, prob: 2.18e-02, pR: -16.53 **** String Kernel SVM (SKS) Unique classifier type M CLASSIFY succeeds; success probability: 0.9114 pR: 10.1239 Best match to file #0 (m_test.css) prob: 0.9114 pR: 10.1239 Total features in input file: 61 #0 (m_test.css):documents: 193, features: 19836, prob: 9.11e-01, pR: 10.12 #1 (a_test.css):documents: 57, features: 17656, prob: 8.86e-02, pR: -10.12 type M CLASSIFY succeeds; success probability: 0.8047 pR: 6.1506 Best match to file #0 (m_test.css) prob: 0.8047 pR: 6.1506 Total features in input file: 70 #0 (m_test.css):documents: 193, features: 19836, prob: 8.05e-01, pR: 6.15 #1 (a_test.css):documents: 57, features: 17656, prob: 1.95e-01, pR: -6.15 **** Bytewise Correlation classifier type A CLASSIFY fails; success probability: 0.0083 pR: -2.0751 Best match to file #1 (Alice_In_Wonderland_Chap_1_And_2.txt) prob: 0.9917 pR: 2.0751 Total features in input file: 22451 #0 (Macbeth_Act_IV.txt): features: 22669, L1: 83555 L2: 114702 L3: 173428, l4: 358614 prob: 8.34e-03, pR: -2.08 #1 (Alice_In_Wonderland_Chap_1_And_2.txt): features: 22451, L1: 96783 L2: 180165 L3: 1550947, l4: 49378101 prob: 9.92e-01, pR: 2.08 type M CLASSIFY succeeds; success probability: 0.9223 pR: 1.0742 Best match to file #0 (Macbeth_Act_IV.txt) prob: 0.9223 pR: 1.0742 Total features in 
input file: 22669 #0 (Macbeth_Act_IV.txt): features: 22669, L1: 406150 L2: 611044 L3: 2022288, l4: 28898980 prob: 9.22e-01, pR: 1.07 #1 (Alice_In_Wonderland_Chap_1_And_2.txt): features: 22451, L1: 476674 L2: 678005 L3: 1148703, l4: 2858981 prob: 7.77e-02, pR: -1.07 **** Clump / Pmulc Test ............................................... Unsure result PMULC fails; success probabilty: 0.0089 pR: -20.4448 Best match to clump #24 (clump_#24) prob: 0.0089 pR: -20.4448 Closest document: #38 (document_#38) affinity: 0.7076 Total features in input file: 211 1: (clump_#1): documents: 3 affinity: 0.1321 prob: 0.0000 pR: -70.0000 2: (clump_#2): documents: 2 affinity: 0.1186 prob: 0.0000 pR: -70.0000 3: (clump_#3): documents: 1 affinity: 0.1380 prob: 0.0000 pR: -70.0000 4: (clump_#4): documents: 4 affinity: 0.1298 prob: 0.0000 pR: -70.0000 5: (clump_#5): documents: 1 affinity: 0.1121 prob: 0.0000 pR: -70.0000 6: (clump_#6): documents: 1 affinity: 0.1431 prob: 0.0000 pR: -70.0000 7: (clump_#7): documents: 2 affinity: 0.1300 prob: 0.0000 pR: -70.0000 8: (clump_#8): documents: 1 affinity: 0.0980 prob: 0.0000 pR: -70.0000 9: (clump_#9): documents: 1 affinity: 0.1564 prob: 0.0000 pR: -70.0000 10: (clump_#10): documents: 7 affinity: 0.1173 prob: 0.0000 pR: -70.0000 11: (clump_#11): documents: 1 affinity: 0.1436 prob: 0.0000 pR: -70.0000 12: (clump_#12): documents: 1 affinity: 0.1392 prob: 0.0000 pR: -70.0000 13: (clump_#13): documents: 1 affinity: 0.1042 prob: 0.0000 pR: -70.0000 14: (clump_#14): documents: 2 affinity: 0.1344 prob: 0.0000 pR: -70.0000 15: (clump_#15): documents: 1 affinity: 0.1171 prob: 0.0000 pR: -70.0000 16: (clump_#16): documents: 1 affinity: 0.1116 prob: 0.0000 pR: -70.0000 17: (clump_#17): documents: 1 affinity: 0.1134 prob: 0.0000 pR: -70.0000 18: (clump_#18): documents: 2 affinity: 0.1162 prob: 0.0000 pR: -70.0000 19: (clump_#19): documents: 1 affinity: 0.1019 prob: 0.0000 pR: -70.0000 20: (clump_#20): documents: 1 affinity: 0.1322 prob: 0.0000 pR: -70.0000 
21: (clump_#21): documents: 1 affinity: 0.1461 prob: 0.0000 pR: -70.0000 22: (clump_#22): documents: 1 affinity: 0.1169 prob: 0.0000 pR: -70.0000 23: (clump_#23): documents: 1 affinity: 0.1131 prob: 0.0000 pR: -70.0000 24: (clump_#24): documents: 9 affinity: 0.2366 prob: 0.0089 pR: -20.4448 Likely result: PMULC succeeds; success probabilty: 1.0000 pR: 60.3644 Best match to clump #2 (clump_#2) prob: 1.0000 pR: 60.3644 Closest document: #1 (document_#1) affinity: 0.7264 Total features in input file: 179 1: (clump_#1): documents: 3 affinity: 0.1365 prob: 0.0000 pR: -70.0000 2: (clump_#2): documents: 2 affinity: 0.4956 prob: 1.0000 pR: 60.3644 3: (clump_#3): documents: 1 affinity: 0.1242 prob: 0.0000 pR: -70.0000 4: (clump_#4): documents: 4 affinity: 0.1411 prob: 0.0000 pR: -70.0000 5: (clump_#5): documents: 1 affinity: 0.1009 prob: 0.0000 pR: -70.0000 6: (clump_#6): documents: 1 affinity: 0.0933 prob: 0.0000 pR: -70.0000 7: (clump_#7): documents: 2 affinity: 0.1343 prob: 0.0000 pR: -70.0000 8: (clump_#8): documents: 1 affinity: 0.1013 prob: 0.0000 pR: -70.0000 9: (clump_#9): documents: 1 affinity: 0.1053 prob: 0.0000 pR: -70.0000 10: (clump_#10): documents: 7 affinity: 0.1213 prob: 0.0000 pR: -70.0000 11: (clump_#11): documents: 1 affinity: 0.1484 prob: 0.0000 pR: -70.0000 12: (clump_#12): documents: 1 affinity: 0.1253 prob: 0.0000 pR: -70.0000 13: (clump_#13): documents: 1 affinity: 0.1077 prob: 0.0000 pR: -70.0000 14: (clump_#14): documents: 2 affinity: 0.1080 prob: 0.0000 pR: -70.0000 15: (clump_#15): documents: 1 affinity: 0.0877 prob: 0.0000 pR: -70.0000 16: (clump_#16): documents: 1 affinity: 0.1474 prob: 0.0000 pR: -70.0000 17: (clump_#17): documents: 1 affinity: 0.1346 prob: 0.0000 pR: -70.0000 18: (clump_#18): documents: 2 affinity: 0.1201 prob: 0.0000 pR: -70.0000 19: (clump_#19): documents: 1 affinity: 0.1053 prob: 0.0000 pR: -70.0000 20: (clump_#20): documents: 1 affinity: 0.1366 prob: 0.0000 pR: -70.0000 21: (clump_#21): documents: 1 affinity: 0.1094 
prob: 0.0000 pR: -70.0000 22: (clump_#22): documents: 1 affinity: 0.1388 prob: 0.0000 pR: -70.0000 23: (clump_#23): documents: 1 affinity: 0.1018 prob: 0.0000 pR: -70.0000 24: (clump_#24): documents: 9 affinity: 0.1191 prob: 0.0000 pR: -70.0000 Likely result: 0 (document_#0) clump: 1 (clump_#1) affinity: 0.1255 1 (document_#1) clump: 2 (clump_#2) affinity: 0.2121 2 (document_#2) clump: 2 (clump_#2) affinity: 0.2121 3 (document_#3) clump: 3 (clump_#3) affinity: 0.0000 4 (document_#4) clump: 1 (clump_#1) affinity: 0.1628 5 (document_#5) clump: 4 (clump_#4) affinity: 0.1535 6 (document_#6) clump: 4 (clump_#4) affinity: 0.1334 7 (document_#7) clump: 4 (clump_#4) affinity: 0.1957 8 (document_#8) clump: 4 (clump_#4) affinity: 0.1598 9 (document_#9) clump: 5 (clump_#5) affinity: 0.0000 10 (document_#10) clump: 1 (clump_#1) affinity: 0.1821 11 (document_#11) clump: 6 (clump_#6) affinity: 0.0000 12 (document_#12) clump: 7 (clump_#7) affinity: 0.1364 13 (document_#13) clump: 7 (clump_#7) affinity: 0.1364 14 (document_#14) clump: 8 (clump_#8) affinity: 0.0000 15 (document_#15) clump: 9 (clump_#9) affinity: 0.0000 16 (document_#16) clump: 10 (clump_#10) affinity: 0.1744 17 (document_#17) clump: 10 (clump_#10) affinity: 0.1882 18 (document_#18) clump: 10 (clump_#10) affinity: 0.1999 19 (document_#19) clump: 10 (clump_#10) affinity: 0.1940 20 (document_#20) clump: 10 (clump_#10) affinity: 0.1593 21 (document_#21) clump: 10 (clump_#10) affinity: 0.1684 22 (document_#22) clump: 10 (clump_#10) affinity: 0.1682 23 (document_#23) clump: 11 (clump_#11) affinity: 0.0000 24 (document_#24) clump: 12 (clump_#12) affinity: 0.0000 25 (document_#25) clump: 13 (clump_#13) affinity: 0.0000 26 (document_#26) clump: 14 (clump_#14) affinity: 0.0916 27 (document_#27) clump: 15 (clump_#15) affinity: 0.0000 28 (document_#28) clump: 16 (clump_#16) affinity: 0.0000 29 (document_#29) clump: 17 (clump_#17) affinity: 0.0000 30 (document_#30) clump: 18 (clump_#18) affinity: 0.0892 31 (document_#31) clump: 
18 (clump_#18) affinity: 0.0892 32 (document_#32) clump: 19 (clump_#19) affinity: 0.0000 33 (document_#33) clump: 20 (clump_#20) affinity: 0.0000 34 (document_#34) clump: 14 (clump_#14) affinity: 0.0916 35 (document_#35) clump: 21 (clump_#21) affinity: 0.0000 36 (document_#36) clump: 22 (clump_#22) affinity: 0.0000 37 (document_#37) clump: 23 (clump_#23) affinity: 0.0000 38 (document_#38) clump: 24 (clump_#24) affinity: 0.2282 39 (document_#39) clump: 24 (clump_#24) affinity: 0.2296 40 (document_#40) clump: 24 (clump_#24) affinity: 0.2475 41 (document_#41) clump: 24 (clump_#24) affinity: 0.2399 42 (document_#42) clump: 24 (clump_#24) affinity: 0.2422 43 (document_#43) clump: 24 (clump_#24) affinity: 0.1964 44 (document_#44) clump: 24 (clump_#24) affinity: 0.2248 45 (document_#45) clump: 24 (clump_#24) affinity: 0.2493 46 (document_#46) clump: 24 (clump_#24) affinity: 0.2191 **** Principal Component Analysis (PCA) unigram classifier type A CLASSIFY fails success probability: 0.499846 pR: -0.3073 Best match to class #1 prob: 0.5002 pR: 0.3073 Total features in input file: 11 #0 (label +1): documents: 193, features: 3567, prob: 5.00e-01, pR: -0.31 #1 (label -1): documents: 57, features: 3174, prob: 5.00e-01, pR: 0.31 type M CLASSIFY succeeds success probability: 0.506965 pR: 13.9301 Best match to class #0 prob: 0.5070 pR: 13.9301 Total features in input file: 8 #0 (label +1): documents: 193, features: 3567, prob: 5.07e-01, pR: 13.93 #1 (label -1): documents: 57, features: 3174, prob: 4.93e-01, pR: -13.93 **** Principal Component Analysis (PCA) classifier type A CLASSIFY fails success probability: 0.497977 pR: -4.0453 Best match to class #1 prob: 0.5020 pR: 4.0453 Total features in input file: 47 #0 (label +1): documents: 193, features: 15809, prob: 4.98e-01, pR: -4.05 #1 (label -1): documents: 57, features: 16604, prob: 5.02e-01, pR: 4.05 type M CLASSIFY succeeds success probability: 0.502855 pR: 5.7103 Best match to class #0 prob: 0.5029 pR: 5.7103 Total features in 
input file: 39 #0 (label +1): documents: 193, features: 15809, prob: 5.03e-01, pR: 5.71 #1 (label -1): documents: 57, features: 16604, prob: 4.97e-01, pR: -5.71 **** Alternating Example PCA Network Classifier TRAINING **** Alternating Example PCA Network Classifier RUNNING TEST type A CLASSIFY fails success probability: 0.171492 pR: -787.5435 Best match to class #1 prob: 0.8285 pR: 787.5435 Total features in input file: 149 #0 (label +1): documents: 16, features: 2025, prob: 1.71e-01, pR: -787.54 #1 (label -1): documents: 15, features: 2026, prob: 8.29e-01, pR: 787.54 type M CLASSIFY succeeds success probability: 0.779493 pR: 631.3570 Best match to class #0 prob: 0.7795 pR: 631.3570 Total features in input file: 142 #0 (label +1): documents: 16, features: 2025, prob: 7.79e-01, pR: 631.36 #1 (label -1): documents: 15, features: 2026, prob: 2.21e-01, pR: -631.36 crm114-20100106-BlameMichelson.src/rewrites.mfp0000644000000000017500000000031211321154266017341 0ustar rootwsyyourname@yourdomain.yourplace>->MyEmailAddress [[:space:]]yourname>-> MyEmailName yourlocal_mailrouter_name.yourdomain.yourplace>->MyLocalMailRouter yourlocal_mailrouter_IP_address>->MyLocalMailRouterIPcrm114-20100106-BlameMichelson.src/Alice_In_Wonderland_Chap_1_And_2.txt0000644000000000017500000005366311321154266023551 0ustar rootwsyCHAPTER I. Down the Rabbit-Hole Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?' So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her. 
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually TOOK A WATCH OUT OF ITS WAISTCOAT-POCKET, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge. In another moment down went Alice after it, never once considering how in the world she was to get out again. The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well. Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it. 'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' 
(Which was very likely true.) Down, down, down. Would the fall NEVER come to an end! 'I wonder how many miles I've fallen by this time?' she said aloud. 'I must be getting somewhere near the centre of the earth. Let me see: that would be four thousand miles down, I think--' (for, you see, Alice had learnt several things of this sort in her lessons in the schoolroom, and though this was not a VERY good opportunity for showing off her knowledge, as there was no one to listen to her, still it was good practice to say it over) '--yes, that's about the right distance--but then I wonder what Latitude or Longitude I've got to?' (Alice had no idea what Latitude was, or Longitude either, but thought they were nice grand words to say.) Presently she began again. 'I wonder if I shall fall right THROUGH the earth! How funny it'll seem to come out among the people that walk with their heads downward! The Antipathies, I think--' (she was rather glad there WAS no one listening, this time, as it didn't sound at all the right word) '--but I shall have to ask them what the name of the country is, you know. Please, Ma'am, is this New Zealand or Australia?' (and she tried to curtsey as she spoke--fancy CURTSEYING as you're falling through the air! Do you think you could manage it?) 'And what an ignorant little girl she'll think me for asking! No, it'll never do to ask: perhaps I shall see it written up somewhere.' Down, down, down. There was nothing else to do, so Alice soon began talking again. 'Dinah'll miss me very much to-night, I should think!' (Dinah was the cat.) 'I hope they'll remember her saucer of milk at tea-time. Dinah my dear! I wish you were down here with me! There are no mice in the air, I'm afraid, but you might catch a bat, and that's very like a mouse, you know. But do cats eat bats, I wonder?' And here Alice began to get rather sleepy, and went on saying to herself, in a dreamy sort of way, 'Do cats eat bats? Do cats eat bats?' and sometimes, 'Do bats eat cats?' 
for, you see, as she couldn't answer either question, it didn't much matter which way she put it. She felt that she was dozing off, and had just begun to dream that she was walking hand in hand with Dinah, and saying to her very earnestly, 'Now, Dinah, tell me the truth: did you ever eat a bat?' when suddenly, thump! thump! down she came upon a heap of sticks and dry leaves, and the fall was over. Alice was not a bit hurt, and she jumped up on to her feet in a moment: she looked up, but it was all dark overhead; before her was another long passage, and the White Rabbit was still in sight, hurrying down it. There was not a moment to be lost: away went Alice like the wind, and was just in time to hear it say, as it turned a corner, 'Oh my ears and whiskers, how late it's getting!' She was close behind it when she turned the corner, but the Rabbit was no longer to be seen: she found herself in a long, low hall, which was lit up by a row of lamps hanging from the roof. There were doors all round the hall, but they were all locked; and when Alice had been all the way down one side and up the other, trying every door, she walked sadly down the middle, wondering how she was ever to get out again. Suddenly she came upon a little three-legged table, all made of solid glass; there was nothing on it except a tiny golden key, and Alice's first thought was that it might belong to one of the doors of the hall; but, alas! either the locks were too large, or the key was too small, but at any rate it would not open any of them. However, on the second time round, she came upon a low curtain she had not noticed before, and behind it was a little door about fifteen inches high: she tried the little golden key in the lock, and to her great delight it fitted! Alice opened the door and found that it led into a small passage, not much larger than a rat-hole: she knelt down and looked along the passage into the loveliest garden you ever saw. 
How she longed to get out of that dark hall, and wander about among those beds of bright flowers and those cool fountains, but she could not even get her head through the doorway; 'and even if my head would go through,' thought poor Alice, 'it would be of very little use without my shoulders. Oh, how I wish I could shut up like a telescope! I think I could, if I only know how to begin.' For, you see, so many out-of-the-way things had happened lately, that Alice had begun to think that very few things indeed were really impossible. There seemed to be no use in waiting by the little door, so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes: this time she found a little bottle on it, ('which certainly was not here before,' said Alice,) and round the neck of the bottle was a paper label, with the words 'DRINK ME' beautifully printed on it in large letters. It was all very well to say 'Drink me,' but the wise little Alice was not going to do THAT in a hurry. 'No, I'll look first,' she said, 'and see whether it's marked "poison" or not'; for she had read several nice little histories about children who had got burnt, and eaten up by wild beasts and other unpleasant things, all because they WOULD not remember the simple rules their friends had taught them: such as, that a red-hot poker will burn you if you hold it too long; and that if you cut your finger VERY deeply with a knife, it usually bleeds; and she had never forgotten that, if you drink much from a bottle marked 'poison,' it is almost certain to disagree with you, sooner or later. However, this bottle was NOT marked 'poison,' so Alice ventured to taste it, and finding it very nice, (it had, in fact, a sort of mixed flavour of cherry-tart, custard, pine-apple, roast turkey, toffee, and hot buttered toast,) she very soon finished it off. * * * * * * * * * * * * * * * * * * * * 'What a curious feeling!' 
said Alice; 'I must be shutting up like a telescope.' And so it was indeed: she was now only ten inches high, and her face brightened up at the thought that she was now the right size for going through the little door into that lovely garden. First, however, she waited for a few minutes to see if she was going to shrink any further: she felt a little nervous about this; 'for it might end, you know,' said Alice to herself, 'in my going out altogether, like a candle. I wonder what I should be like then?' And she tried to fancy what the flame of a candle is like after the candle is blown out, for she could not remember ever having seen such a thing. After a while, finding that nothing more happened, she decided on going into the garden at once; but, alas for poor Alice! when she got to the door, she found she had forgotten the little golden key, and when she went back to the table for it, she found she could not possibly reach it: she could see it quite plainly through the glass, and she tried her best to climb up one of the legs of the table, but it was too slippery; and when she had tired herself out with trying, the poor little thing sat down and cried. 'Come, there's no use in crying like that!' said Alice to herself, rather sharply; 'I advise you to leave off this minute!' She generally gave herself very good advice, (though she very seldom followed it), and sometimes she scolded herself so severely as to bring tears into her eyes; and once she remembered trying to box her own ears for having cheated herself in a game of croquet she was playing against herself, for this curious child was very fond of pretending to be two people. 'But it's no use now,' thought poor Alice, 'to pretend to be two people! Why, there's hardly enough of me left to make ONE respectable person!' Soon her eye fell on a little glass box that was lying under the table: she opened it, and found in it a very small cake, on which the words 'EAT ME' were beautifully marked in currants. 
'Well, I'll eat it,' said Alice, 'and if it makes me grow larger, I can reach the key; and if it makes me grow smaller, I can creep under the door; so either way I'll get into the garden, and I don't care which happens!' She ate a little bit, and said anxiously to herself, 'Which way? Which way?', holding her hand on the top of her head to feel which way it was growing, and she was quite surprised to find that she remained the same size: to be sure, this generally happens when one eats cake, but Alice had got so much into the way of expecting nothing but out-of-the-way things to happen, that it seemed quite dull and stupid for life to go on in the common way. So she set to work, and very soon finished off the cake. * * * * * * * * * * * * * * * * * * * * CHAPTER II. The Pool of Tears 'Curiouser and curiouser!' cried Alice (she was so much surprised, that for the moment she quite forgot how to speak good English); 'now I'm opening out like the largest telescope that ever was! Good-bye, feet!' (for when she looked down at her feet, they seemed to be almost out of sight, they were getting so far off). 'Oh, my poor little feet, I wonder who will put on your shoes and stockings for you now, dears? I'm sure _I_ shan't be able! I shall be a great deal too far off to trouble myself about you: you must manage the best way you can;--but I must be kind to them,' thought Alice, 'or perhaps they won't walk the way I want to go! Let me see: I'll give them a new pair of boots every Christmas.' And she went on planning to herself how she would manage it. 'They must go by the carrier,' she thought; 'and how funny it'll seem, sending presents to one's own feet! And how odd the directions will look! ALICE'S RIGHT FOOT, ESQ. HEARTHRUG, NEAR THE FENDER, (WITH ALICE'S LOVE). Oh dear, what nonsense I'm talking!' 
Just then her head struck against the roof of the hall: in fact she was now more than nine feet high, and she at once took up the little golden key and hurried off to the garden door. Poor Alice! It was as much as she could do, lying down on one side, to look through into the garden with one eye; but to get through was more hopeless than ever: she sat down and began to cry again. 'You ought to be ashamed of yourself,' said Alice, 'a great girl like you,' (she might well say this), 'to go on crying in this way! Stop this moment, I tell you!' But she went on all the same, shedding gallons of tears, until there was a large pool all round her, about four inches deep and reaching half down the hall. After a time she heard a little pattering of feet in the distance, and she hastily dried her eyes to see what was coming. It was the White Rabbit returning, splendidly dressed, with a pair of white kid gloves in one hand and a large fan in the other: he came trotting along in a great hurry, muttering to himself as he came, 'Oh! the Duchess, the Duchess! Oh! won't she be savage if I've kept her waiting!' Alice felt so desperate that she was ready to ask help of any one; so, when the Rabbit came near her, she began, in a low, timid voice, 'If you please, sir--' The Rabbit started violently, dropped the white kid gloves and the fan, and skurried away into the darkness as hard as he could go. Alice took up the fan and gloves, and, as the hall was very hot, she kept fanning herself all the time she went on talking: 'Dear, dear! How queer everything is to-day! And yesterday things went on just as usual. I wonder if I've been changed in the night? Let me think: was I the same when I got up this morning? I almost think I can remember feeling a little different. But if I'm not the same, the next question is, Who in the world am I? Ah, THAT'S the great puzzle!' 
And she began thinking over all the children she knew that were of the same age as herself, to see if she could have been changed for any of them. 'I'm sure I'm not Ada,' she said, 'for her hair goes in such long ringlets, and mine doesn't go in ringlets at all; and I'm sure I can't be Mabel, for I know all sorts of things, and she, oh! she knows such a very little! Besides, SHE'S she, and I'm I, and--oh dear, how puzzling it all is! I'll try if I know all the things I used to know. Let me see: four times five is twelve, and four times six is thirteen, and four times seven is--oh dear! I shall never get to twenty at that rate! However, the Multiplication Table doesn't signify: let's try Geography. London is the capital of Paris, and Paris is the capital of Rome, and Rome--no, THAT'S all wrong, I'm certain! I must have been changed for Mabel! I'll try and say "How doth the little--"' and she crossed her hands on her lap as if she were saying lessons, and began to repeat it, but her voice sounded hoarse and strange, and the words did not come the same as they used to do:-- 'How doth the little crocodile Improve his shining tail, And pour the waters of the Nile On every golden scale! 'How cheerfully he seems to grin, How neatly spread his claws, And welcome little fishes in With gently smiling jaws!' 'I'm sure those are not the right words,' said poor Alice, and her eyes filled with tears again as she went on, 'I must be Mabel after all, and I shall have to go and live in that poky little house, and have next to no toys to play with, and oh! ever so many lessons to learn! No, I've made up my mind about it; if I'm Mabel, I'll stay down here! It'll be no use their putting their heads down and saying "Come up again, dear!" I shall only look up and say "Who am I then? Tell me that first, and then, if I like being that person, I'll come up: if not, I'll stay down here till I'm somebody else"--but, oh dear!' 
cried Alice, with a sudden burst of tears, 'I do wish they WOULD put their heads down! I am so VERY tired of being all alone here!' As she said this she looked down at her hands, and was surprised to see that she had put on one of the Rabbit's little white kid gloves while she was talking. 'How CAN I have done that?' she thought. 'I must be growing small again.' She got up and went to the table to measure herself by it, and found that, as nearly as she could guess, she was now about two feet high, and was going on shrinking rapidly: she soon found out that the cause of this was the fan she was holding, and she dropped it hastily, just in time to avoid shrinking away altogether. 'That WAS a narrow escape!' said Alice, a good deal frightened at the sudden change, but very glad to find herself still in existence; 'and now for the garden!' and she ran with all speed back to the little door: but, alas! the little door was shut again, and the little golden key was lying on the glass table as before, 'and things are worse than ever,' thought the poor child, 'for I never was so small as this before, never! And I declare it's too bad, that it is!' As she said these words her foot slipped, and in another moment, splash! she was up to her chin in salt water. Her first idea was that she had somehow fallen into the sea, 'and in that case I can go back by railway,' she said to herself. (Alice had been to the seaside once in her life, and had come to the general conclusion, that wherever you go to on the English coast you find a number of bathing machines in the sea, some children digging in the sand with wooden spades, then a row of lodging houses, and behind them a railway station.) However, she soon made out that she was in the pool of tears which she had wept when she was nine feet high. 'I wish I hadn't cried so much!' said Alice, as she swam about, trying to find her way out. 'I shall be punished for it now, I suppose, by being drowned in my own tears! 
That WILL be a queer thing, to be sure! However, everything is queer to-day.' Just then she heard something splashing about in the pool a little way off, and she swam nearer to make out what it was: at first she thought it must be a walrus or hippopotamus, but then she remembered how small she was now, and she soon made out that it was only a mouse that had slipped in like herself. 'Would it be of any use, now,' thought Alice, 'to speak to this mouse? Everything is so out-of-the-way down here, that I should think very likely it can talk: at any rate, there's no harm in trying.' So she began: 'O Mouse, do you know the way out of this pool? I am very tired of swimming about here, O Mouse!' (Alice thought this must be the right way of speaking to a mouse: she had never done such a thing before, but she remembered having seen in her brother's Latin Grammar, 'A mouse--of a mouse--to a mouse--a mouse--O mouse!') The Mouse looked at her rather inquisitively, and seemed to her to wink with one of its little eyes, but it said nothing. 'Perhaps it doesn't understand English,' thought Alice; 'I daresay it's a French mouse, come over with William the Conqueror.' (For, with all her knowledge of history, Alice had no very clear notion how long ago anything had happened.) So she began again: 'Ou est ma chatte?' which was the first sentence in her French lesson-book. The Mouse gave a sudden leap out of the water, and seemed to quiver all over with fright. 'Oh, I beg your pardon!' cried Alice hastily, afraid that she had hurt the poor animal's feelings. 'I quite forgot you didn't like cats.' 'Not like cats!' cried the Mouse, in a shrill, passionate voice. 'Would YOU like cats if you were me?' 'Well, perhaps not,' said Alice in a soothing tone: 'don't be angry about it. And yet I wish I could show you our cat Dinah: I think you'd take a fancy to cats if you could only see her. 
She is such a dear quiet thing,' Alice went on, half to herself, as she swam lazily about in the pool, 'and she sits purring so nicely by the fire, licking her paws and washing her face--and she is such a nice soft thing to nurse--and she's such a capital one for catching mice--oh, I beg your pardon!' cried Alice again, for this time the Mouse was bristling all over, and she felt certain it must be really offended. 'We won't talk about her any more if you'd rather not.' 'We indeed!' cried the Mouse, who was trembling down to the end of his tail. 'As if I would talk on such a subject! Our family always HATED cats: nasty, low, vulgar things! Don't let me hear the name again!' 'I won't indeed!' said Alice, in a great hurry to change the subject of conversation. 'Are you--are you fond--of--of dogs?' The Mouse did not answer, so Alice went on eagerly: 'There is such a nice little dog near our house I should like to show you! A little bright-eyed terrier, you know, with oh, such long curly brown hair! And it'll fetch things when you throw them, and it'll sit up and beg for its dinner, and all sorts of things--I can't remember half of them--and it belongs to a farmer, you know, and he says it's so useful, it's worth a hundred pounds! He says it kills all the rats and--oh dear!' cried Alice in a sorrowful tone, 'I'm afraid I've offended it again!' For the Mouse was swimming away from her as hard as it could go, and making quite a commotion in the pool as it went. So she called softly after it, 'Mouse dear! Do come back again, and we won't talk about cats or dogs either, if you don't like them!' When the Mouse heard this, it turned round and swam slowly back to her: its face was quite pale (with passion, Alice thought), and it said in a low trembling voice, 'Let us get to the shore, and then I'll tell you my history, and you'll understand why it is I hate cats and dogs.' 
It was high time to go, for the pool was getting quite crowded with the birds and animals that had fallen into it: there were a Duck and a Dodo, a Lory and an Eaglet, and several other curious creatures. Alice led the way, and the whole party swam to the shore. crm114-20100106-BlameMichelson.src/crm_pca_lib_fncts.c0000644000000000017500000002063511321154266020576 0ustar rootwsy// crm_pca_lib_fcnts.c - Principal Component Analysis //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. #include "crm_svm_matrix.h" #include "crm_pca_lib_fncts.h" /******************************************************************************************* *This is a variation of the VonMises power algorithm for sparse matrices. Let us have *x_1, x_2,.., x_n examples with column means xbar. Then the VonMises update is: * p <- sum_{x_i} ((x_i - xbar)*p)*xbar *However, if the x_i were sparse, doing this exactly makes the calculations nonsparse. *Therefore we break it into the following algorithm: * 1) Precompute xbardotp = xbar*p * 2) In a loop over the x_i, using sparse dot products, compute * a) p' = sum_{x_i}(x_i*p - xbar*p)*x_i * b) S = sum_{x_i}(x_i*p - xbar*p) * 3) Update p <- p' - S*xbar * *INPUT: M: The example matrix. * init_pca: An initial guess at the PCA vector or NULL if you don't have one * *OUTPUT: A pca solution consisting of the first principal component p and xbar*p. * *TYPES: For fastest running, first principal component should be left as NON_SPARSE. 
******************************************************************************************/ PCA_Solution *run_pca(Matrix *M, Vector *init_pca) { Vector *xbar = vector_make(M->cols, NON_SPARSE, MATR_PRECISE), *p = vector_make(M->cols, NON_SPARSE, MATR_PRECISE), *pold = vector_make(M->cols, NON_SPARSE, MATR_PRECISE), *row; VectorIterator vit; int i, loop_it = 0; double xbardotp, d, s, del, n; PCA_Solution *sol; MATR_DEBUG_MODE = PCA_DEBUG_MODE; //calculate the mean vector for (i = 0; i < M->rows; i++) { row = matr_get_row(M, i); vector_add(xbar, row, xbar); //will do fast addition for us } vector_multiply(xbar, 1.0/M->rows, xbar); if (!init_pca || vector_iszero(init_pca)) { //if we don't have an initial vector //make p a random vector vectorit_set_at_beg(&vit, p); for (i = 0; i < M->cols; i++) { vectorit_insert(&vit, i, rand()/(double)RAND_MAX, p); } } else { //otherwise, start at the initial input vector_copy(init_pca, p); } //normalize p vector_multiply(p, 1.0/norm(p), p); del = PCA_ACCURACY +1; //loop to calculate the principle component while (del > PCA_ACCURACY && loop_it < MAX_PCA_ITERATIONS) { //print out debug info if desired if (PCA_DEBUG_MODE >= PCA_DEBUG) { fprintf(stderr, "%d: delta = %.12lf\n", loop_it, del); } if (PCA_DEBUG_MODE >= PCA_LOOP) { fprintf(stderr, "p = "); vector_write_sp_fp(p, stderr); } xbardotp = dot(p, xbar); //mean vector dot p vector_copy(p, pold); //pold = p on the previous iteration vector_zero(p); //set p to be all zeros s = 0; //when this loop finishes: //s = sum_{rows}(row*pold - xbar*pold) //p = p' = sum_{rows}(row*pold - xbar*pold)*row for (i = 0; i < M->rows; i++) { row = matr_get_row(M, i); //ith row of M d = dot(row, pold) - xbardotp; //row*pold - xbar*pold s += d; //add row*pold - xbar*pold to s vector_add_multiple(p, row, d, p); //add (row*pold - xbar*pold)*row to p } //compute p = p' - sum_{rows}(row*pold - xbar*pold)*xbar // => p -> p - s*xbar vector_add_multiple(p, xbar, -1.0*s, p); //normalize p n = norm(p); while 
(1.0/n < SVM_EPSILON) { //this will create a problem //since we treat SVM_EPSILON as zero vector_multiply(p, SVM_EPSILON*10, p); n *= SVM_EPSILON*10; } vector_multiply(p, 1.0/n, p); //this is the distance between the old and new vector del = vector_dist2(p, pold); loop_it++; } if (PCA_DEBUG_MODE >= PCA_DEBUG) { fprintf(stderr, "Number of iterations: %d\n", loop_it); } //create the solution sol = (PCA_Solution *)malloc(sizeof(PCA_Solution)); sol->theta = p; sol->mudottheta = dot(p, xbar); //free everything vector_free(pold); vector_free(xbar); return sol; } /******************************************************************************************** *Removes zero columns from the example matrix and initial PCA guess. * *INPUT: X: example matrix * init_pca: initial PC guess if you have one or NULL. if init_pca is NON-NULL this will * remove only columns that are all zeros in the X matrix AND the init_pca vector so that * the new columns of X and init_pca correspond. * *OUTPUT: An expanding array mapping columns back so that A[i] = c indicates that what is * now the ith column of X used to be column c. *******************************************************************************************/ ExpandingArray *pca_preprocess(Matrix *X, Vector *init_pca) { ExpandingArray *colMap; if (init_pca) { matr_shallow_row_copy(X, X->rows, init_pca); } colMap = matr_remove_zero_cols(X); if (init_pca) { matr_erase_row(X, X->rows-1); } return colMap; } /******************************************************************************************* *Solves a PCA and returns its solution in *sol. * *INPUT: X: example matrix * sol: A pointer to a previous solution OR a pointer to NULL if there is no previous * solution. THIS POINTER SHOULD BE NON-NULL. If you have no initial solution, pass in * a pointer TO NULL. * *OUTPUT: A pca solution as pass-by-reference in *sol. 
******************************************************************************************/ void pca_solve(Matrix *X, PCA_Solution **sol) { PCA_Solution *new_sol; int i; Vector *row, *theta; VectorIterator vit; ExpandingArray *colMap; MATR_DEBUG_MODE = PCA_DEBUG_MODE; if (!X) { if (PCA_DEBUG_MODE) { fprintf(stderr, "Null example matrix.\n"); } if (sol && *sol) { pca_free_solution(*sol); *sol = NULL; } return; } if (!sol) { if (PCA_DEBUG_MODE) { fprintf(stderr, "Null pointer to old pca solution. If you have no pca solution pass in a POINTER to a null vector. Do not pass in a NULL pointer. I am returning.\n"); } return; } if (*sol) { theta = (*sol)->theta; } else { theta = NULL; } if (PCA_DEBUG_MODE >= PCA_DEBUG) { fprintf(stderr, "X is %d by %u with %d non-zero elements\n", X->rows, X->cols, X->nz); } colMap = pca_preprocess(X, theta); if (PCA_DEBUG_MODE >= PCA_DEBUG) { fprintf(stderr, "After preprocess X is %d by %u\n", X->rows, X->cols); } if (PCA_DEBUG_MODE >= PCA_LOOP) { fprintf(stderr, "X = \n"); matr_print(X); } //run pca new_sol = run_pca(X, theta); if (*sol) { pca_free_solution(*sol); } *sol = new_sol; if (!(*sol)) { //uh oh if (colMap) { expanding_array_free(colMap); } if (PCA_DEBUG_MODE) { fprintf(stderr, "PCA Solver Error.\n"); } return; } if (colMap) { //redensify theta and X vector_convert_nonsparse_to_sparray((*sol)->theta, colMap); matr_add_ncols(X, expanding_array_get(X->cols-1, colMap).compact->i+1 - X->cols); for (i = 0; i < X->rows; i++) { row = matr_get_row(X, i); if (!row) { continue; } vectorit_set_at_end(&vit, row); while (!vectorit_past_beg(vit, row)) { vectorit_set_col(vit, expanding_array_get (vectorit_curr_col(vit, row), colMap).compact->i, row); vectorit_prev(&vit, row); } } expanding_array_free(colMap); } if (PCA_DEBUG_MODE >= PCA_DEBUG) { fprintf(stderr, "Solver complete.\n"); } } int pca_classify(Vector *new_ex, PCA_Solution *sol) { double d = dot(new_ex, sol->theta) - sol->mudottheta; if (d > 0) { return 1; } if (d < 0) { return -1; } 
return 0; } /****************************************************************************************** *Frees a PCA solution. * *INPUT: solution to free. *****************************************************************************************/ void pca_free_solution(PCA_Solution *sol) { if (!sol) { return; } vector_free(sol->theta); free(sol); } crm114-20100106-BlameMichelson.src/cssdiff.c0000644000000000017500000001227511321154266016571 0ustar rootwsy// cssutil.c - utility for munging css files, version X0.1 // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" int main (int argc, char **argv) { long i,j,k; // some random counters, when we need a loop long hfsize, hfsize1, hfsize2; long f1, f2; long sim, diff, dom1, dom2, hclash, kclash; { struct stat statbuf; // filestat buffer FEATUREBUCKET_STRUCT *h1, *h2; // the text of the hash file // filename is argv [1] // and stat it to get it's length if(!argv[1] || !argv[2]) { fprintf (stdout, "Usage: cssdiff \n"); return (EXIT_SUCCESS); }; // quick check- does the first file even exist? k = stat (argv[1], &statbuf); if (k != 0) { fprintf (stderr, "\n CSS file '%s' not found. \n", argv[1]); exit (EXIT_FAILURE); }; // hfsize = statbuf.st_size; // mmap the hash file into memory so we can bitwhack it h1 = (FEATUREBUCKET_STRUCT *) crm_mmap_file (argv[1], 0, hfsize, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if (h1 == MAP_FAILED) { fprintf (stderr, "\n MMAP failed on file %s\n", argv[1]); exit (EXIT_FAILURE); }; hfsize1 = statbuf.st_size / sizeof (FEATUREBUCKET_STRUCT); // // and repeat the process for the second file: k = stat (argv[2], &statbuf); // quick check- does the file even exist? 
if (k != 0) { fprintf (stderr, "\n.CSS file '%s' not found.\n", argv[2]); exit (EXIT_FAILURE); }; hfsize2 = statbuf.st_size; // mmap the hash file into memory so we can bitwhack it h2 = (FEATUREBUCKET_STRUCT *) crm_mmap_file (argv[2], 0, hfsize2, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if (h2 == MAP_FAILED) { fprintf (stderr, "\n MMAP failed on file %s\n", argv[2]); exit (EXIT_FAILURE); }; hfsize2 = hfsize2 / sizeof (FEATUREBUCKET_STRUCT); fprintf (stderr, "Sparse spectra file %s has %ld bins total\n", argv[1], hfsize1); fprintf (stdout, "Sparse spectra file %s has %ld bins total\n", argv[2], hfsize2); // // if (hfsize1 != hfsize2) { fprintf (stderr, "\n.CSS files %s, %s :\n lengths differ: %ld vs %ld.\n", argv[1],argv[2], hfsize1, hfsize2); fprintf (stderr, "\n This is not a fatal error, but be warned.\n"); }; f1 = 0; f2 = 0; sim = 0; diff = 0; dom1 = 0; dom2 = 0; hclash = 0; kclash = 0; // // The algorithm - for each file, // for each bucket in each file // find corresponding bucket in other file // increment dom1 or dom2 as appropriate // always increment sim and diff // end // end // divide sim and diff by 2, as they are doublecounted // print statistics and exit. // // start at 1 - no need to check bin 0 (version). for ( i = 1; i < hfsize1; i++) { if ( h1[i].key != 0 ) { f1 += h1[i].value; k = h1[i].hash % hfsize2; if (k == 0) k = 1; while (h2[k].value != 0 && (h2[k].hash != h1[i].hash || h2[k].key != h1[i].key)) { k++; if (k >= hfsize2) k = 1; }; // Now we've found the corresponding (or vacant) slot in // h2. Do our tallies... j = h1[i].value ; if (j > h2[k].value ) j = h2[k].value; sim += j; j = h1[i].value - h2[k].value; if (j < 0) j = -j; diff += j; j = h1[i].value - h2[k].value; if (j < 0) j = 0; dom1 += j; }; }; // // And repeat for file 2. 
for ( i = 1; i < hfsize2; i++) { if ( h2[i].key != 0 ) { f2 += h2[i].value; k = h2[i].hash % hfsize1; if (k == 0) k = 1; while (h1[k].value != 0 && (h1[k].hash != h2[i].hash || h1[k].key != h2[i].key)) { k++; if (k >= hfsize1) k = 1; }; // Now we've found the corresponding (or vacant) slot in // h1. Do our tallies... j = h2[i].value ; if (j > h1[k].value ) j = h1[k].value; sim += j; j = h1[k].value - h2[i].value; if (j < 0) j = -j; diff += j; j = h2[i].value - h1[k].value; if (j < 0) j = 0; dom2 += j; }; }; fprintf (stdout, "\n File 1 total features : %12ld", f1); fprintf (stdout, "\n File 2 total features : %12ld\n", f2); fprintf (stdout, "\n Similarities between files : %12ld", sim/2); fprintf (stdout, "\n Differences between files : %12ld\n", diff/2); fprintf (stdout, "\n File 1 dominates file 2 : %12ld", dom1); fprintf (stdout, "\n File 2 dominates file 1 : %12ld\n", dom2); } return 0; } crm114-20100106-BlameMichelson.src/paolo_ov4.crm0000755000000000017500000000202311321154266017402 0ustar rootwsy#! /usr/bin/crm # # paolo_ov4.crm - paolo written test script # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. 
# window alter (:_dw:) /HHHHHHHHHHHHHHHHHH\nContent-Type: multipart; boundary="--0123456789"\nhhhhhhhhhhhhhhhhhhhhhh\n\nThis is a multi-part message in MIME format.\n----0123456789\nContent-Type: text\nContent-Transfer-Encoding: 7bit\n\nTTTTTTTTTTTTTTTTTTTTTTT\n\n----0123456789\nContent-Type: image\n name="clonic.GIF"\nContent-Transfer-Encoding: base64\niiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii\n\nooooooooooooooooooooooooooooooooo\noooooooooooooooooooooooooooo\n\n----0123456789--\n\n\n/ { match (:: :headers: :body:) /(.*?)\n\n(.*)/ } isolate (:headers:) { match [:headers:] /^Content-Type: .* boundary="(.+)"/ \ (:x: :boundary:) output /:boundary:=:*:boundary:\n/ } isolate (:c:) /0/ { eval (:c:) /:@::*:c:+1:/ match [:body:] (:x: :headers:) /\n--:*:boundary:\n(.+?)\n\n/ output /:c:=:*:c:\n:*:headers:\n\n/ liaf } crm114-20100106-BlameMichelson.src/Makefile0000644000000000017500000004642511321154266016450 0ustar rootwsy# # Makefile for CRM114 # # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # If you want to install the executables somewhere else, change # BINDIR here. Default is /usr/bin but you can change it to /usr/local/bin # if you prefer that sort of thing. # prefix?=/usr BINDIR=${prefix}/bin # VER_SUFFIX defines a version suffix for our installed executables, # handy when you want many versions of CRM114 coexisting. # # e.g.: VER_SUFFIX=27B-6, then if you make install, you get crm-27b-6 # VER_SUFFIX ?= # # The following forces the compiler to be GCC. If you have trouble # with your default compiler, and you want to force GCC, uncomment it. CC=gcc # # What version is this software (PLEASE don't change this just # to mess with my brain. 
- wsy) # CRM114_ARCHIVE = /homes/crm114/archive VERSION=20100106-BlameMichelson PREV_VERSION=20091209-BlameMichelson #PREV_VERSION=20090914-BlameThorstenAndJenny #PREV_VERSION=20090806-LastOldSVM #PREV_VERSION=20090423-BlameSteveJobs #PREV_VERSION=20090120-BlameSteveJobs #PREV_VERSION=20081111-BlameBarack #PREV_VERSION=20081023-BlameMaxwell #PREV_VERSION=20080923-BlameVT #PREV_VERSION=20080618-BlameBadGlasses #PREV_VERSION=20080502-BlameJason #PREV_VERSION=20080326-BlameSentansoken #PREV_VERSION=20080217-BlameJoeLangeway_VT #PREV_VERSION=20071201-BlameLittleStevie_VT #PREV_VERSION=20071126-BlameGerHobbelt #PREV_VERSION=20070917-BlameGerHobbelt #VERSION=20070830-BlameGerHobbelt #PREV_VERSION=20070828-BlameGerHobbelt #PREV_VERSION=20070827-BlameGerHobbelt #PREV_VERSION=20070826-BlameGerHobbelt #PREV_VERSION = 20070815-BlameBadData #PREV_VERSION = 20070810-BlameTheSegfault #PREV_VERSION = 20070505-BlameIdiocy #PREV_VERSION = 20061103-BlameDalkey #PREV_VERSION = 20061010-BlameBratko #PREV_VERSION = 20060920-BlameNico #PREV_VERSION = 20060704a-BlameRobert #PREV_VERSION = 20060619-MargarineBeast #PREV_VERSION = 20060611-ButterBeast #PREV_VERSION = 20060611-SomewhatTamedBeast #PREV_VERSION = 20060606-VersionOfTheBeast #PREV_VERSION = 20060429-ReaverReturn #PREV_VERSION = 20060118-BlameTheReavers #PREV_VERSION = 20051231a-BlameBarryAndPaolo #PREV_VERSION = 20051001-BlameRaulMiller #PREV_VERSION = crm114-20050721-BlameNeilArmstrong # # Define the TRE directory (used only for building distributions- note that # this must be HAND-UPDATED when new versions of TRE come out ) TRE_TARFILE = tre-0.7.5 # VERSION += [$(TRE_TARFILE)] # # # Are we compiling on a POSIX system or a Windows system? NOTE: # WINDOWS IS UNSUPPORTED BY THE AUTHOR. THE AUTHOR WILL # ACCEPT REASONABLE-LOOKING PATCHES BUT WINDOWS-ONLY # BUG REPORTS THAT DO NOT REPRODUCE UNDER LINUX ET AL # SIMPLY _CANNOT_ BE WORKED AS I HAVE NO WINDOWS MACHINES. 
# SO, IF YOU FIND A WINDOWS-ONLY BUG, PLEASE PUT IT ON THE # MAILING LIST SO WINDOWS PEOPLE CAN WORK THE BUG. # # CFLAGS += -DWIN32 # # Build our basic CFLAGS chunk # CFLAGS += -DVERSION='"$(VERSION)"' # # Known Neighbord On, only for debugging # CFLAGS += -DKNN_ON # # Ben Graphic, only for debugging # CFLAGS += -DBEN_GRAPHIC # # Stab test, long accepted # CFLAGS += -DSTAB_TEST # # Stochastic setting, unused in crm_neural_net.c # CFLAGS += -DSTOCHASTIC # # Tell the compiler full optimization "-O3", add debugging info "-g", # and warn on all warnable things "-Wall". Someday add -Wextra for # really picayune stuff. # # Our default is with full optimization, lots of warnings, and forcing # certain functions to be inline (I added that to force gcc to inline functions # under high optimization but not under low so that debugging is possible - # that's why DO_INLINES is defined at O3 but not O0 - JB): CFLAGS += -O3 -Wall -DDO_INLINES # turn this one on for no optimization, but debugging #CFLAGS += -g -O0 -Wall # # Turn this one on for time profiling. #CFLAGS += -pg -O0 # turn this one on for really picayune debugging: # CFLAGS += -Wextra # # Choose between static and dynamic linking (we recommend static linking) # Comment this out if you want dynamic linking # # BSD users - do NOT use -static-libgcc!!! # Also note that this keeps valgrind from running correctly so comment # it out for debugging. LDFLAGS += -static -static-libgcc # use this one if you want program profiling. #LDFLAGS += -static -static-libgcc -pg #LDFLAGS += -pg # # Any standard install flags? We nominally use protection 755 INSTALLFLAGS += -m 755 # # uncomment the next line if you want to "strip" the debugger info # from the binaries when installing. This speeds up load, but # you won't be able to submit meaningful CRM114 engine bug reports. # Default is don't strip. # INSTALLFLAGS += -s # # # Libraries to link against. 
LIBS += -lm # # --------- If you're compiling under *BSD, check these out: # # Simson Garfinkel suggests that you # uncomment the following to get a BSD-sane environment. Leave them # untouched (commented out) for Linux builds. # Add for FreeBSD: #CFLAGS += -I/usr/local/include #LDFLAGS += -L/usr/local/lib #LIBS += -lintl -liconv #LDFLAGS += -dynamic-libgcc # # Jeff Rice suggests the following for FreeBSD: #CFLAGS += -I/usr/local/include -I${HOME}/include #LDFLAGS += -L/usr/local/lib -L${HOME}/lib #LIBS += -lintl -liconv # # Nico Kadel-Garcia has added these flags for systems without logl or sqrtf # BSD has no logl function # CFLAGS += -DNO_LOGL # # BSD has no sqrtf function # CFLAGS += -DNO_SQRTF # ------------------ end of *BSD stuff # # ------------------For Mac OSX, try this--------------- # # Yimin Wu (of the SVMx classifiers) suggests that to get # a good compile on the Mac OSX machines, try: # # COMMENT OUT all the above "LDFLAGS += -static -static-libgcc -pg" # # and add # #CFLAGS += -I/usr/local/include #LDFLAGS += -l/usr/local/lib # # and change final options on the main link line from: # # "-lm -ltre -o crm114" to "-ltre -o crm114" # # PowerPPC has no sqrtf function # CFLAGS += -DNO_SQRTF # # --------------End of Mac OSX stuff ----------------------- # # # End of user-configurable options... if you modify anything below # this line, you risk early insanity and blindness. 
# # # These are the files needed to build the CRM114 engine; they don't # include the side utilities # CFILES = crm_main.c crm_compiler.c crm_errorhandlers.c \ crm_math_exec.c crm_var_hash_table.c crm_expandvar.c \ crm_stmt_parser.c crm_vector_tokenize.c \ crm_expr_alter.c crm_expr_match.c crm_css_maintenance.c \ crm_markovian.c crm_osb_bayes.c crm_osb_hyperspace.c \ crm_correlate.c crm_osb_winnow.c crm_winnow_maintenance.c \ crm_osbf_bayes.c crm_osbf_maintenance.c crm_bit_entropy.c \ crm_neural_net.c crm_expr_clump.c \ crm_expr_window.c crm_expr_isolate.c crm_expr_file_io.c \ crm_expr_syscall.c crm_expr_classify.c crm_expr_translate.c \ crm_exec_engine.c crm_debugger.c crm_str_funcs.c \ crm_preprocessor.c crmregex_tre.c \ crm_stats.c crm_expr_sks.c\ crm_svm_matrix_util.c crm_svm_quad_prog.c crm_svm_matrix.c\ crm_svm_lib_fncts.c crm_svm.c crm_pca_lib_fncts.c crm_pca.c\ crm_fast_substring_compression.c HFILES = Makefile crm114_sysincludes.h crm114_structs.h crm114_config.h \ crm114.h crm114_osbf.h crm_svm_matrix_util.h crm_svm_quad_prog.h\ crm_svm_matrix.h crm_svm_lib_fncts.h crm_svm.h crm_pca_lib_fncts.h\ crm_pca.h OFILES = crm_main.o crm_compiler.o crm_errorhandlers.o \ crm_math_exec.o crm_var_hash_table.o crm_expandvar.o \ crm_stmt_parser.o crm_vector_tokenize.o \ crm_expr_alter.o crm_expr_match.o crm_css_maintenance.o \ crm_markovian.o crm_osb_bayes.o crm_osb_hyperspace.o \ crm_correlate.o crm_osb_winnow.o crm_winnow_maintenance.o \ crm_osbf_bayes.o crm_osbf_maintenance.o crm_bit_entropy.o \ crm_neural_net.o crm_expr_clump.o \ crm_expr_window.o crm_expr_isolate.o crm_expr_file_io.o \ crm_expr_syscall.o crm_expr_classify.o crm_expr_translate.o \ crm_exec_engine.o crm_debugger.o crm_str_funcs.o \ crm_preprocessor.o crmregex_tre.c \ crm_expr_sks.o crm_stats.o \ crm_svm_matrix_util.o crm_svm_quad_prog.o crm_svm_matrix.o\ crm_svm_lib_fncts.o crm_svm.o crm_pca_lib_fncts.o crm_pca.o\ crm_fast_substring_compression.o UTILFILES = osbf-util.c cssutil.c cssdiff.c 
cssmerge.c crm_util_errorhandlers.c CRMFILES = mailfilter.crm mailfilter.cf \ mailreaver.crm mailtrainer.crm \ maillib.crm shuffle.crm \ rewriteutil.crm \ matchtest.crm windowtest.crm overalterisolatedtest.crm exectest.crm \ learntest.crm classifytest.crm \ escapetest.crm argtest.crm beeptest.crm skudtest.crm aliustest.crm \ nestaliustest.crm eval_infiniteloop.crm \ traptest.crm fataltraptest.crm uncaughttraptest.crm \ inserttest_a.crm inserttest_b.crm inserttest_c.crm \ backwardstest.crm approxtest.crm \ mathalgtest.crm mathrpntest.crm \ indirecttest.crm translate_tr.crm zz_translate_test.crm \ rewritetest.crm test_rewrites.mfp \ randomiotest.crm isolate_reclaim_test.crm \ match_isolate_test.crm match_isolate_reclaim.crm \ call_return_test.crm defaulttest.crm alternating_example_neural.crm \ shroud.crm quine.crm alternating_example_svm.crm \ alternating_example_pca.crm \ bracktest.crm unionintersecttest.crm \ statustest.crm windowtest_fromvar.crm paolo_overvars.crm \ paolo_ov2.crm paolo_ov3.crm paolo_ov4.crm paolo_ov5.crm \ pad.crm calc.crm \ gatlingsort.crm tenfold_validate.crm\ test_svm.crm \ classifymail.crm TEXTFILES = COLOPHON.txt pad.dat \ QUICKREF.txt INTRO.txt KNOWNBUGS.txt FAQ.txt THINGS_TO_DO.txt \ CLASSIFY_DETAILS.txt README CRM114_Mailfilter_HOWTO.txt \ Alice_In_Wonderland_Chap_1_And_2.txt Macbeth_Act_IV.txt \ Hound_of_the_Baskervilles_first_500_lines.txt \ The_Wind_in_the_Willows_Chap_1.txt \ reto_procmailrc.recipe \ priolist.mfp blacklist.mfp whitelist.mfp rewrites.mfp \ priolist.mfp.example blacklist.mfp.example whitelist.mfp.example \ crm114-mode.el \ crm114.spec \ inoc_passwd.txt \ GPL_License.txt \ megatest.sh megatest_knowngood.log # # all: crm114 cssutil cssdiff cssmerge osbf-util # # Done. # # *** Type "make help" to see what else you can make. # help: FORCE # # You can make the following things: # # make -or- make all - builds the binaries. This uses the # # make crm114 - build CRM binary. 
This uses the # TRE regex library # make install - builds and installs the binaries. # You must be root to do this. # make uninstall - uninstalls the binaries. You must # be root to do this. # make megatest - runs the full test suite ( takes # about ~1 minute, TRE version _ONLY_! ) # make clean - deletes all .o and executables. # Don't do this on a binary kit! # make cssfiles - builds a set of .css files from data # in ./good.dir/* and ./spam.dir/* # make distribtion - builds the .gzballs for distribution # of with your mods built in. crm114: $(OFILES) $(CC) $(LDFLAGS) $(LIBS) $(OFILES) \ -ltre -lm \ -o crm114 # # Done. Type "make help" to see what else you can make. # *.o: $(HFILES) .c.o: $(CC) $(CFLAGS) -c $*.c cssutil: cssutil.o crm_css_maintenance.o crm_util_errorhandlers.o crm_str_funcs.o $(HFILES) $(CC) $(CFLAGS) $(LDFLAGS) cssutil.o \ crm_css_maintenance.o \ crm_util_errorhandlers.o \ crm_str_funcs.o -lm -ltre -o cssutil osbf-util: osbf-util.o crm_osbf_maintenance.o crm_util_errorhandlers.o crm_str_funcs.o $(HFILES) $(CC) $(CFLAGS) $(LDFLAGS) osbf-util.o \ crm_str_funcs.o \ crm_util_errorhandlers.o \ crm_osbf_maintenance.o -lm -ltre -o osbf-util cssdiff: cssdiff.o crm_util_errorhandlers.o crm_str_funcs.o $(HFILES) $(CC) $(CFLAGS) $(LDFLAGS) \ crm_util_errorhandlers.o \ crm_str_funcs.o -ltre cssdiff.o -o cssdiff cssmerge: cssmerge.o crm_util_errorhandlers.o crm_str_funcs.o $(HFILES) $(CC) $(CFLAGS) $(LDFLAGS) \ cssmerge.o crm_str_funcs.o \ crm_util_errorhandlers.o \ -ltre -o cssmerge clean: -rm -f crm114 -rm -f cssutil -rm -f osbf-util -rm -f cssdiff -rm -f cssmerge -rm -f crm114 -rm -f crm114_tre -rm -f *.o install: FORCE $(MAKE) install_engine install_utils # # Done. Type "make help" to see what else you can make. 
# install_utils: # # Installing Utilities # $(MAKE) cssmerge cssutil cssdiff osbf-util install $(INSTALLFLAGS) cssdiff $(BINDIR)/cssdiff$(VER_SUFFIX) install $(INSTALLFLAGS) cssmerge $(BINDIR)/cssmerge$(VER_SUFFIX) install $(INSTALLFLAGS) cssutil $(BINDIR)/cssutil$(VER_SUFFIX) install $(INSTALLFLAGS) osbf-util $(BINDIR)/osbf-util$(VER_SUFFIX) # if you don't have EMACS installed in the default place, you # won't get the crm114-mode.el automatically. -install $(INSTALLFLAGS) crm114-mode.el $(prefix)/share/emacs/site-lisp/crm114-mode.el$(VER_SUFFIX) # use different name so we can keep it around along with default one # and same VER_SUFFIX postfix install_engine: crm114 # # Installing engine # install $(INSTALLFLAGS) crm114 $(BINDIR)/crm$(VER_SUFFIX) install_binary_only: FORCE install $(INSTALLFLAGS) crm114 $(BINDIR)/crm$(VER_SUFFIX) install $(INSTALLFLAGS) cssdiff $(BINDIR)/cssdiff$(VER_SUFFIX) install $(INSTALLFLAGS) cssmerge $(BINDIR)/cssmerge$(VER_SUFFIX) install $(INSTALLFLAGS) cssutil $(BINDIR)/cssutil$(VER_SUFFIX) install $(INSTALLFLAGS) osbf-util $(BINDIR)/osbf-util$(VER_SUFFIX) -install $(INSTALLFLAGS) crm114-mode.el $(prefix)/share/emacs/site-lisp/crm114-mode.el$(VER_SUFFIX) uninstall: FORCE -rm -rf $(BINDIR)/crm$(VER_SUFFIX) -rm -rf $(BINDIR)/crmg$(VER_SUFFIX) -rm -rf $(BINDIR)/crma$(VER_SUFFIX) -rm -rf $(BINDIR)/cssdiff$(VER_SUFFIX) -rm -rf $(BINDIR)/cssmerge$(VER_SUFFIX) -rm -rf $(BINDIR)/cssutil$(VER_SUFFIX) -rm -rf $(BINDIR)/osbf-util$(VER_SUFFIX) -rm -rf $(prefix)/share/emacs/site-lisp/crm114-mode.el$(VER_SUFFIX) megatest: FORCE # # This runs a moderately interesting set of base tests # to exercise much of CRM114 under TRE. This takes about # 1 minute to run on a 1.6 GHz Pentium-M laptop. Please # be patient; you (hopefully) won't see anything till the # full set of tests complete. If you didn't use TRE, all # bets are off. # # Lines of output that start with OK_IF_mumble are allowed # to change values. No other lines should. 
If other lines # do change, either your kit isn't quite right or your # install is broken (or you've found a bug). # ### The >> in the output forces line-at-a-time buffering. rm -rf megatest_test.log ./megatest.sh >>megatest_test.log 2>>megatest_test.log # ./megatest.sh >megatest_test.log 2>&1 #./megatest.sh 2>&1 | tee megatest_test.log #diff -U 5 megatest_knowngood.log megatest_test.log & sleep 1 diff megatest_knowngood.log megatest_test.log & sleep 1 neurotest: FORCE ./nn_segfault_test.sh cssfiles: spam.dir good.dir # # This may take a LONG TIME to run, up to 10 minutes per megabyte # of text. Please adjust your expectations accordingly. # # Also, be advised that relearning the same text more than once # can be ill-advised. Remove the .css files first if you are # relearning your entire corpus (new .css files will be created # if they don't already exist) # # If your text corpi are very large, you may need to rebuild # crm114 with a larger default window size - or alter the commands # below to use a bigger -w than your default # # If you have NOT yet set up your mailfilter.cf and mailfilter.mfp # files, hit ^C NOW. REALLY!!! Fix that, and rerun. sleep 5 # Here we go... # export DATE $(date +%Y_%m_%d_%H_%M_%S) # MUCH STUFF DEPRECATED!!! Use mailtrainer.crm instead... # cp spamtext.txt saved_spamtext__$(DATE).txt # cp nonspamtext.txt saved_nonspamtext__$(DATE).txt cssutil -b -r spam.css cssutil -b -r nonspam.css cp spam.css spam__$(DATE).txt cp nonspam.css nonspam__$(DATE).txt # ./crm114 mailfilter.crm --learnspam --force < saved_spamtext__$(DATE).txt # ./crm114 mailfilter.crm --learnnonspam --force < saved_nonspamtext__$(DATE).txt # # run mailtrainer.crm on the spam.dir and good.dir directories. mailtrainer.crm \ --good=./good.dir/ \ --spam=./spam.dir/ \ --repeat=5 \ --random # If all went well with the above, you will have backup copies of # your spam and nonspam .css files. 
# You may now delete these files with impunity, or you may # choose to keep them around as backup. tarfile: all check_permissions rm -rf crm114-$(VERSION).src mkdir -m 0755 crm114-$(VERSION).src cp $(CFILES) \ $(HFILES) \ $(UTILFILES) \ $(CRMFILES) \ $(TEXTFILES) \ crm114-$(VERSION).src tar -cvf crm114-$(VERSION).src.tar crm114-$(VERSION).src rm -rf crm114-$(VERSION).src chmod 0644 crm114-$(VERSION).src.tar src_gzip: tarfile rm -rf crm114-$(VERSION).src.tar.gz gzip crm114-$(VERSION).src.tar chmod 0644 crm114-$(VERSION).src.tar.gz i386_tarfile: all rm -rf crm114-$(VERSION).i386 mkdir -m 0755 crm114-$(VERSION).i386 cp crm114 cssutil cssdiff cssmerge osbf-util \ Makefile \ CRM114_Mailfilter_HOWTO.txt \ COLOPHON.txt \ QUICKREF.txt INTRO.txt KNOWNBUGS.txt FAQ.txt THINGS_TO_DO.txt \ mailfilter.crm mailfilter.cf \ mailreaver.crm mailtrainer.crm \ maillib.crm shuffle.crm \ rewriteutil.crm rewrites.mfp \ whitelist.mfp blacklist.mfp priolist.mfp \ priolist.mfp.example blacklist.mfp.example whitelist.mfp.example \ crm114-mode.el \ GPL_License.txt \ crm114-$(VERSION).i386 tar -cvf crm114-$(VERSION).i386.tar crm114-$(VERSION).i386 rm -rf crm114-$(VERSION).i386 i386_gzip: i386_tarfile rm -rf crm114-$(VERSION).i386.tar.gz gzip crm114-$(VERSION).i386.tar chmod 0644 crm114-$(VERSION).i386.tar.gz css_tarfile: spam.css nonspam.css rm -rf crm114-$(VERSION).css mkdir crm114-$(VERSION).css cp spam.css nonspam.css \ crm114-$(VERSION).css tar -cvf crm114-$(VERSION).css.tar crm114-$(VERSION).css rm -rf crm114-$(VERSION).css chmod 0644 crm114-$(VERSION).css.tar css_gzip: css_tarfile rm -rf crm114-$(VERSION).css.tar.gz gzip crm114-$(VERSION).css.tar chmod 0644 crm114-$(VERSION).css.tar.gz distribution: install src_gzip i386_gzip md5sum crm114-$(VERSION).*.tar.gz upload_tarball: distribution rsync -zrltDv \ crm114-$(VERSION).src.tar.gz \ crm114-$(VERSION).i386.tar.gz \ wsy,crm114@web.sourceforge.net:htdocs/tarballs/ upload_tarball_latest: i386_gzip rsync -zrltDv crm114-$(VERSION).i386.tar.gz 
\ wsy,crm114@web.sourceforge.net:htdocs/tarballs/crm114-latest.i386.tar.gz rsync -zrltDv crm114-$(VERSION).src.tar.gz \ wsy,crm114@web.sourceforge.net:htdocs/tarballs/crm114-latest.src.tar.gz echo $(VERSION) > 00-LATEST-IS rsync -auv 00-LATEST-IS \ wsy,crm114@web.sourceforge.net:htdocs/tarballs/ rsync: FORCE check_permissions echo "Options Indexes" > .htaccess chmod 0644 .htaccess rsync -zprltDv .htaccess \ $(CFILES) $(HFILES) $(UTILFILES) $(CRMFILES) $(TEXTFILES)\ wsy,crm114@web.sourceforge.net:htdocs/src check_permissions: # # Setting file permissions # find -name .git -prune -o -type d ! -perm 2775 -exec chmod 2775 '{}' \; -print find -name .git -prune -o -type f \ ! -name \*\.crm \ ! -name \*\.sh \ ! -name crm114 \ ! -perm 0644 \ -exec chmod 0644 '{}' \; -print find -name .git -prune -o -type f -name \*\.crm ! -perm 0755 -exec chmod 0755 '{}' \; -print find -name .git -prune -o -type f -name \*\.sh ! -perm 0755 -exec chmod 0755 '{}' \; -print find -name .git -prune -o -type f -name crm114 ! -perm 0755 -exec chmod 0755 '{}' \; -print FORCE: crm114-20100106-BlameMichelson.src/pad.crm0000755000000000017500000001216011321154266016247 0ustar rootwsy#! /usr/bin/crm # -(help file delete) # # pad.crm - Program for Associative Data - PAD # # Note to SunOS and FreeBSD users - do not place command arguments of # "-([arguments])" format on the first line of this program # or you will not get what you expect. This is due to a kernel # difference in how a bangline should be dealt with. # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # # Program for Associative Data - PAD # # this is meant to be used as a handy notepad for phone numbers, addresses, # things like that; data that's in some way ephemeral but may also need # to be durable. # window # Step 0 - check to see if we have any args at all. If not, # we insert --help into our argument stream. 
{ { match [:_pos2:] /./ } trap /Attempt/ { isolate (:_pos2:) alter (:_pos2:) /--help/ } } # Step 1 - are we being asked for help? If so, print out help and # cleanly exit. { match [:_pos2:] /(-h)|(--help)/ output / pad - Program for Associative Data \n/ output / This is a general-purpose, persistent-data notepad.\n/ output / pad --help - this text \n/ output / pad + data data... - remember the data \n/ output / pad --delete data - forget any line containing the data \n/ output / pad data - find all references containing the data\n/ output / pad @ data - best match to data, extra @'s mean more matches\n/ output / pad --file=filename - use filename instead of pad.dat for data\n/ exit /0/ } # Step 2 - check to see if we were given a filename; if not, # we default to "pad.dat" { # Get the data file name if one is supplied # isolate (:file:) { match [:file:] /./ alter (:file:) /pad.dat/ } { input [:*:file:] (:data:) } trap /couldn't open file/ { output /Couldn't find the file ":*:file:", I'll create it.\n/ output [:*:file:] // isolate (:data:) // } } # Step 3 - now we have a context-sensitive situation - sometimes # our control arg is markered (i.e. --delete == "SET"), # sometimes it's positional (i.e. '+', which shows up as # the variable :_pos3:). So we splice together pos4 onward, # and independently check to see if we need to frontsplice # the pos3 argument { # start with an empty :cmdlineargs: # isolate (:cmdlineargs:) /:*:_pos_str:/ # # and give me the part that is after the args (since we specify # a -(helpset) this is starting at the third token, so we # explicitly match away the first two [[:graph:]]+ tokens in # the :_pos_str: array. # match [:cmdlineargs:] (:z: :cmdlineargs:) /[[:graph:]]+ [[:graph:]]+ (.*)/ # output /CMA = >>:*:cmdlineargs:< /^\\+ (.*)$/ { match [:data:] /:*:vals:/ output /Sorry, but I already know :*:vals: :*:_nl:/ exit /0/ } output [:*:file:] /:*:vals::*:_nl:/ exit /0/ } # Do we want to --delete (that is, forget) something? 
# # Remember, we need to make a backup copy as well, in # case the user decides not to forget it (or makes a mistake). { # if there is a --delete, we find all lines # with :cmdlineargs: and remove them match [:delete:] /SET/ { match [:data:] (:dead:) /^.*:*:cmdlineargs:.*$/ alter (:dead:) // # get rid of the following newline as well match [:data:] (:tn:) /:*:_nl:/ alter (:tn:) // liaf } # save back versions (with date tag) isolate (:date:) // syscall () (:date:) /date +%Y%m%d%H%M%S/ syscall /cp :*:file: :*:file:-:*:date:/ # and write out the changed version output [:*:file:] /:*:data:/ exit /0/ trap /.*/ } # Are we supposed to do an approximate match, to find # data that doesn't necessarily fit perfectly but is # pretty close? # # Note the cute trick of looping on the presence of the # '@' sign and deleting them one at a time; that way, the # user can specify the top N closest matches desired by # how many @ signs they type. { match [:cmdlineargs:] (:the_atsign:) /@/ { # approximate match, anyone? match [:cmdlineargs:] (:z: :the_atsign: :vars:) /(@)@* (.*)/ match [:data:] (:res: :q:) /^.*(:*:vars:){~}.*$/ output /:*:vars: >> :*:q: >> :*:res:\n/ #output /:*:res:\n/ alter (:res:) // alter (:the_atsign:) // liaf } exit /0/ } # Well, we didn't find any other keywords or commands, so # we're doing the most common thing - looking up a string # in the pad.dat file. # # Note the LIAF-loop; we loop so that we get _each_ copy of # the target string present. # # We also have to put "pos3" onto the front of the command # line args. 
{ match [:data:] // # output /check exact on -:*:cmdlineargs:- \n/ { match [:data:] (:res: :q:) /^.*(:*:cmdlineargs:).*$/ output /:*:res:\n/ liaf } exit /0/ } crm114-20100106-BlameMichelson.src/crm_svm_matrix.c0000644000000000017500000037601211321154266020204 0ustar rootwsy// crm_svm_matrix.c - Support Vector Machine //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. #include "crm_svm_matrix.h" #include "crm_svm_matrix_util.h" /***************************************************************************** *This is a matrix/vector library. *The intent of it is to keep all matrix operations out of the algorithms. *Therefore, sparse and full matrices are handled transparently inside these *functions. In other words, once a matrix is declared sparse or full, you *can call any matrix operation on it and the operation will do the best *thing for the type of matrix. If this were C++ (which, sadly, it is not) *matrix would be an abstract class with sparse and full implementing classes. * *Specifically there are 3 types of data structures for matrices currently *supported. For an R x C matrix with S non-zero elements, these are: * *NON_SPARSE: The matrix is represented as a set of R pointers to arrays * of length C. If more columns are added to the matrix, the array is * expanded to exactly the number of columns necessary. The representation * is not sparse so that the data in the cth spot of the array is assumed * to be the data in the cth column of that row. 
All data in a NON_SPARSE * matrix is accessible in O(1) time. * *SPARSE_ARRAY: The matrix is represented as a set of R pointers to arrays * of approximately length S/R. Each element of the array stores a column * number and data element. The arrays are arranged in order of increasing * column number. When more non-zero elements are added to a row than the * array can hold, its size is doubled. This is a sparse representation so * that any columns not mentioned in a row are assumed to be zero. All data * in a SPARSE_ARRAY matrix is accessible in O(lg(C)) time. * *SPARSE_LIST: The matrix is represented as a set of R pointers to linked * lists. Each element in the linked list stores a column number, data entry, * and pointers to its previous and next elements. When a new non-zero * element is added, only one more node of the list is created. This is a * sparse representation so that any columns not mentioned in a row are * assumed to be zero. All data in a SPARSE_LIST matrix is accessible in * O(C) time. * *In addition, the type of data stored in a matrix is variable. All columns *are assumed to be unsigned ints, but data can either be an int or a double. *The data type is specified by setting the MATR_COMPACT flag: * *MATR_COMPACT: A matrix set to be MATR_COMPACT contains integer data. * This matrix will take up less space than the equivalent MATR_PRECISE matrix. * *MATR_PRECISE: A matrix set to be MATR_PRECISE contains double data. * This matrix will take more space than the equivalent MATR_COMPACT matrix. * *Which representation to use depends very much on the application. All of *the representations work together (ie you can add a SPARSE_ARRAY and a *SPARSE_LIST without a problem) so you can have many different types in *one application. *Time considerations: *All functions in this file should be commented with their big-oh running *time for the three different representations. 
In general, though, very *sparse matrices that will change the number of non-zero elements often *are best represented as SPARSE_LISTS, while more static sparse matrices *should be SPARSE_ARRAYs. * *Space considerations (small considerations are left out): * *MATR_COMPACT, NON_SPARSE: Requires 4*R*C + 32*R bytes. * *MATR_PRECISE, NON_SPARSE: Requires 8*R*C + 32*R bytes. * *MATR_COMPACT, SPARSE_ARRAY: Requires up to 2*8*S + 64*R bytes (though usually * this is closer to 8*S bytes unless you get very unlucky in how the arrays * double in size. Setting MATR_DEFAULT_VECTOR_SIZE or passing in the starting * size of the array can keep this relatively small). * *MATR_PRECISE, SPARSE_ARRAY: Requires up to 2*16*S + 64*R bytes (on this computer * anyway, a 12-byte struct is scaled up to 16 bytes). * *MATR_COMPACT, SPARSE_LIST: Requires 24*S + 88*R bytes. * *MATR_PRECISE, SPARSE_LIST: Requires 32*S + 88*R bytes ****************************************************************************/ //Static matrix function decalartions static void matr_decrease_rows(Matrix *M, unsigned int r, int free_row); static ExpandingArray *matr_remove_zero_cols_sort(Matrix *X, SortingAlgorithms sorttype); static void matr_write_sp(Matrix *M, FILE *out); static void matr_write_ns(Matrix *M, FILE *out); /************************************************************************* *Makes a zero matrix. * *INPUT: rows: number of rows in the matrix * cols: number of columns in the matrix * type: NON_SPARSE, SPARSE_ARRAY, or SPARSE_LIST specifying the data * structure * compact: MATR_COMPACT or MATR_PRECISE specifying whether data is stored * as an int or a double * *OUTPUT: A rows X cols matrix of all zeros with the type and compact * flags set correctly. If the matrix is a sparse array, the arrays * will begin at size MATR_DEFAULT_VECTOR_SIZE. 
* *TIME: * NON_SPARSE: O(R*C) * SPARSE_ARRAY: O(R) * SPARSE_LIST: O(R) *************************************************************************/ Matrix *matr_make(unsigned int rows, unsigned int cols, VectorType type, int compact) { return matr_make_size(rows, cols, type, compact, MATR_DEFAULT_VECTOR_SIZE); } /************************************************************************* *Makes a zero matrix. * *INPUT: rows: number of rows in the matrix * cols: number of columns in the matrix * type: NON_SPARSE, SPARSE_ARRAY, or SPARSE_LIST specifying the data * structure * compact: MATR_COMPACT or MATR_PRECISE specifying whether data is stored as an * int or a double * size: The starting size of the array for the SPARSE_ARRAY data type. * *OUTPUT: A rows X cols matrix of all zeros with the type and compact * flags set correctly. If the matrix is a sparse array, the arrays * will begin at size MATR_DEFAULT_VECTOR_SIZE. * *TIME: * NON_SPARSE: O(R*C) * SPARSE_ARRAY: O(R) * SPARSE_LIST: O(R) *************************************************************************/ Matrix *matr_make_size(unsigned int rows, unsigned int cols, VectorType type, int compact, int size) { Matrix *M; unsigned int i; M = (Matrix *)malloc(sizeof(Matrix)); if (!M) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to allocate memory for matrix.\n"); } return NULL; } M->rows = rows; M->cols = cols; M->type = type; M->compact = compact; M->size = size; M->was_mapped = 0; switch(type) { case NON_SPARSE: M->nz = rows*cols; break; case SPARSE_ARRAY: M->nz = 0; break; case SPARSE_LIST: M->nz = 0; break; default: if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_make: unrecognized type.\n"); } free(M); return NULL; } if (rows > 0) { M->data = (Vector **)malloc(sizeof(Vector *)*rows); if (!M->data) { rows = 0; } for (i = 0; i < rows; i++) { //it would be nice if there were some way to make this //contiguous in memory //but i'm not sure how M->data[i] = vector_make_size(cols, type, compact, size); if (!M->data[i]) 
{ //something went wrong break; } } if (i != M->rows) { M->rows = i; matr_free(M); return NULL; } } else { M->data = NULL; } return M; } /************************************************************************* *Sets a row of the matrix. * *INPUT: A: matrix in which to set a row. * r: row to set. * v: pointer to new row. * *TIME: * NON_SPARSE: O(C) * SPARSE_ARRAY: O(S/R) * SPARSE_LIST: O(S/R) * *WARNINGS: *1) This does NOT free the old row of A in case it is still in use * somewhere. It is up to you to do that. *************************************************************************/ void matr_set_row(Matrix *A, unsigned int r, Vector *v) { int oldnz; if (!A || !A->data || r < 0 || r >= A->rows || !A->data[r]) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_set_row: bad arguments.\n"); } return; } oldnz = A->data[r]->nz; vector_copy(v, A->data[r]); A->nz += A->data[r]->nz - oldnz; } /************************************************************************* *Does a shallow copy to set the rth row of M to be v. If M has fewer than *r rows, enough rows are added. Note that this is a SHALLOW copy - M does *not create additional space for row r - it just sets M->data[r] = v. * *INPUT: M: matrix in which to set a row. * r: row to set. * v: pointer to new row. * *TIME: (if r = M->rows + t) * NON_SPARSE: O(t*C) * SPARSE_ARRAY: O(t) * SPARSE_LIST: O(t) * *WARNINGS: *1) M and v must be the same type. *2) When this is done M->data[r] and v have the SAME value. Freeing one * will free the other (for example). 
*************************************************************************/ void matr_shallow_row_copy(Matrix *M, unsigned int r, Vector *v) { int oldrows, i; if (!v || !M || r < 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_shallow_row_copy: bad arguments.\n"); } return; } if (v->type != M->type) { //this can be bad if v is sparse and M is non-sparse //since we try to set M->cols = v->dim and v->dim might be HUGE if (MATR_DEBUG_MODE) { fprintf (stderr, "Attempt to do shallow row copy between different vector types.\n"); } return; } oldrows = M->rows; if (r >= M->rows) { //add a row or 3 M->data = (Vector **)realloc(M->data, sizeof(Vector *)*(r+1)); if (!M->data) { //oh, oh something is really wrong if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to grow M in shallow_row_copy.\n"); } M->rows = 0; return; } M->rows = r+1; } for (i = oldrows; i < M->rows; i++) { M->data[i] = vector_make_size(M->cols, M->type, M->compact, M->size); } if (v->dim > M->cols) { matr_add_ncols(M, v->dim - M->cols); } vector_free(M->data[r]); M->data[r] = v; v->dim = M->cols; M->nz += v->nz; } /************************************************************************* *Sets a column of the matrix. * *INPUT: A: matrix in which to set a column. * r: column to set. * v: pointer to new column. 
* *TIME: * NON_SPARSE: O(R) * SPARSE_ARRAY: * Generally: O(R*lg(S/R)) (few zeros in v) O(S) (many zeros in v) * First/Last column: O(R) * SPARSE_LIST: * Generally: O(S) * First/Last column: O(R) *************************************************************************/ void matr_set_col(Matrix *A, unsigned int c, Vector *v) { int oldnz; int i, col, lastcol = -1; VectorIterator vit; if (!v || !A || !A->data || c < 0 || c >= A->cols) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_set_col: bad arguments.\n"); } return; } if (v->dim != A->rows) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_set_col: dimension mismatch.\n"); } return; } if (MATR_DEBUG_MODE >= MATR_OPS) { fprintf(stderr, "setting column %d of \n", c); matr_print(A); fprintf(stderr, "to be\n"); vector_print(v); } vectorit_set_at_beg(&vit, v); while (!vectorit_past_end(vit, v)) { col = vectorit_curr_col(vit, v); for (i = lastcol+1; i < col; i++) { oldnz = A->data[i]->nz; vector_set(A->data[i], c, 0); A->nz += A->data[i]->nz - oldnz; } oldnz = A->data[col]->nz; vector_set(A->data[col], c, vectorit_curr_val(vit, v)); A->nz += A->data[vectorit_curr_col(vit, v)]->nz - oldnz; lastcol = col; vectorit_next(&vit, v); } for (i = lastcol+1; i < A->rows; i++) { oldnz = A->data[i]->nz; vector_set(A->data[i], c, 0); A->nz += A->data[i]->nz - oldnz; } } /************************************************************************* *Adds a row to the bottom of the matrix. * *INPUT: M: matrix to which to add a row. 
* *TIME: * NON_SPARSE: O(C) (if realloc succeeds) O(R) + O(C) (if realloc fails) * SPARSE_ARRAY: O(1) (if realloc succeds) O(R) (if realloc fails) * SPARSE_LIST: O(1) (if realloc succeeds) O(R) (if realloc fails) *************************************************************************/ void matr_add_row(Matrix *M) { if (!M) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_add_row: null matrix.\n"); } return; } //reallocate the memory for M M->data = (Vector **)realloc(M->data, sizeof(Vector *)*(M->rows+1)); if (!M->data) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to add more rows to matrix.\n"); } M->rows = 0; M->nz = 0; return; } M->data[M->rows] = vector_make_size(M->cols, M->type, M->compact, M->size); M->rows++; } /************************************************************************* *Adds n rows to the bottom of the matrix. * *INPUT: M: matrix to which to add rows. * n: number of rows to add * *TIME: * NON_SPARSE: O(n*C) (if realloc succeeds) O(R) + O(n*C) (if realloc fails) * SPARSE_ARRAY: O(n) (if realloc succeds) O(R) + O(n) (if realloc fails) * SPARSE_LIST: O(n) (if realloc succeeds) O(R) + O(n) (if realloc fails) *************************************************************************/ void matr_add_nrows(Matrix *M, unsigned int n) { unsigned int i; if (!M || n <= 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_add_nrows: bad arguments.\n"); } return; } M->data = (Vector **)realloc(M->data, sizeof(Vector *)*(M->rows+n)); if (!M->data) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to add more rows to matrix.\n"); } M->rows = 0; M->nz = 0; return; } for (i = M->rows; i < M->rows+n; i++) { M->data[i] = vector_make_size(M->cols, M->type, M->compact, M->size); } M->rows += n; } /************************************************************************* *Adds a column to the right of the matrix. * *INPUT: M: matrix to which to add columns. 
* *TIME: * NON_SPARSE: O(R) (if realloc succeeds often) O(R*C) (if realloc fails often) * SPARSE_ARRAY: O(R) * SPARSE_LIST: O(R) *************************************************************************/ void matr_add_col(Matrix *M) { unsigned int i, j; if (!M) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_add_col: null matrix.\n"); } return; } if (M->data) { for (i = 0; i < M->rows; i++) { vector_add_col(M->data[i]); if (!M->data[i]) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to add more columns to matrix.\n"); } for (j = 0; j < i; j++) { vector_free(M->data[j]); } for (j = i+1; j < M->rows; j++) { vector_free(M->data[j]); } free(M->data); M->data = NULL; M->cols = 0; M->nz = 0; return; } } } M->cols++; } /************************************************************************* *Adds n columns to the right of the matrix. * *INPUT: M: matrix to which to add columns. * n: number of columns to add * *TIME: * NON_SPARSE: O(n*R) (if realloc succeeds often) O(n*R*C) (if realloc fails) * SPARSE_ARRAY: O(R) * SPARSE_LIST: O(R) *************************************************************************/ void matr_add_ncols(Matrix *M, unsigned int n) { unsigned int i, j; if (!M || n <= 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_add_ncols: bad arguments.\n"); } return; } if (M->data) { for (i = 0; i < M->rows; i++) { vector_add_ncols(M->data[i], n); if (!M->data[i]) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to add more columns to matrix.\n"); } for (j = 0; j < i; j++) { vector_free(M->data[j]); } for (j = i+1; j < M->rows; j++) { vector_free(M->data[j]); } free(M->data); M->data = NULL; M->cols = 0; M->nz = 0; return; } } } M->cols += n; } /************************************************************************* *Removes a row from the matrix and frees the row. * *INPUT: M: matrix from which to remove a row. 
* r: row to remove * *TIME: * NON_SPARSE: O(R) * SPARSE_ARRAY: O(R) * SPARSE_LIST: O(R) + O(S/R) *************************************************************************/ void matr_remove_row(Matrix *M, unsigned int r) { matr_decrease_rows(M, r, 1); } /************************************************************************* *Removes a row from the matrix and DOES NOT free the row. * *INPUT: M: matrix from which to remove a row. * r: row to remove * *TIME: * NON_SPARSE: O(R) * SPARSE_ARRAY: O(R) * SPARSE_LIST: O(R) + O(S/R) *************************************************************************/ void matr_erase_row(Matrix *M, unsigned int r) { matr_decrease_rows(M, r, 0); } //private function to actually do the work of removing a row //and erasing if (if desired) static void matr_decrease_rows(Matrix *M, unsigned int r, int free_row) { Vector *tptr = NULL; unsigned int i; if (!M || !M->data) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_decrease_rows: null matrix.\n"); } return; } if (r >= M->rows || r < 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_decrease_rows: attempt to remove non-existant row.\n"); } return; } if (M->rows == 0) { return; } if (M->rows == 1) { if (M->data) { if (free_row) { vector_free(M->data[0]); } free(M->data); M->data = NULL; } M->rows = 0; return; } if (M->data[r]) { M->nz -= M->data[r]->nz; } if (r < M->rows-1) { tptr = M->data[M->rows-1]; } else if (free_row) { vector_free(M->data[r]); } M->data = (Vector **)realloc(M->data, sizeof(Vector *)*(M->rows-1)); if (!M->data) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to reduce rows of matrix.\n"); } M->rows = 0; M->nz = 0; return; } if (r < M->rows-1 && free_row) { vector_free(M->data[r]); } for (i = r; i < M->rows-2; i++) { M->data[i] = M->data[i+1]; } if (r < M->rows-1) { M->data[M->rows-2] = tptr; } M->rows--; } /************************************************************************* *Removes a column from the matrix. * *INPUT: M: matrix from which to remove a column. 
* c: column to remove * *TIME: * NON_SPARSE: O(R) * SPARSE_ARRAY: O(R*lg(S/R)) * SPARSE_LIST: O(S) *************************************************************************/ void matr_remove_col(Matrix *M, unsigned int c) { unsigned int i, j; int oldnz; if (!M || !M->data || M->cols == 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_remove_col: null matrix.\n"); } return; } if (c >= M->cols) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_remove_col: attempt to remove non-existent column.\n"); } return; } //it would be much faster if we didn't need to preserve the order of the //columns //but we do for (i = 0; i < M->rows; i++) { if (!M->data[i]) { //?? continue; } oldnz = M->data[i]->nz; vector_remove_col(M->data[i], c); if (!M->data[i]) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to remove columns from matrix.\n"); } for (j = 0; j < i; j++) { vector_free(M->data[j]); } for (j = i+1; j < M->rows; j++) { vector_free(M->data[j]); } free(M->data); M->data = NULL; M->cols = 0; M->nz = 0; return; } M->nz += M->data[i]->nz - oldnz; } M->cols--; } /************************************************************************* *Removes rows that are all zeros from the matrix. * *INPUT: M: matrix from which to remove rows. 
* *TIME: (if there are Z zero rows) * NON_SPARSE: O(ZR) * SPARSE_ARRAY: O(ZR) * SPARSE_LIST: O(ZR) + O(ZS/R) *************************************************************************/ ExpandingArray *matr_remove_zero_rows(Matrix *X) { unsigned int i, offset, row, lim; ExpandingArray *rowMap = make_expanding_array(MATR_DEFAULT_VECTOR_SIZE, MATR_COMPACT); CompactExpandingType cet; ExpandingType et; Vector *r; if (!X || !X->data) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_remoev_zero_rows: null matrix.\n"); } return NULL; } offset = 0; lim = X->rows; for (i = 0; i < lim; i++) { row = i - offset; r = matr_get_row(X, row); if (!r) { continue; } if (vector_iszero(r)) { matr_remove_row(X, row); offset++; } else { cet.i = i; et.compact = &cet; expanding_array_insert(et, rowMap); } } return rowMap; } /************************************************************************* *Removes columns that are all zeros from a sparse matrix. * *INPUT: M: matrix from which to remove columns. * *OUTPUT: An expanding array A of unsigned ints such that if c is a column * number of M after the removal, A[c] is the column number of M before the * removal. A will allow you to reconstruct M. * *TIME: * The running time of this function is complicated. If there are fewer * than QSORT_COUNTING_CUTOFF (we recommend you set this ~8*10^7) elements, * we sort the columns of X using qsort. The running time for this is * (on average) where C is the number of non-zero columns: * NON_SPARSE: -- * SPARSE_ARRAY: O(Slg(S)) + O(Slg(C)) * SPARSE_LIST: O(Slg(S)) + O(Slg(C)) * * If there are greater than QSORT_COUNTING_CUTOFF elements, we sort the * columns of X using a counting sort. The running time if F is the total * number of columns is then * NON_SPARSE: -- * SPARSE_ARRAY: O(F/SVM_MAX_ALLOC_SIZE*RS) * SPARSE_LIST: O(F/SVM_MAX_ALLOC_SIZE*RS) * This depends on the total number of columns because of the amount of * memory we can allocate. 
If columns are considered unsigned ints, then * F/SVM_MAX_ALLOC_SIZE is usually ~40 * * In general, empirical evidence shows that for small to medium size * matrices qsort is faster and for larger matrices counting sort is faster. * For any matrix that fits in memory, neither should take more than 5 minutes. * *MEMORY: * Running QSort requires having a copy of all of the elements in the * matrix while running counting sort requires an array of size * SVM_MAX_ALLOC_SIZE. Because countint sort is topped at SVM_MAX_ALLOC_SIZE, * if the matrix has a very large number of elements, we recommend you use * counting sort. *************************************************************************/ ExpandingArray *matr_remove_zero_cols(Matrix *X) { if (!X || !X->data) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_remove_zero_cols: null matrix.\n"); } return NULL; } if (!(X->cols) || !(X->nz)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_remove_zero_cols: X has nothing to sort.\n"); } return NULL; } if (X->nz < QSORT_COUNTING_CUTOFF) { return matr_remove_zero_cols_sort(X, QSORT); } return matr_remove_zero_cols_sort(X, COUNTING); } //"private" function to select the sorting algorithm //and actually remove the columns static ExpandingArray *matr_remove_zero_cols_sort(Matrix *X, SortingAlgorithms sorttype) { unsigned int size, offset, j, i, startcol = 0, index, col, lastcol; int iterations, *coliszero = NULL; ExpandingArray *colMap = NULL; VectorIterator vit; Vector *row; CompactExpandingType cet; ExpandingType et; int front, back; if (!X || !X->data) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_remove_zero_cols: null matrix.\n"); } return NULL; } if (X->type == NON_SPARSE) { if (MATR_DEBUG_MODE) { fprintf (stderr, "Called matr_remove_zero_cols on non-sparse matrix. Returning.\n"); } //we don't want to try renumbering columns on a non-sparse matrix //what a headache! 
return NULL; } switch (sorttype) { case COUNTING: { //O(S*lg_{10^8}(X->cols)) //now remove the zero columns //if X->cols is less than SVM_MAX_ALLOC_SIZE/sizeof(int) //this is a single loop //Basically, we are doing a counting sort on the columns of X. size = X->cols; if (size > SVM_MAX_ALLOC_SIZE/sizeof(int)) { //don't run out of memory //instead do this several times //note that since cols is an integer //X->cols <= 2^32 ~= 4.3*10^9 //i recommend setting SVM_MAX_ALLOC_SIZE = 4*10^8 //this uses only 400 MB of memory and //we won't ever do this loop more than 43 times //So it's still fast size = SVM_MAX_ALLOC_SIZE/sizeof(int); } colMap = make_expanding_array(MATR_DEFAULT_VECTOR_SIZE, MATR_COMPACT); if (!colMap) { if (MATR_DEBUG_MODE) { fprintf (stderr, "Unable to allocate memory for counting sort. Giving up.\n"); } return NULL; } iterations = (int)((X->cols-1)/(double)size+1); if (MATR_DEBUG_MODE >= MATR_DEBUG) { fprintf(stderr, "Removing zero columns will take %d iterations, each of length %d.\n", iterations, size); } coliszero = (int *)malloc(sizeof(int)*size); offset = 0; for (j = 0; j < iterations; j++) { startcol = j*size; if (startcol >= X->cols) { //just a check startcol = (j-1)*size; break; } if (MATR_DEBUG_MODE >= MATR_DEBUG) { fprintf(stderr, "startcol = %u\n", startcol); } for (i = 0; i < size; i++) { coliszero[i] = 1; } //locate the zero columns for (i = 0; i < X->rows; i++) { row = matr_get_row(X, i); if (!row) { continue; } vectorit_set_at_beg(&vit, row); vectorit_find(&vit, startcol, row); vectorit_prev(&vit, row); while ((vectorit_past_beg(vit, row) || vectorit_curr_col(vit, row) < (long)startcol + (long)size) && !vectorit_past_end(vit, row)) { if (!vectorit_past_beg(vit, row) && vectorit_curr_col(vit, row) >= startcol) { coliszero[vectorit_curr_col(vit, row)-startcol] = 0; } vectorit_next(&vit, row); } } //calculate the offset for every column and store it in coliszero //also update colMap for (i = 0; i < size; i++) { if (!coliszero[i]) { cet.i = i 
+ startcol; et.compact = &cet; expanding_array_insert(et, colMap); } else { offset++; } coliszero[i] = offset; } //renumber the columns of X for (i = 0; i < X->rows; i++) { row = matr_get_row(X, i); if (!row) { continue; } vectorit_set_at_beg(&vit, row); vectorit_find(&vit, startcol, row); while (!vectorit_past_end(vit, row) && (vectorit_past_beg(vit, row) || vectorit_curr_col(vit, row) < (long)startcol + (long)size)) { if (!vectorit_past_beg(vit, row) && vectorit_curr_col(vit, row) >= startcol) { vectorit_set_col(vit, vectorit_curr_col(vit, row) - coliszero[vectorit_curr_col(vit, row)-startcol], row); } vectorit_next(&vit, row); } } } index = X->cols - startcol-1; //tell X it has fewer columns now X->cols -= coliszero[index]; for (i = 0; i < X->rows; i++) { X->data[i]->dim -= coliszero[index]; } if (MATR_DEBUG_MODE >= MATR_DEBUG) { fprintf(stderr, "There were %u zero columns.\n", coliszero[index]); } free(coliszero); return colMap; } case MERGE: { if (MATR_DEBUG_MODE) { fprintf(stderr, "Merge sort not yet implemented. Using counting sort.\n"); } return matr_remove_zero_cols(X); } case QSORT: { //ok, let's hope we can fit two versions of X into memory! //put everything into colMap if (MATR_DEBUG_MODE >= MATR_DEBUG) { fprintf(stderr, "Allocating %d elements for sort.\n", X->nz); } colMap = make_expanding_array(X->nz, MATR_COMPACT); if (!colMap) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to allocate enough space for qsort. 
Using counting sort.\n"); } return matr_remove_zero_cols_sort(X, COUNTING); } for (i = 0; i < X->rows; i++) { row = matr_get_row(X, i); if (!row) { continue; } vectorit_set_at_beg(&vit, row); while (!vectorit_past_end(vit, row)) { col = vectorit_curr_col(vit, row); cet.i = col; et.compact = &cet; expanding_array_insert(et, colMap); vectorit_next(&vit, row); } } qsort(&(colMap->data.compact[colMap->first_elt]), colMap->n_elts, sizeof(CompactExpandingType), compact_expanding_type_int_compare); if (MATR_DEBUG_MODE >= MATR_DEBUG) { fprintf(stderr, "Finished qsort.\n"); } //make colMap unique lastcol = expanding_array_get(0, colMap).compact->i; offset = 0; for (i = 1; i < colMap->n_elts; i++) { et = expanding_array_get(i, colMap); if (et.compact->i == lastcol) { offset++; } else { lastcol = et.compact->i; if (MATR_DEBUG_MODE >= MATR_OPS_MORE) { fprintf(stderr, "Replacing column %d (%u) with column %d (%u)\n", i - offset, expanding_array_get(i-offset, colMap).compact->i, i, expanding_array_get(i, colMap).compact->i); } expanding_array_set(et, i - offset, colMap); } } colMap->n_elts -= offset; colMap->last_elt -= offset; expanding_array_trim(colMap); if (MATR_DEBUG_MODE >= MATR_DEBUG) { fprintf(stderr, "Renumbering columns. 
Total columns = %d last_elt = %d\n", colMap->n_elts, colMap->last_elt); } //renumber the columns of X for (i = 0; i < X->rows; i++) { row = matr_get_row(X, i); if (!row) { continue; } vectorit_set_at_beg(&vit, row); index = 0; while (!vectorit_past_end(vit, row)) { front = colMap->first_elt; back = colMap->last_elt; col = vectorit_curr_col(vit, row); index = (front + back)/2; while (colMap->data.compact[index].i != col) { if (colMap->data.compact[index].i < col) { front = index+1; } else if (colMap->data.compact[index].i > col) { back = index-1; } index = (front + back)/2; } index -= colMap->first_elt; if (MATR_DEBUG_MODE >= MATR_OPS_MORE) { fprintf(stderr, "index = %d, colMap[index] = %u, actual col = %u\n", index, expanding_array_get(index, colMap).compact->i, vectorit_curr_col(vit, row)); } vectorit_set_col(vit, index, row); vectorit_next(&vit, row); } } //tell X it has fewer columns X->cols = colMap->n_elts; for (i = 0; i < X->rows; i++) { X->data[i]->dim = colMap->n_elts; } return colMap; // return matr_remove_zero_cols_sort(X, COUNTING); } default: { if (MATR_DEBUG_MODE) { fprintf(stderr, "Invalid sorting type.\n"); } return NULL; } } } /************************************************************************* *Appends one matrix to another using shallow copies. In essence, this just *does pointer arithmetic so that the last from->rows of *to_ptr = from *and from no longer has anything in it. * *INPUT: to_ptr: pointer to a pointer to the matrix to which to append. * if to_ptr is NULL, that's bad. if *to_ptr is null, a new matrix will be * created. * from: matrix to append. on return, from will contain no rows * *OUTPUT: A matrix in *to_ptr such that the last from->rows rows of to_ptr * are the rows of from in reverse order. from will have 0 rows. 
* *TIME: (R and C refer to from) * NON_SPARSE: O(R*C) * SPARSE_ARRAY: O(R) * SPARSE_LIST: O(R) *************************************************************************/ void matr_append_matr(Matrix **to_ptr, Matrix *from) { Matrix *to; unsigned int i, oldrows; Vector *row; if (!to_ptr) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_append_matr: pointer to 'to' matrix unitialized.\n"); } return; } to = *to_ptr; if (from && from->rows > 0) { if (!to) { to = matr_make_size(from->rows, from->cols, from->type, from->compact, from->size); oldrows = 0; } else { oldrows = to->rows; matr_add_nrows(to, from->rows); } if (!to || (from->rows && !(to->data))) { //something is wrong if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_append_matr: error in creating new matrix. your from matrix appears corrupted.\n"); } if (to) { to->rows = 0; to->nz = 0; } return; } for (i = oldrows; i < to->rows; i++) { row = matr_get_row(from, from->rows-1); matr_shallow_row_copy(to, i, row); matr_erase_row(from, from->rows-1); } } *to_ptr = to; } /************************************************************************* *Multiplies a matrix by a vector. * *INPUT: M: matrix * v: row vector * *OUTPUT: ret = M*v. If ret has more rows than M or v, only the first * R rows (where M is R x C) will be relevant. If M has more rows than * ret, M will be treated as a D x C matrix where D is the number of rows * of ret. * *TIME: (M is R x C with S non-zero elements, v has s non-zero elements) * Both NON_SPARSE: O(R*C) * M NON_SPARSE, v SPARSE: O(R*s) * M SPARSE, v NON_SPARSE: O(S) * Both SPARSE: O(S + R*s) * *WARNINGS: *1) v and ret CANNOT point to the same vector. *2) If v->dim > M->cols or M->cols > v->dim, the missing numbers will be * treated as zeros. 
**************************************************************************/

//Computes ret = M*v, writing at most min(ret->dim, M->rows) entries.
//Near-zero dot products (|d| < SVM_EPSILON) are dropped from sparse
//results rather than stored.
void matr_vector(Matrix *M, Vector *v, Vector *ret) {
  unsigned int i, rows;
  VectorIterator vit;
  double d;
  if (!M || !M->data || !v || !ret) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "matr_vector: null arguments.\n");
    }
    return;
  }
  rows = ret->dim;
  if (MATR_DEBUG_MODE >= MATR_OPS) {
    fprintf(stderr, "matr_vector: multiplying\n");
    matr_print(M);
    fprintf(stderr, "by\n");
    vector_print(v);
    fprintf(stderr, "putting in\n");
    vector_print(ret);
  }
  //only the first min(ret->dim, M->rows) entries are computed
  if (M->rows < rows) {
    rows = M->rows;
  }
  if (ret->type == SPARSE_ARRAY) {
    //this is fast
    //and prevents us from moving the whole array around later
    vector_zero(ret);
  }
  vectorit_set_at_beg(&vit, ret);
  for (i = 0; i < rows; i++) {
    d = dot(M->data[i], v);
    //treat a tiny dot product as zero: if ret already has an entry at
    //row i, delete it instead of storing a near-zero value
    if (fabs(d) < SVM_EPSILON && i == vectorit_curr_col(vit, ret)) {
      vectorit_zero_elt(&vit, ret);
      continue;
    }
    vectorit_insert(&vit, i, d, ret);
    vectorit_next(&vit, ret);
    if (MATR_DEBUG_MODE >= MATR_OPS_MORE) {
      fprintf(stderr, "ret = ");
      vector_print(ret);
    }
  }
}

/*************************************************************************
 *Multiplies a sequence of matrices by a vector.
 *
 *INPUT: A: List of matrices
 * nmatrices: number of matrices in A
 * maxrows: the maximum number of rows any matrix in the list A has
 * w: the vector to multiply by
 *
 *OUTPUT: z = A_{n-1}*A_{n-2}*...*A_0*w.
 *
 *TIME: nmatrices*TIME(matr_vector)
 *
 *WARNINGS:
 *1) w and z CANNOT point to the same vector.
**************************************************************************/

//Computes z = A[nmatrices-1]*...*A[1]*A[0]*w by repeated matrix-vector
//products, ping-ponging between two scratch vectors of dimension maxrows.
void matr_vector_seq(Matrix **A, int nmatrices, unsigned int maxrows,
                     Vector *w, Vector *z) {
  int i;
  Vector *tmp1, *tmp2, *ctmp;
  if (!A || !w || !z) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "matr_vector_seq: null arguments.\n");
    }
    return;
  }
  //handle the trivial cases BEFORE allocating scratch vectors:
  //the old code allocated tmp1/tmp2 first, and the nmatrices == 0 path
  //returned without freeing them (memory leak)
  if (nmatrices == 0) {
    return;
  }
  if (nmatrices == 1) {
    matr_vector(A[0], w, z);
    return;
  }
  //scratch vectors for the intermediate products
  tmp1 = vector_make_size(maxrows, z->type, z->compact, z->size);
  tmp2 = vector_make_size(maxrows, z->type, z->compact, z->size);
  vector_copy(w, tmp1);
  ctmp = tmp1;
  for (i = 0; i < nmatrices; i++) {
    //ping-pong: even steps read tmp1 and write tmp2, odd steps reverse
    if (!(i%2)) {
      matr_vector(A[i], tmp1, tmp2);
      ctmp = tmp2;
    } else {
      matr_vector(A[i], tmp2, tmp1);
      ctmp = tmp1;
    }
  }
  //ctmp points at whichever scratch vector holds the final product
  vector_copy(ctmp, z);
  vector_free(tmp1);
  vector_free(tmp2);
}

/*************************************************************************
 *Transposes a matrix.
 *
 *INPUT: A: Matrix to transpose.
 *OUTPUT: T = A^T, transpose of A.
 *
 *TIME:
 * NON_SPARSE: O(R*C)
 * SPARSE_ARRAY: O(R) + O(S)
 * SPARSE_LIST: O(S)
 *
 *WARNINGS:
 *1) A and T CANNOT point to the same matrix.
**************************************************************************/

//Computes T = A^T.  T must already be allocated with T->rows == A->cols
//and T->cols == A->rows; any existing non-zero entries in a sparse T are
//cleared first.
void matr_transpose(Matrix *A, Matrix *T) {
  unsigned int i;
  VectorIterator vit, trit;
  if (!A || !T || !A->data || !T->data) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "matr_transpose: null matrix.\n");
    }
    return;
  }
  //T must be exactly the transposed shape of A
  if (A->rows != T->cols || A->cols != T->rows) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "matr_transpose: transposed matrix incorrect size.\n");
    }
    return;
  }
  if (T->type != NON_SPARSE && T->nz > 0) {
    //clear out T
    for (i = 0; i < T->rows; i++) {
      vector_zero(T->data[i]);
    }
    T->nz = 0;
  }
  //walk each row of A; entry (i, c) of A becomes entry (c, i) of T.
  //inserting at the end of T's row c keeps each row sorted because we
  //visit A's rows in increasing i
  for (i = 0; i < A->rows; i++) {
    vectorit_set_at_beg(&vit, A->data[i]);
    while (!vectorit_past_end(vit, A->data[i])) {
      vectorit_set_at_end(&trit, T->data[vectorit_curr_col(vit, A->data[i])]);
      vectorit_insert(&trit, i, vectorit_curr_val(vit, A->data[i]),
                      T->data[vectorit_curr_col(vit, A->data[i])]);
      vectorit_next(&vit, A->data[i]);
    }
  }
}

/*************************************************************************
 *Checks if a matrix is all zeros.
 *
 *INPUT: A: matrix to check.
 *OUTPUT: 1 if matrix is all zeros, 0 else.
 *
 *TIME:
 * NON_SPARSE: O(R*C)
 * SPARSE_ARRAY: O(R)
 * SPARSE_LIST: O(R)
 *
 **************************************************************************/

//Returns 1 if every row of M is all zeros (or M is null), 0 otherwise.
int matr_iszero(Matrix *M) {
  unsigned int i;
  if (!M || !M->data) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "matr_iszero: null matrix.\n");
    }
    //a null matrix is reported as zero
    return 1;
  }
  for (i = 0; i < M->rows; i++) {
    if (!vector_iszero(M->data[i])) {
      return 0;
    }
  }
  return 1;
}

/***************************************************************************
 *Converts a NON_SPARSE matrix to a SPARSE_ARRAY using colMap.
 *
 *INPUT: M: matrix to convert
 * colMap: array such that if c is a column of M when M is non-sparse,
 * that column will have value colMap[c] when M is sparse.  this can be
 * used to "undo" removing zero columns IF you convert to a NON_SPARSE
 * matrix after you do so.
* *TIME: * NON_SPARSE: O(c) * SPARSE_ARRAY: -- * SPARSE_LIST: -- *************************************************************************/ void matr_convert_nonsparse_to_sparray(Matrix *M, ExpandingArray *colMap) { int i; Vector *row; if (!M || !colMap) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_convert: null arguments.\n"); } return; } if (M->type != NON_SPARSE) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Attempt to convert an already sparse matrix to sparse.\n"); } return; } M->type = SPARSE_ARRAY; M->nz = 0; M->size = M->cols; for (i = 0; i < M->rows; i++) { row = matr_get_row(M, i); if (!row) { continue; } M->nz += row->nz; vector_convert_nonsparse_to_sparray(row, colMap); } } /************************************************************************* *Prints a matrix to stdout putting back the zeros so the full matrix can * be seen. If you want to print a matrix in sparse form, use matr_write * with the file pointer stdout. * *INPUT: M: matrix to print. * *TIME: * NON_SPARSE: O(R*C) * SPARSE_ARRAY: O(R*C) * SPARSE_LIST: O(R*C) * **************************************************************************/ void matr_print(Matrix *M) { unsigned int i; Vector *row; if (!M) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_print: null matrix.\n"); } return; } if (M->rows == 0 || M->cols == 0) { //empty matrix fprintf(stderr, "[]"); return; } for (i = 0; i < M->rows; i++) { row = matr_get_row(M, i); if (row) { vector_print(row); } } } /************************************************************************* *Writes a matrix to a file using a sparse representation for the sparse * matrices and non-sparse representation for the non-sparse ones. * *INPUT: M: matrix to write. * filename: file to write to. 
 *TIME:
 * NON_SPARSE: O(R*C)
 * SPARSE_ARRAY: O(S)
 * SPARSE_LIST: O(S)
 *
 **************************************************************************/

//Opens filename for text writing and delegates to matr_write_fp.
//Silently returns (with an optional debug message) on a bad file name.
void matr_write(Matrix *M, char *filename) {
  FILE *out = fopen(filename, "w");
  if (!out) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "Bad file name in matr_write: %s\n", filename);
    }
    return;
  }
  matr_write_fp(M, out);
  fclose(out);
}

/*************************************************************************
 *Writes a matrix to a file using a sparse representation for the sparse
 * matrices and non-sparse representation for the non-sparse ones.
 *
 *INPUT: M: matrix to write.
 * fp: file to write to.
 *TIME:
 * NON_SPARSE: O(R*C)
 * SPARSE_ARRAY: O(S)
 * SPARSE_LIST: O(S)
 *
 **************************************************************************/

//Dispatches on matrix type: non-sparse matrices are written row-per-line,
//sparse matrices as "row col value" triples.
void matr_write_fp(Matrix *M, FILE *out) {
  if (!M || !out) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "matr_write: null arguments.\n");
    }
    return;
  }
  if (M->type == NON_SPARSE) {
    matr_write_ns(M, out);
  } else {
    matr_write_sp(M, out);
  }
}

//"private" functions for writing the different types of matrices

//Writes a sparse matrix as one "row col value" line per non-zero entry.
static void matr_write_sp(Matrix *M, FILE *out) {
  unsigned int i;
  VectorIterator vit;
  if (!M || !out || !M->data) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "matr_write: null arguments.\n");
    }
    return;
  }
  for (i = 0; i < M->rows; i++) {
    vectorit_set_at_beg(&vit, M->data[i]);
    while (!vectorit_past_end(vit, M->data[i])) {
      fprintf(out, "%u %u %lf\n", i, vectorit_curr_col(vit, M->data[i]),
              vectorit_curr_val(vit, M->data[i]));
      vectorit_next(&vit, M->data[i]);
    }
  }
}

//Writes a non-sparse matrix, one full row per line via vector_write_fp.
static void matr_write_ns(Matrix *M, FILE *out) {
  unsigned int i;
  if (!M || !out || !M->data) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "matr_write: null arguments.\n");
    }
    return;
  }
  for (i = 0; i < M->rows; i++) {
    vector_write_fp(M->data[i], out);
    fprintf(out, "\n");
  }
}

/*************************************************************************
 *Writes a matrix to a file using a binary representation.
 *
 *INPUT: M: matrix to write.
* filename: file to write to. *TIME: * NON_SPARSE: O(R*C) * SPARSE_ARRAY: O(S) * SPARSE_LIST: O(S) * **************************************************************************/ size_t matr_write_bin(Matrix *M, char *filename) { size_t size; FILE *fp = fopen(filename, "wb"); if (!fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_write_bin: bad filename %s", filename); } return 0; } size = matr_write_bin_fp(M, fp); fclose(fp); return size; } /************************************************************************* *Writes a matrix to a file using a binary representation. * *INPUT: M: matrix to write. * fp: file to write to. *TIME: * NON_SPARSE: O(R*C) * SPARSE_ARRAY: O(S) * SPARSE_LIST: O(S) * **************************************************************************/ size_t matr_write_bin_fp(Matrix *M, FILE *fp) { size_t size; unsigned int i; Vector *row; if (!M || !fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_write: null arguments.\n"); } return 0; } size = sizeof(Matrix)*fwrite(M, sizeof(Matrix), 1, fp); for (i = 0; i < M->rows; i++) { row = matr_get_row(M, i); if (row) { size += vector_write_bin_fp(row, fp); } } return size; } /************************************************************************* *Reads a matrix from a file using a binary representation. * *INPUT: filename: file to read from. * *OUTPUT: Matrix in the file or NULL if the file is incorrectly formatted. * *TIME: * NON_SPARSE: O(R*C) * SPARSE_ARRAY: O(S) * SPARSE_LIST: O(S) * *WARNINGS: * 1) This expects a file formatted as matr_write_bin would write it. If it * detects the file is wrong, it may return NULL, but it may not. Check * the output! 
**************************************************************************/ Matrix *matr_read_bin(char *filename) { Matrix *M; FILE *fp = fopen(filename, "rb"); if (!fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_read_bin: bad filename %s", filename); } return NULL; } M = matr_read_bin_fp(fp); fclose(fp); return M; } /************************************************************************* *Reads a matrix from a file using a binary representation. * *INPUT: fp: file to read from. * *OUTPUT: Matrix in the file or NULL if the file is incorrectly formatted. * *TIME: * NON_SPARSE: O(R*C) * SPARSE_ARRAY: O(S) * SPARSE_LIST: O(S) * *WARNINGS: * 1) This expects a file formatted as matr_write_bin would write it. If it * detects the file is wrong, it may return NULL, but it may not. Check * the output! **************************************************************************/ Matrix *matr_read_bin_fp(FILE *fp) { Matrix *M = (Matrix *)malloc(sizeof(Matrix)); unsigned int i; size_t amount_read, st; if (!fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_read: bad file pointer.\n"); } free(M); return NULL; } st = ftell(fp); amount_read = fread(M, sizeof(Matrix), 1, fp); M->was_mapped = 0; if (!amount_read) { free(M); return NULL; } M->data = (Vector **)malloc(sizeof(Vector *)*M->rows); if (!M->data && M->rows > 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_read: Unable to malloc space for matrix.\n"); } M->rows = 0; M->nz = 0; return M; } for (i = 0; i < M->rows; i++) { M->data[i] = vector_read_bin_fp(fp); if (!M->data[i]) { //oh oh bad file if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_read: Bad file.\n"); } break; } if (MATR_DEBUG_MODE >= MATR_OPS_MORE) { fprintf(stderr, "read row %u feof = %d dim = %d nz = %d\n", i, feof(fp), M->data[i]->dim, M->data[i]->nz); vector_write_sp_fp(matr_get_row(M, i), stderr); } } if (i != M->rows) { M->rows = i; matr_free(M); M = NULL; } return M; } /***************************************************************************** 
 *Converts data stored at *addr into a matrix.
 *
 *INPUT: addr: a pointer to the address at which the matrix is stored.
 * last_addr: the last possible address that is valid.  NOT necessarily where
 *  the list ends - just the last address that has been allocated in the
 *  chunk pointed to by *addr (ie, if *addr was taken from an mmap'd file
 *  last_addr would be *addr + the file size).
 *
 *OUTPUT: A matrix STILL referencing the chunk of memory pointed to by *addr
 * although with its OWN, newly malloc'd row list.
 * *addr: (pass-by-reference) points to the first address AFTER the full
 *  matrix.
 *
 *WARNINGS:
 * 1) *addr needs to be writable.  This will CHANGE VALUES stored at *addr and
 *    will seg fault if addr is not writable.
 * 2) last_addr does not need to be the last address of the list
 *    but if it is before that, either NULL will be returned or a
 *    matrix with a NULL data value will be returned.
 * 3) if *addr does not contain a properly formatted matrix, this function
 *    will not seg fault, but that is the only guarantee.
 * 4) you MUST call matr_free on this matrix AS WELL AS freeing memory
 *    stored at *addr.
 * 5) *addr CHANGES!
 * 6) This was one of the last functions I added to the library and one of the
 *    likeliest to cause memory errors and seg faults.  I've done a good bit
 *    of testing on it, but this function and memory-mapped objects in
 *    general are the likeliest to break the library.  I apologize.
****************************************************************************/ Matrix *matr_map(void **addr, void *last_addr) { Matrix *M; unsigned int i; if (!addr || !*addr || !last_addr) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_map: null arguments.\n"); } return NULL; } if (*addr + sizeof(Matrix) > last_addr) { return NULL; } M = (Matrix *)(*addr); *addr += sizeof(Matrix); M->was_mapped = 1; M->data = (Vector **)malloc(sizeof(Vector *)*M->rows); if (!M->data && M->rows > 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_map: unable to allocate space for matrix.\n"); } M->rows = 0; M->nz = 0; return M; } for (i = 0; i < M->rows; i++) { M->data[i] = vector_map(addr, last_addr); if (!M->data[i]) { if (MATR_DEBUG_MODE) { fprintf(stderr, "matr_map: bad file.\n"); } break; } } if (i != M->rows) { M->rows = i; matr_free(M); M = NULL; } return M; } /************************************************************************* *Frees all memory associated with a matrix. * *INPUT: M: matrix to free. * *TIME: * NON_SPARSE: O(R) * SPARSE_ARRAY: O(R) * SPARSE_LIST: O(S) * **************************************************************************/ void matr_free(Matrix *M) { unsigned int i; if (!M) { return; } if (M->data) { for (i = 0; i < M->rows; i++) { vector_free(M->data[i]); } free(M->data); } if (!M->was_mapped) { free(M); } } /************************************************************************** *The vector class works with the matrix class. All matrices are arrays *of pointers to vectors. In general, the actual work is done in the vector *class. * *For the times given below, we assume vectors are of length c with s *non-zero elements. 
**************************************************************************/ //Static vector function declarations static void vector_make_nsarray_data(Vector *v, int compact); static void vector_make_sparray_data(Vector *v, int compact, int init_size); static void vector_make_splist_data(Vector *v, int compact); static void vector_add_col_ns(Vector *v); static void vector_add_ncols_ns(Vector *v, unsigned int n); static inline void vector_add_fast(Vector *sp, Vector *ns, Vector *ret); static inline double dot_log(Vector *sp, Vector *ns); static inline double dot_fast(Vector *sp, Vector *ns); static inline void vector_add_multiple_fast(Vector *base, Vector *toadd, double factor); static size_t vector_write_bin_ns(Vector *v, FILE *fp); static void vector_read_bin_ns(Vector *v, FILE *fp); /************************************************************************* *Makes a zero vector. * *INPUT: dim: number of rows/columns in the vector * type: NON_SPARSE, SPARSE_ARRAY, or SPARSE_LIST specifying the data * structure * compact: MATR_COMPACT or MATR_PRECISE specifying whether data is stored * as an int or a double * *OUTPUT: A vector of dimension dim of all zeros with the type and compact * flags set correctly. If the vector is a sparse array, the array * will begin at size MATR_DEFAULT_VECTOR_SIZE. * *TIME: * NON_SPARSE: O(C) * SPARSE_ARRAY: O(1) * SPARSE_LIST: O(1) *************************************************************************/ Vector *vector_make(unsigned int dim, VectorType type, int compact) { return vector_make_size(dim, type, compact, MATR_DEFAULT_VECTOR_SIZE); } /************************************************************************* *Makes a zero vector. 
* *INPUT: dim: number of rows/columns in the vector * type: NON_SPARSE, SPARSE_ARRAY, or SPARSE_LIST specifying the data * structure * compact: MATR_COMPACT or MATR_PRECISE specifying whether data is stored * as an int or a double * size: the starting size of the array if the vector is a SPARSE_ARRAY * *OUTPUT: A vector of dimension dim of all zeros with the type and compact * flags set correctly. If the vector is a sparse array, the array * will begin at size size. * *TIME: * NON_SPARSE: O(C) * SPARSE_ARRAY: O(1) * SPARSE_LIST: O(1) *************************************************************************/ Vector *vector_make_size(unsigned int dim, VectorType type, int compact, int size) { Vector *v = (Vector *)malloc(sizeof(Vector)); v->dim = dim; v->type = type; v->compact = compact; v->size = size; v->was_mapped = 0; switch(type) { case NON_SPARSE: vector_make_nsarray_data(v, compact); break; case SPARSE_ARRAY: vector_make_sparray_data(v, compact, size); break; case SPARSE_LIST: vector_make_splist_data(v, compact); break; default: if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_make: unrecognized type.\n"); } free(v); return NULL; } return v; } //"private" functions for dealing with making the //different types of data structures static void vector_make_nsarray_data(Vector *v, int compact) { unsigned int i; if (!v) { return; } v->nz = v->dim; if (v->dim > 0) { if (compact) { v->data.nsarray.compact = (int *)malloc(sizeof(int)*v->dim); if (!v->data.nsarray.compact) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to malloc data for non-sparse vector.\n"); } return; } } else { v->data.nsarray.precise = (double *)malloc(sizeof(double)*v->dim); if (!v->data.nsarray.precise) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Unable to malloc data for non-sparse vector.\n"); } return; } } for (i = 0; i < v->dim; i++) { vector_set(v, i, 0); } } else { v->data.nsarray.precise = NULL; //pointers are same size doesn't matter } } static void vector_make_sparray_data(Vector *v, int 
compact, int size) { if (!v) { return; } if (size < 0) { size = 0; v->size = 0; } v->nz = 0; v->data.sparray = make_expanding_array(size, compact); if (!v->data.sparray && MATR_DEBUG_MODE) { fprintf(stderr, "warning: no space malloc'd for sparse array vector.\n"); } } static void vector_make_splist_data(Vector *v, int compact) { if (!v) { return; } v->nz = 0; v->data.splist = make_list(compact); if (!v->data.splist && MATR_DEBUG_MODE) { fprintf(stderr, "warning: no space malloc'd for sparse list vector.\n"); } } /************************************************************************* *Copies one vector to another. * *INPUT: from: vector to copy from * *OUTPUT: to = from, a copy of the vector from * *TIME: * NON_SPARSE: O(c) * SPARSE_ARRAY: O(s) * SPARSE_LIST: O(s) * *WARNINGS: *1) from and to cannot point to the same vector (why would you want to do * that anyway?) *************************************************************************/ void vector_copy(Vector *from, Vector *to) { VectorIterator fit, toit; if (!to || !from) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_copy: null arguments.\n"); } return; } if (to->type == SPARSE_ARRAY) { //this is constant time //and prevents us having to move the array //around as we zero elements vector_zero(to); } vectorit_set_at_beg(&fit, from); vectorit_set_at_beg(&toit, to); while (!vectorit_past_end(fit, from) && vectorit_curr_col(fit, from) < to->dim) { if (vectorit_curr_col(toit, to) < vectorit_curr_col(fit, from)) { vectorit_zero_elt(&toit, to); continue; } vectorit_insert(&toit, vectorit_curr_col(fit, from), vectorit_curr_val(fit, from), to); while (vectorit_curr_col(toit, to) <= vectorit_curr_col(fit, from)) { vectorit_next(&toit, to); } vectorit_next(&fit, from); } while (!vectorit_past_end(toit, to)) { vectorit_zero_elt(&toit, to); } } /************************************************************************* *Sets an element of a vector. 
 *
 *INPUT: v: vector in which to set an element
 * i: element to set
 * d: value to set element to
 *
 *TIME:
 * NON_SPARSE: O(1)
 * SPARSE_ARRAY:
 *  Generally: amortized O(lg(s)) if d != 0, O(s) if d = 0
 *  First/Last element: amortized O(1)
 * SPARSE_LIST:
 *  Generally: O(s)
 *  First/Last element: O(1)
 *************************************************************************/

//Sets v[i] = d.  For compact non-sparse vectors d is truncated to int;
//for sparse vectors the iterator insert handles creating, updating, or
//(for d == 0) removing the entry.
inline void vector_set(Vector *v, unsigned int i, double d) {
  VectorIterator vit;
  if (!v) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_set: null vector.\n");
    }
    return;
  }
  //out-of-range writes are rejected, not grown
  if (i >= v->dim) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_set: out of range column %u.\n", i);
    }
    return;
  }
  if (v->type == NON_SPARSE) {
    //direct array store; compact vectors hold ints, precise hold doubles
    if (v->compact) {
      if (v->data.nsarray.compact) {
        v->data.nsarray.compact[i] = (int)d;
      } else if (MATR_DEBUG_MODE) {
        fprintf(stderr, "vector_set: null vector.\n");
      }
    } else {
      if (v->data.nsarray.precise) {
        v->data.nsarray.precise[i] = d;
      } else if (MATR_DEBUG_MODE) {
        fprintf(stderr, "vector_set: null vector.\n");
      }
    }
    return;
  }
  //sparse: let the iterator find the spot and insert/update/remove
  vectorit_set_at_beg(&vit, v);
  vectorit_insert(&vit, i, d, v);
}

/*************************************************************************
 *Gets an element of a vector.
 *
 *INPUT: v: vector from which to get an element
 * i: element to get
 *
 *OUTPUT: The element in the ith column of v.
 *
 *TIME:
 * NON_SPARSE: O(1)
 * SPARSE_ARRAY:
 *  Generally: O(lg(s))
 *  First/Last element: O(1)
 * SPARSE_LIST:
 *  Generally: O(s)
 *  First/Last element: O(1)
 *************************************************************************/

//Returns v[i], or 0 on a null vector, an out-of-range index, or a sparse
//vector with no stored entry at column i.
inline double vector_get(Vector *v, unsigned int i) {
  VectorIterator vit;
  if (!v) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_get: null vector.\n");
    }
    return 0;
  }
  if (i >= v->dim) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_get: out of range column %u.\n", i);
    }
    return 0;
  }
  if (v->type == NON_SPARSE) {
    //direct array load; compact vectors hold ints, precise hold doubles
    if (v->compact) {
      if (v->data.nsarray.compact) {
        return v->data.nsarray.compact[i];
      } else {
        if (MATR_DEBUG_MODE) {
          fprintf(stderr, "vector_get: null vector.\n");
        }
        return 0;
      }
    } else {
      if (v->data.nsarray.precise) {
        return v->data.nsarray.precise[i];
      } else {
        if (MATR_DEBUG_MODE) {
          fprintf(stderr, "vector_get: null vector.\n");
        }
        return 0;
      }
    }
  }
  //sparse: search for column i; a missing entry means the value is zero
  vectorit_set_at_beg(&vit, v);
  vectorit_find(&vit, i, v);
  if (vectorit_curr_col(vit, v) == i) {
    return vectorit_curr_val(vit, v);
  }
  return 0;
}

/*************************************************************************
 *Zero out a vector.
*
*INPUT: v: vector to zero
*
*TIME:
* NON_SPARSE: O(c)
* SPARSE_ARRAY: O(1)
* SPARSE_LIST: O(s)
*************************************************************************/

//Set every element of v to zero.  Non-sparse vectors are cleared in
//place; sparse representations simply drop all stored entries.
inline void vector_zero(Vector *v) {
  unsigned int i;

  if (!v) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_zero: null vector.\n");
    }
    return;
  }
  switch (v->type) {
  case NON_SPARSE:
    {
      //NOTE(review): this guard only rejects the case where BOTH array
      //pointers are NULL; if v->compact disagrees with which union
      //member is populated the loop below could still dereference a
      //bad pointer -- confirm callers keep them consistent.
      if (!(v->data.nsarray.compact) && !(v->data.nsarray.precise)) {
	if (MATR_DEBUG_MODE) {
	  fprintf(stderr, "vector_zero: null vector.\n");
	}
	return;
      }
      for (i = 0; i < v->dim; i++) {
	if (v->compact) {
	  v->data.nsarray.compact[i] = 0;
	} else {
	  v->data.nsarray.precise[i] = 0;
	}
      }
      break;
    }
  case SPARSE_ARRAY:
    {
      if (!v->data.sparray) {
	if (MATR_DEBUG_MODE) {
	  fprintf(stderr, "vector_zero: null vector.\n");
	}
	return;
      }
      //drop all stored entries; remaining columns are implicit zeros
      expanding_array_clear(v->data.sparray);
      v->nz = 0;
      break;
    }
  case SPARSE_LIST:
    {
      if (!v->data.splist) {
	if (MATR_DEBUG_MODE) {
	  fprintf(stderr, "vector_zero: null vector.\n");
	}
	return;
      }
      list_clear(v->data.splist);
      v->nz = 0;
      break;
    }
  default:
    {
      if (MATR_DEBUG_MODE) {
	fprintf(stderr, "vector_zero: unrecognized type.\n");
      }
    }
  }
}

/*************************************************************************
*Add two vectors.
*
*INPUT: v1: first vector
* v2: second vector
*
*OUTPUT: ret = v1 + v2.
note that ret CAN point to v1 or v2
*
*TIME:
* Both NON_SPARSE: O(c)
* One NON_SPARSE, one SPARSE, ret points to the NON_SPARSE: O(s)
* Both SPARSE: O(s_1) + O(s_2)
*************************************************************************/

//Compute ret = v1 + v2 with a three-way merge over the iterators of
//v1, v2 and ret.  When ret aliases v1 or v2 the result iterator shares
//that operand's iterator so the merge updates in place; otherwise a
//separate iterator over ret is heap-allocated for the duration.
void vector_add(Vector *v1, Vector *v2, Vector *ret) {
  VectorIterator vit1, vit2, *vitr;
  unsigned int col1, col2, colr, col;
  double d;

  if (!v1 || !v2 || !ret) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_add: null arguments.\n");
    }
    return;
  }

  //fast paths: sparse + non-sparse accumulated into the non-sparse
  if (v1->type != NON_SPARSE && v2->type == NON_SPARSE && ret == v2) {
    vector_add_fast(v1, v2, ret);
    return;
  }
  if (v1->type == NON_SPARSE && v2->type != NON_SPARSE && ret == v1) {
    vector_add_fast(v2, v1, ret);
    return;
  }

  if (MATR_DEBUG_MODE >= MATR_OPS) {
    fprintf(stderr, "Adding\n\t");
    vector_print(v1);
    fprintf(stderr, "and\n\t");
    vector_print(v2);
    fprintf(stderr, "putting in\n\t");
    vector_print(ret);
  }

  if (ret->type == SPARSE_ARRAY && ret != v1 && ret != v2) {
    //zero out ret
    vector_zero(ret);
  }

  vectorit_set_at_beg(&vit1, v1);
  vectorit_set_at_beg(&vit2, v2);
  //pick the iterator that walks ret: alias one of the operand
  //iterators when ret is an operand, else allocate a fresh one.
  //exactly one of these three branches runs, so vitr is always set.
  if (v1 == ret) {
    vitr = &vit1;
  }
  if (v2 == ret) {
    vitr = &vit2;
  }
  if (v1 != ret && v2 != ret) {
    //NOTE(review): malloc result is not checked before use -- a failed
    //allocation would crash in vectorit_set_at_beg; confirm policy.
    vitr = (VectorIterator *)malloc(sizeof(VectorIterator));
    vectorit_set_at_beg(vitr, ret);
  }

  //merge until all three iterators are exhausted (columns past
  //ret->dim in the operands are ignored)
  while (!vectorit_past_end(*vitr, ret) ||
	 (!vectorit_past_end(vit1, v1) &&
	  vectorit_curr_col(vit1, v1) < ret->dim) ||
	 (!vectorit_past_end(vit2, v2) &&
	  (vectorit_curr_col(vit2, v2) < ret->dim))) {
    col1 = vectorit_curr_col(vit1, v1);
    col2 = vectorit_curr_col(vit2, v2);
    colr = vectorit_curr_col(*vitr, ret);
    if (MATR_DEBUG_MODE >= MATR_OPS_MORE) {
      fprintf(stderr, "col1 = %d, col2 = %d, colr = %d\n", col1, col2, colr);
    }
    //ret has an entry at a column neither operand contributes to:
    //that stale entry must become zero
    if ((colr < col1 || col1 >= v1->dim) &&
	(colr < col2 || col2 >= v2->dim)) {
      vectorit_zero_elt(vitr, ret);
      continue;
    }
    if (col1 == col2 && col1 < v1->dim && col2 < v2->dim) {
      //both operands present at this column
      d = vectorit_curr_val(vit1, v1) + vectorit_curr_val(vit2, v2);
      col = col1;
      //only advance an operand iterator here if it is not the one
      //aliased to ret (the shared one advances after the insert below)
      if (v1 != ret) {
	vectorit_next(&vit1, v1);
      }
      if (v2 != ret) {
	vectorit_next(&vit2, v2);
      }
    } else if (col1 < col2 || col2 == v2->dim) {
      //only v1 contributes at this column
      col = col1;
      d = vectorit_curr_val(vit1, v1);
      if (v1 != ret) {
	vectorit_next(&vit1, v1);
      }
    } else {
      //only v2 contributes at this column
      col = col2;
      d = vectorit_curr_val(vit2, v2);
      if (v2 != ret) {
	vectorit_next(&vit2, v2);
      }
    }
    //near-zero sums are stored as explicit zeros (entry removal)
    if (fabs(d) < SVM_EPSILON) {
      vectorit_zero_elt(vitr, ret);
    } else {
      vectorit_insert(vitr, col, d, ret);
      vectorit_next(vitr, ret);
    }
  }
  if (v1 != ret && v2 != ret) {
    free(vitr);
  }
}

/*************************************************************************
*Multiply a vector by a scalar.
*
*INPUT: v: vector to multiply
* s: scalar to multiply by
*
*OUTPUT: ret = s*v.  ret CAN point to the same vector as v.
*
*TIME:
* NON_SPARSE: O(c)
* SPARSE_LIST: O(s)
* SPARSE_ARRAY: O(s)
*************************************************************************/

//Compute ret = s*v.  A scalar near zero (or an all-zero v) short-cuts
//to zeroing ret.  As in vector_add, the result iterator aliases v's
//iterator when ret == v and is heap-allocated otherwise.
void vector_multiply(Vector *v, double s, Vector *ret) {
  VectorIterator vit, *vitr;
  unsigned int col, colr;

  if (!v || !ret) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_multiply: null arguments.\n");
    }
    return;
  }
  if (MATR_DEBUG_MODE >= MATR_OPS) {
    fprintf(stderr, "multiplying\n\t");
    vector_print(v);
    fprintf(stderr, "by %lf, putting in\n\t", s);
    vector_print(ret);
  }
  if (fabs(s) < SVM_EPSILON || vector_iszero(v)) {
    //zero out vector
    if (MATR_DEBUG_MODE >= MATR_OPS) {
      fprintf(stderr, "zeroing ret.\n");
    }
    vector_zero(ret);
    return;
  }
  if (ret != v && ret->type == SPARSE_ARRAY) {
    vector_zero(ret);
  }
  vectorit_set_at_beg(&vit, v);
  if (ret == v) {
    vitr = &vit;
  } else {
    //NOTE(review): malloc result unchecked, as in vector_add
    vitr = (VectorIterator *)malloc(sizeof(VectorIterator));
    vectorit_set_at_beg(vitr, ret);
  }
  //NOTE(review): vectorit_past_end(*vitr, v) passes v while vitr walks
  //ret -- every comparable loop in this file pairs the iterator with
  //its own vector; looks like it should be ret.  Confirm before use
  //with ret != v and differing representations.
  while (!vectorit_past_end(*vitr, v) ||
	 (!vectorit_past_end(vit, v) &&
	  vectorit_curr_col(vit, v) < ret->dim)) {
    col = vectorit_curr_col(vit, v);
    colr = vectorit_curr_col(*vitr, ret);
    if (MATR_DEBUG_MODE >= MATR_OPS_MORE) {
      fprintf(stderr, "col = %d, colr = %d ret = ", col, colr);
      vector_print(ret);
    }
    //stale entry in ret with no matching source column: zero it
    if (colr < col || col == v->dim) {
      vectorit_zero_elt(vitr, ret);
      continue;
    }
    vectorit_insert(vitr, col, s*vectorit_curr_val(vit, v), ret);
    vectorit_next(&vit, v);
    //advance the result iterator past everything up to col
    while (vectorit_curr_col(*vitr, ret) <= col) {
      vectorit_next(vitr, ret);
    }
  }
  if (ret != v) {
    free(vitr);
  }
}

/*************************************************************************
*Dot two vectors.
*
*INPUT: v1: first vector
* v2: second vector
*
*OUTPUT: v1 dot v2.
*
*TIME:
* Both NON_SPARSE: O(c)
* One NON_SPARSE, one SPARSE: O(s)
* Both SPARSE: O(s_1) + O(s_2)
*************************************************************************/

//Inner product of v1 and v2.  Dispatches to dot_fast for
//sparse/non-sparse pairs and to dot_log when one operand is much
//sparser than a SPARSE_ARRAY operand; otherwise a linear merge.
double dot(Vector *v1, Vector *v2) {
  VectorIterator vit1, vit2;
  unsigned int col1, col2;
  double ret = 0;

  if (!v1 || !v2) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "dot: null arguments.\n");
    }
    return 0;
  }
  if (v1->type != NON_SPARSE && v2->type == NON_SPARSE) {
    return dot_fast(v1, v2);
  }
  if (v1->type == NON_SPARSE && v2->type != NON_SPARSE) {
    return dot_fast(v2, v1);
  }
  //if one side has <= 10% of the other's non-zeros and the denser side
  //is binary-searchable, use the logarithmic variant
  if (v1->type == SPARSE_ARRAY && v2->nz <= 0.1*v1->nz) {
    return dot_log(v2, v1);
  }
  if (v2->type == SPARSE_ARRAY && v1->nz <= 0.1*v2->nz) {
    return dot_log(v1, v2);
  }
  //generic merge: advance whichever iterator is behind, accumulate on
  //matching columns
  vectorit_set_at_beg(&vit1, v1);
  vectorit_set_at_beg(&vit2, v2);
  while (!vectorit_past_end(vit1, v1) && !vectorit_past_end(vit2, v2)) {
    col1 = vectorit_curr_col(vit1, v1);
    col2 = vectorit_curr_col(vit2, v2);
    if (col1 < col2) {
      vectorit_next(&vit1, v1);
      continue;
    }
    if (col2 < col1) {
      vectorit_next(&vit2, v2);
      continue;
    }
    ret += vectorit_curr_val(vit1, v1)*vectorit_curr_val(vit2, v2);
    vectorit_next(&vit1, v1);
    vectorit_next(&vit2, v2);
  }
  return ret;
}

//Compute ret = base + factor*toadd using the same three-iterator merge
//as vector_add.  ret may alias base or toadd.  A near-zero factor
//degenerates to a copy of base.
void vector_add_multiple(Vector *base, Vector *toadd, double factor,
			 Vector *ret) {
  VectorIterator vit1, vit2, *vitr;
  unsigned int col1, col2, colr, col;
  double d;

  if (!base || !toadd || !ret) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_add_multiple: null arguments.\n");
    }
    return;
  }
  if (fabs(factor) < SVM_EPSILON) {
    //factor ~ 0: result is just base
    if (ret != base) {
      vector_copy(base, ret);
    }
    return;
  }
  //fast path: accumulate directly into a non-sparse base
  if (base->type == NON_SPARSE && ret == base) {
    vector_add_multiple_fast(base, toadd, factor);
    return;
  }
  if (MATR_DEBUG_MODE >= MATR_OPS) {
    fprintf(stderr, "Adding to\n\t");
    vector_print(base);
    fprintf(stderr, "Multiplying\n\t");
    vector_print(toadd);
    fprintf(stderr, "by %lf and putting in\n\t", factor);
    vector_print(ret);
  }
  if (ret->type == SPARSE_ARRAY && ret != base && ret != toadd) {
    //zero out ret
    vector_zero(ret);
  }
  vectorit_set_at_beg(&vit1, base);
  vectorit_set_at_beg(&vit2, toadd);
  //exactly one of the three branches selects/allocates vitr
  if (base == ret) {
    vitr = &vit1;
  }
  if (toadd == ret) {
    vitr = &vit2;
  }
  if (base != ret && toadd != ret) {
    //NOTE(review): malloc result unchecked, as in vector_add
    vitr = (VectorIterator *)malloc(sizeof(VectorIterator));
    vectorit_set_at_beg(vitr, ret);
  }
  while (!vectorit_past_end(*vitr, ret) ||
	 (!vectorit_past_end(vit1, base) &&
	  vectorit_curr_col(vit1, base) < ret->dim) ||
	 (!vectorit_past_end(vit2, toadd) &&
	  (vectorit_curr_col(vit2, toadd) < ret->dim))) {
    col1 = vectorit_curr_col(vit1, base);
    col2 = vectorit_curr_col(vit2, toadd);
    colr = vectorit_curr_col(*vitr, ret);
    if (MATR_DEBUG_MODE >= MATR_OPS_MORE) {
      fprintf(stderr, "col1 = %d, col2 = %d, colr = %d\n", col1, col2, colr);
    }
    //stale entry in ret not touched by either operand: zero it
    if ((colr < col1 || col1 >= base->dim) &&
	(colr < col2 || col2 >= toadd->dim)) {
      vectorit_zero_elt(vitr, ret);
      continue;
    }
    if (col1 == col2 && col1 < base->dim && col2 < toadd->dim) {
      d = vectorit_curr_val(vit1, base) +
	factor*vectorit_curr_val(vit2, toadd);
      col = col1;
      if (base != ret) {
	vectorit_next(&vit1, base);
      }
      if (toadd != ret) {
	vectorit_next(&vit2, toadd);
      }
    } else if (col1 < col2 || col2 == toadd->dim) {
      col = col1;
      d = vectorit_curr_val(vit1, base);
      if (base != ret) {
	vectorit_next(&vit1, base);
      }
    } else {
      col = col2;
      d = factor*vectorit_curr_val(vit2, toadd);
      if (toadd != ret) {
	vectorit_next(&vit2, toadd);
      }
    }
    if (fabs(d) < SVM_EPSILON) {
      vectorit_zero_elt(vitr, ret);
    } else {
      vectorit_insert(vitr, col, d, ret);
      vectorit_next(vitr, ret);
    }
  }
  if (base != ret && toadd != ret) {
    free(vitr);
  }
}

/*************************************************************************
*Check if a vector is all zeros.
*
*INPUT: v: vector to check
*
*OUTPUT: 1 if v is all zeros, 0 else
*
*TIME:
* NON_SPARSE: O(c)
* SPARSE_ARRAY: O(1)
* SPARSE_LIST: O(1)
*************************************************************************/

//Report whether v has no non-zero elements.  Null or malformed vectors
//are treated as zero (with a debug diagnostic).
inline int vector_iszero(Vector *v) {
  unsigned int col;

  if (v == NULL) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_iszero: null vector.\n");
    }
    return 1;
  }

  if (v->type == NON_SPARSE) {
    //dense case: scan every column for a non-zero entry
    for (col = 0; col < v->dim; col++) {
      if (vector_get(v, col)) {
	return 0;
      }
    }
    return 1;
  }

  if (v->type == SPARSE_ARRAY) {
    //sparse array: zero iff it stores no elements
    if (v->data.sparray == NULL) {
      if (MATR_DEBUG_MODE) {
	fprintf(stderr, "vector_iszero: null vector.\n");
      }
      return 1;
    }
    return v->data.sparray->n_elts == 0;
  }

  if (v->type == SPARSE_LIST) {
    //sparse list: zero iff the list is empty
    if (v->data.splist == NULL) {
      if (MATR_DEBUG_MODE) {
	fprintf(stderr, "vector_iszero: null vector.\n");
      }
      return 1;
    }
    return list_is_empty(v->data.splist);
  }

  //unknown representation: treat as zero
  if (MATR_DEBUG_MODE) {
    fprintf(stderr, "vector_iszero: unrecognized type.\n");
  }
  return 1;
}

/*************************************************************************
*Check if two vectors have the same content (regardless of representation).
*
*INPUT: v1, v2: vectors to check
*
*OUTPUT: 1 if v1 = v2 in the content sense, 0 else
*
*TIME:
* NON_SPARSE, SPARSE: O(c) + O(s)
* SPARSE, SPARSE: O(s_1) + O(s_2)
*************************************************************************/

//Content equality across representations: a non-sparse explicit zero
//matches a sparse missing entry.  Comparison is exact (==) on values
//except for the near-zero skip below.
int vector_equals(Vector *v1, Vector *v2) {
  VectorIterator vit1, vit2;
  unsigned int col1, col2;

  if (v1 == v2) {
    return 1;  //same object, trivially equal
  }
  if (!v1 || !v2) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_equals: null arguments.\n");
    }
    return 0;
  }
  vectorit_set_at_beg(&vit1, v1);
  vectorit_set_at_beg(&vit2, v2);
  while ((!vectorit_past_end(vit1, v1) &&
	  vectorit_curr_col(vit1, v1) < v2->dim) ||
	 (!vectorit_past_end(vit2, v2) &&
	  vectorit_curr_col(vit2, v2) < v1->dim)) {
    col1 = vectorit_curr_col(vit1, v1);
    col2 = vectorit_curr_col(vit2, v2);
    if (col1 != col2) {
      //check for non-sparse representations having 0's where
      //sparse representations have no entry
      if (v1->type == NON_SPARSE && col1 < col2 &&
	  fabs(vectorit_curr_val(vit1, v1)) < SVM_EPSILON) {
	vectorit_next(&vit1, v1);
	continue;
      }
      if (v2->type == NON_SPARSE && col2 < col1 &&
	  fabs(vectorit_curr_val(vit2, v2)) < SVM_EPSILON) {
	vectorit_next(&vit2, v2);
	continue;
      }
      return 0;  //a genuine non-zero exists in only one vector
    }
    if (vectorit_curr_val(vit1, v1) != vectorit_curr_val(vit2, v2)) {
      return 0;
    }
    vectorit_next(&vit1, v1);
    vectorit_next(&vit2, v2);
  }
  return 1;
}

//"private" function for adding a sparse and a non-sparse
//vector quickly.  Requires ret == ns, sp sparse, ns non-sparse;
//falls back to the general vector_add otherwise.
static inline void vector_add_fast(Vector *sp, Vector *ns, Vector *ret) {
  VectorIterator vit;
  unsigned int col;

  if (!sp || !ns || !ret) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_add: null arguments.\n");
    }
    return;
  }
  if (ret != ns || sp->type == NON_SPARSE || ns->type != NON_SPARSE) {
    //preconditions violated: delegate to the general implementation
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_add_fast: Wrong sparseness.\n");
    }
    vector_add(sp, ns, ret);
    return;
  }
  //walk only sp's non-zeros and accumulate into the dense ns/ret
  vectorit_set_at_beg(&vit, sp);
  while (!vectorit_past_end(vit, sp)) {
    col = vectorit_curr_col(vit, sp);
    vector_set(ret, col, vectorit_curr_val(vit, sp) + vector_get(ns, col));
    vectorit_next(&vit, sp);
  }
}

static
//"private" helper: base += factor*toadd, for a NON_SPARSE base.
//(The preceding "static" keyword completes this definition's linkage.)
inline void vector_add_multiple_fast(Vector *base, Vector *toadd,
				     double factor) {
  VectorIterator vit;
  int j;

  if (!base || !toadd) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "add multiple: null arguments.\n");
    }
    return;
  }
  if (base->type != NON_SPARSE) {
    //warn but continue: the generic loop below still works via
    //vector_set/vector_get, just without the speed benefit
    if (MATR_DEBUG_MODE) {
      fprintf
	(stderr,
	 "Warning: Called add_multiple_fast with wrong sparseness.\n");
    }
  }
  if (fabs(factor) < SVM_EPSILON) {
    return;  //adding ~0*toadd is a no-op
  }
  //a common combination that we want to be screaming fast:
  //compact sparse-array toadd accumulated into a precise dense base --
  //index the raw arrays directly instead of going through iterators
  if (toadd->type == SPARSE_ARRAY && toadd->compact && !(base->compact)
      && toadd->data.sparray && base->type == NON_SPARSE
      && base->data.nsarray.precise) {
    for (j = toadd->data.sparray->first_elt;
	 j <= toadd->data.sparray->last_elt; j++) {
      base->data.nsarray.precise[toadd->data.sparray->data.compact[j].s.col]
	+= factor*toadd->data.sparray->data.compact[j].s.data;
    }
    return;
  }
  //generic fallback over toadd's non-zeros
  vectorit_set_at_beg(&vit, toadd);
  while (!vectorit_past_end(vit, toadd)) {
    vector_set(base, vectorit_curr_col(vit, toadd),
	       vector_get(base, vectorit_curr_col(vit, toadd)) +
	       factor*vectorit_curr_val(vit, toadd));
    vectorit_next(&vit, toadd);
  }
}

//"private" function for dotting a sparse and a non-sparse vector
//quickly: walk only sp's non-zeros, reading ns by direct index.
static inline double dot_fast(Vector *sp, Vector *ns) {
  VectorIterator vit;
  double ret = 0;
  int j;

  if (!sp || !ns) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "dot: null arguments.\n");
    }
    return 0;
  }
  if (ns->type != NON_SPARSE) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr,
	      "Warning: Called dot_fast with incorrect sparseness.\n");
    }
  }
  //this particular combination of types comes up often -- we want it
  //to be as fast as possible, so use ugly direct-array code
  if (sp->type == SPARSE_ARRAY && sp->compact && !(ns->compact) &&
      sp->data.sparray && ns->type == NON_SPARSE &&
      ns->data.nsarray.precise) {
    for (j = sp->data.sparray->first_elt;
	 j <= sp->data.sparray->last_elt; j++){
      ret += sp->data.sparray->data.compact[j].s.data*
	ns->data.nsarray.precise[sp->data.sparray->data.compact[j].s.col];
    }
    return ret;
  }
  //this is still fairly fast for all other cases and much prettier =D
  vectorit_set_at_beg(&vit, sp);
  while (!vectorit_past_end(vit, sp)) {
    ret += vectorit_curr_val(vit, sp)*
      vector_get(ns, vectorit_curr_col(vit, sp));
    vectorit_next(&vit, sp);
  }
  return ret;
}

//private function for dotting a sparse array and another much sparser
//vector quickly using a binary search (vectorit_find) in the sparse
//array.  NOTE(review): despite the parameter names, here "ns" is the
//SPARSE_ARRAY searched and "sp" is the sparser vector iterated --
//matches how dot() calls it; confirm before reuse.
static inline double dot_log(Vector *sp, Vector *ns) {
  VectorIterator vit, nit;
  double ret = 0;

  if (!sp || !ns) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "dot: null arguments.\n");
    }
    return 0;
  }
  if (ns->type != SPARSE_ARRAY) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr,
	      "Warning: Called dot_log with incorrect sparseness.\n");
    }
    //return dot(sp, ns);
  }
  if (sp->type == NON_SPARSE) {
    //this is faster
    return dot_fast(ns, sp);
  }
  //for each non-zero of sp, binary-search the matching column in ns
  vectorit_set_at_beg(&vit, sp);
  vectorit_set_at_beg(&nit, ns);
  while (!vectorit_past_end(vit, sp)) {
    vectorit_find(&nit, vectorit_curr_col(vit, sp), ns);
    if (vectorit_curr_col(nit, ns) == vectorit_curr_col(vit, sp)) {
      ret += vectorit_curr_val(vit, sp)*vectorit_curr_val(nit, ns);
    }
    vectorit_next(&vit, sp);
  }
  return ret;
}

/*************************************************************************
*Add a column to the end of the vector.
* *INPUT: v: vector to add a column to * *TIME: * NON_SPARSE: O(1) (realloc succeeds) O(c) (realloc fails) * SPARSE_ARRAY: O(1) * SPARSE_LIST: O(1) *************************************************************************/ void vector_add_col(Vector *v) { if (!v) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_add_col: null vector.\n"); } return; } if (v->type != NON_SPARSE) { v->dim++; //well, this is easy return; } vector_add_col_ns(v); } //"private" function to add a column to //a non-sparse vector static void vector_add_col_ns(Vector *v) { NSData tmpdata; if (!v) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_add_col: null vector.\n"); } return; } if (v->compact) { if (!(v->was_mapped) || (v->was_mapped && (void *)v + sizeof(Vector) != (void *)(v->data.nsarray.compact))) { v->data.nsarray.compact = (int *)realloc(v->data.nsarray.compact, sizeof(int)*(v->dim+1)); } else { tmpdata.compact = v->data.nsarray.compact; v->data.nsarray.compact = (int *)malloc(sizeof(int)*(v->dim+1)); if (v->data.nsarray.compact) { memcpy(v->data.nsarray.compact, tmpdata.compact, sizeof(int)*v->dim); } } if (!v->data.nsarray.compact) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Error adding a column to non-sparse vector.\n"); } v->dim = 0; v->nz = 0; return; } } else { if (!(v->was_mapped) || (v->was_mapped && (void *)v + sizeof(Vector) != (void *)(v->data.nsarray.precise))) { v->data.nsarray.precise = (double *)realloc(v->data.nsarray.precise, sizeof(double)*(v->dim+1)); } else { tmpdata.precise = v->data.nsarray.precise; v->data.nsarray.precise = (double *)malloc(sizeof(double)*(v->dim+1)); if (v->data.nsarray.precise) { memcpy(v->data.nsarray.precise, tmpdata.precise, sizeof(double)*v->dim); } } if (!v->data.nsarray.precise) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Error adding a column to non-sparse vector.\n"); } v->dim = 0; v->nz = 0; return; } } v->dim++; vector_set(v, v->dim-1, 0); } /************************************************************************* *Add n columns to 
the end of the vector. * *INPUT: v: vector to add columns to * *TIME: * NON_SPARSE: O(1) (realloc succeeds) O(c) (realloc fails) * SPARSE_ARRAY: O(1) * SPARSE_LIST: O(1) *************************************************************************/ void vector_add_ncols(Vector *v, unsigned int n) { if (!v) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_add_ncols: null vector.\n"); } return; } if (n <= 0) { return; } if (v->type != NON_SPARSE) { v->dim += n; } else { vector_add_ncols_ns(v, n); } } //"private" function to add n columns to a non-sparse vector static void vector_add_ncols_ns(Vector *v, unsigned int n) { unsigned int i; NSData tmpdata; if (!v) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_add_ncols: null vector.\n"); } return; } if (n <= 0){ return; } if (v->compact) { if (!(v->was_mapped) || (v->was_mapped && (void *)v + sizeof(Vector) != (void *)(v->data.nsarray.compact))) { v->data.nsarray.compact = (int *)realloc(v->data.nsarray.compact, sizeof(int)*(v->dim+n)); } else { tmpdata.compact = v->data.nsarray.compact; v->data.nsarray.compact = (int *)malloc(sizeof(int)*(v->dim+n)); if (v->data.nsarray.compact) { memcpy(v->data.nsarray.compact, tmpdata.compact, sizeof(int)*v->dim); } } if (!v->data.nsarray.compact) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Error adding a column to non-sparse vector.\n"); } v->dim = 0; v->nz = 0; return; } } else { if (!(v->was_mapped) || (v->was_mapped && (void *)v + sizeof(Vector) != (void *)(v->data.nsarray.precise))) { v->data.nsarray.precise = (double *)realloc(v->data.nsarray.precise, sizeof(double)*(v->dim+n)); } else { tmpdata.precise = v->data.nsarray.precise; v->data.nsarray.precise = (double *)malloc(sizeof(double)*(v->dim+n)); if (v->data.nsarray.precise) { memcpy(v->data.nsarray.precise, tmpdata.precise, sizeof(double)*v->dim); } } if (!v->data.nsarray.precise) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Error adding a column to non-sparse vector.\n"); } v->dim = 0; v->nz = 0; return; } } v->dim += n; for (i = 
v->dim-n; i < v->dim; i++) { vector_set(v, i, 0); } } /************************************************************************* *Remove a column from a vector. * *INPUT: v: vector from which to remove a column * c: column to remove * *TIME: * NON_SPARSE: O(c) * SPARSE_ARRAY: O(s) * SPARSE_LIST: O(s) *************************************************************************/ void vector_remove_col(Vector *v, unsigned int c) { VectorIterator vit; int remove = 0, i; double d; if (!v) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_remove_col: null vector.\n"); } return; } if (c >= v->dim) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_remove_col: attempt to remove nonexistant column.\n"); } return; } vectorit_set_at_beg(&vit, v); vectorit_find(&vit, c, v); if (vectorit_curr_col(vit, v) == c) { remove = 1; } //just make sure we're pointing after or at c while (vectorit_curr_col(vit, v) < c) { vectorit_next(&vit, v); } if (v->type == NON_SPARSE) { if (v->dim == 1) { if (v->compact && v->data.nsarray.compact) { free(v->data.nsarray.compact); } else if (v->data.nsarray.precise) { free(v->data.nsarray.precise); } v->data.nsarray.precise = NULL; v->dim = 0; return; } d = vector_get(v, v->dim-1); if (v->compact) { if (!(v->was_mapped) || (v->was_mapped && (void *)v + sizeof(Vector) != (void *)(v->data.nsarray.compact))) { v->data.nsarray.compact = (int *)realloc(v->data.nsarray.compact, sizeof(int)*(v->dim-1)); } else if (v->dim-1 <= 0) { //otherwise v->data is mapped in memory and all is good v->data.nsarray.compact = NULL; } if (!v->data.nsarray.compact && v->dim > 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Error removing a column from non-sparse vector.\n"); } v->dim = 0; v->nz = 0; return; } } else { if (!(v->was_mapped) || (v->was_mapped && (void *)v + sizeof(Vector) != (void *)(v->data.nsarray.compact))) { v->data.nsarray.precise = (double *)realloc(v->data.nsarray.precise, sizeof(double)*(v->dim-1)); } else if (v->dim-1 <= 0) { v->data.nsarray.precise = NULL; } 
if (!v->data.nsarray.precise && v->dim > 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Error removing a column from non-sparse vector.\n"); } v->dim = 0; v->nz = 0; return; } } if (v->dim >= 2) { for (i = c; i < v->dim-2; i++) { vector_set(v, i, vector_get(v, i+1)); } } if ((v->dim >= 1 && c < v->dim-1)) { vector_set(v, v->dim-2, d); } v->dim--; return; } if (remove) { vectorit_zero_elt(&vit, v); } while (!vectorit_past_end(vit, v)) { vectorit_set_col(vit, vectorit_curr_col(vit, v) - 1, v); vectorit_next(&vit, v); } v->dim--; } /************************************************************************* *Squared distance between two vectors. * *INPUT: v1: first vector * v2: second vector * *OUTPUT: ||v1 - v2||^2 * *TIME: * Both NON_SPARSE: O(c) * One NON_SPARSE, one SPARSE: O(s) + O(c) * Both SPARSE: O(s_1) + O(s_2) *************************************************************************/ double vector_dist2(Vector *v1, Vector *v2) { VectorIterator vit1, vit2; unsigned int col1, col2; double ret = 0, d; if (!v1 || !v2) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_dist2: null arguments.\n"); } return -1; } if (v1->dim != v2->dim) { //uh oh if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_dist2: dimension mismatch\n"); } return -1; } vectorit_set_at_beg(&vit1, v1); vectorit_set_at_beg(&vit2, v2); while (!vectorit_past_end(vit1, v1) || !vectorit_past_end(vit2, v2)) { col1 = vectorit_curr_col(vit1, v1); col2 = vectorit_curr_col(vit2, v2); if (col1 == col2) { d = vectorit_curr_val(vit1, v1) - vectorit_curr_val(vit2, v2); vectorit_next(&vit1, v1); vectorit_next(&vit2, v2); } else if (col1 < col2) { d = vectorit_curr_val(vit1, v1); vectorit_next(&vit1, v1); } else { d = vectorit_curr_val(vit2, v2); vectorit_next(&vit2, v2); } ret += d*d; } return ret; } /************************************************************************* *Converts a NON_SPARSE vector to a sparse array using colMap. 
*
*INPUT: v: vector to convert
* colMap: array such that if c is a column of v when v is non-sparse,
* that column will have value colMap[c] when v is sparse.  this can be
* used to "undo" removing zero columns.
*
*TIME:
* NON_SPARSE: O(c)
* SPARSE_ARRAY: --
* SPARSE_LIST: --
*************************************************************************/

//Convert v in place from NON_SPARSE to SPARSE_ARRAY, remapping column
//indices through colMap.  The old dense data is stashed in tmpv while
//the sparse storage is built, then freed on success.
void vector_convert_nonsparse_to_sparray(Vector *v, ExpandingArray *colMap) {
  Vector tmpv;
  int i;
  VectorIterator vit;
  ExpandingType et;

  if (!v || !colMap) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_convert: null arguments.\n");
    }
    return;
  }
  if (v->type != NON_SPARSE) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr,
	      "Attempt to convert already sparse vector to sparse.\n");
    }
    return;
  }
  //probe the last entry of colMap to (a) verify it is big enough and
  //(b) learn the largest mapped column, which becomes the new dim
  et = expanding_array_get(v->dim-1, colMap);
  if (!et.precise || !et.compact) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr,
	      "vector_convert: colMap doesn't have enough entries.\n");
    }
    return;
  }
  //stash the dense representation so we can read values from it (and
  //restore it if conversion fails)
  tmpv.type = NON_SPARSE;
  tmpv.size = v->size;
  tmpv.dim = v->dim;
  tmpv.nz = v->nz;
  tmpv.compact = v->compact;
  tmpv.data = v->data;
  v->type = SPARSE_ARRAY;
  v->size = v->dim;
  v->dim = et.compact->i+1;
  vector_make_sparray_data(v, v->compact, v->size);
  if (!v->data.sparray ||
      (v->compact && !v->data.sparray->data.compact) ||
      (!(v->compact) && !v->data.sparray->data.precise)) {
    if (MATR_DEBUG_MODE) {
      fprintf
	(stderr,
	 "vector_convert: unable to convert vector. It appears corrupted.\n");
    }
    //restore the dense representation untouched
    v->type = tmpv.type;
    v->size = tmpv.size;
    v->dim = tmpv.dim;
    v->data = tmpv.data;
    return;
  }
  //copy each dense value into the sparse array at its mapped column
  vectorit_set_at_beg(&vit, v);
  for (i = 0; i < tmpv.dim; i++) {
    et = expanding_array_get(i, colMap);
    if (!et.precise || !et.compact) {
      if (MATR_DEBUG_MODE) {
	fprintf(stderr,
		"vector_convert: colMap doesn't have enough entries.\n");
      }
      if (v->compact) {
	free(v->data.sparray->data.compact);
      } else {
	free(v->data.sparray->data.precise);
      }
      //NOTE(review): unlike the error path above, this one restores
      //type/size/dim but NOT v->data (left pointing at the sparray
      //container whose data was just freed), and tmpv's dense buffer
      //is neither restored nor freed.  Looks like a leak/corruption
      //bug on this path -- confirm and restore v->data = tmpv.data.
      v->type = tmpv.type;
      v->size = tmpv.size;
      v->dim = tmpv.dim;
      return;
    }
    vectorit_insert(&vit, et.compact->i, vector_get(&tmpv, i), v);
  }
  //success: release the old dense buffer
  if (tmpv.compact) {
    free(tmpv.data.nsarray.compact);
  } else {
    free(tmpv.data.nsarray.precise);
  }
}

/*************************************************************************
*Print a vector to stdout.  Puts the zeros back in the vector.
*
*INPUT: v: vector to print
*
*TIME:
* NON_SPARSE: O(c)
* SPARSE_ARRAY: O(c)
* SPARSE_LIST: O(c)
*************************************************************************/

//Print v (to stderr, despite the header comment) as a bracketed dense
//row, writing explicit 0.0 for columns a sparse vector omits.
void vector_print(Vector *v) {
  VectorIterator vit;
  int lastcol = -1, i, col;

  if (!v) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_print: null vector.\n");
    }
    return;
  }
  fprintf(stderr, "[");
  vectorit_set_at_beg(&vit, v);
  while (!vectorit_past_end(vit, v)) {
    col = vectorit_curr_col(vit, v);
    //fill the gap since the previous stored column with zeros
    for (i = lastcol+1; i < col; i++) {
      fprintf(stderr, "%20.10lf", 0.0);
    }
    fprintf(stderr, "%20.10lf", vectorit_curr_val(vit, v));
    lastcol = col;
    vectorit_next(&vit, v);
  }
  //trailing zeros out to the full dimension
  for (i = lastcol+1; i < v->dim; i++) {
    fprintf(stderr, "%20.10lf", 0.0);
  }
  fprintf(stderr, "]\n");
}

/*************************************************************************
*Write a vector to a file.  Writes everything in non-sparse format.
*
*INPUT: v: vector to write
* filename: file to write the vector to
*
*TIME:
* NON_SPARSE: O(c)
* SPARSE_ARRAY: O(c)
* SPARSE_LIST: O(c)
*************************************************************************/

//Open filename for writing and dump v in dense text format.
void vector_write(Vector *v, char *filename) {
  FILE *out;

  out = fopen(filename, "w");
  if (out == NULL) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_write: Bad file name %s\n", filename);
    }
    return;
  }
  vector_write_fp(v, out);
  fclose(out);
}

/*************************************************************************
*Write a vector to a file.  Writes everything in non-sparse format.
*
*INPUT: v: vector to write
* out: pointer to the open file to write the vector to
*
*TIME:
* NON_SPARSE: O(c)
* SPARSE_ARRAY: O(c)
* SPARSE_LIST: O(c)
*************************************************************************/

//Write v to out as space-separated values, emitting an explicit "0 "
//for every column a sparse representation leaves implicit.
void vector_write_fp(Vector *v, FILE *out) {
  VectorIterator it;
  int prev = -1, gap, c;

  if (v == NULL || out == NULL) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "vector_write: null arguments.\n");
    }
    return;
  }
  for (vectorit_set_at_beg(&it, v);
       !vectorit_past_end(it, v);
       vectorit_next(&it, v)) {
    c = vectorit_curr_col(it, v);
    //zero-fill the gap between the previous stored column and this one
    for (gap = prev + 1; gap < c; gap++) {
      fprintf(out, "0 ");
    }
    prev = c;
    fprintf(out, "%f ", vectorit_curr_val(it, v));
  }
  //zero-fill out to the full dimension
  for (gap = prev + 1; gap < v->dim; gap++) {
    fprintf(out, "0 ");
  }
}

/*************************************************************************
*Write a vector to a file.  Writes everything (including non-sparse vectors!)
*in sparse format.
* *INPUT: v: vector to write * filname: file to write vector to * *TIME: * NON_SPARSE: O(c) * SPARSE_ARRAY: O(s) * SPARSE_LIST: O(s) *************************************************************************/ void vector_write_sp(Vector *v, char *filename) { FILE *out; out = fopen(filename, "w"); if (!out) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_write_sp: bad output filename %s\n", filename); } return; } vector_write_sp_fp(v, out); fclose(out); } /************************************************************************* *Write a vector to a file. Writes everything in sparse format. * *INPUT: v: vector to write * out: pointer to file to write vector to * *TIME: * NON_SPARSE: O(c) * SPARSE_ARRAY: O(s) * SPARSE_LIST: O(s) *************************************************************************/ void vector_write_sp_fp(Vector *v, FILE *out) { VectorIterator vit; if (!v || !out) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_write: null arguments.\n"); } return; } vectorit_set_at_beg(&vit, v); while (!vectorit_past_end(vit, v)) { fprintf(out, "%u %lf\n", vectorit_curr_col(vit, v), vectorit_curr_val(vit, v)); vectorit_next(&vit, v); } } /************************************************************************* *Write a vector to a file. Writes everything in binary format. * *INPUT: v: vector to write * filename: file to write vector to * *OUTPUT: number of bytes written. * *TIME: * NON_SPARSE: O(c) * SPARSE_ARRAY: O(s) * SPARSE_LIST: O(s) *************************************************************************/ size_t vector_write_bin(Vector *v, char *filename) { size_t size; FILE *fp = fopen(filename, "wb"); if (!fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_write_bin: Bad file name %s.\n", filename); } return 0; } size = vector_write_bin_fp(v, fp); fclose(fp); return size; } /************************************************************************* *Write a vector to a file. Writes everything in binary format. 
* *INPUT: v: vector to write * fp: file to write vector to * *OUTPUT: number of bytes written * *TIME: * NON_SPARSE: O(c) * SPARSE_ARRAY: O(s) * SPARSE_LIST: O(s) *************************************************************************/ size_t vector_write_bin_fp(Vector *v, FILE *fp) { size_t size = sizeof(Vector)*fwrite(v, sizeof(Vector), 1, fp); if (!v || !fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_write: null arguments.\n"); } return 0; } switch(v->type) { case NON_SPARSE: { size += vector_write_bin_ns(v, fp); return size; } case SPARSE_ARRAY: { size += expanding_array_write(v->data.sparray, fp); return size; } case SPARSE_LIST: { size += list_write(v->data.splist, fp); return size; } default: { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_write_bin_fp: unrecognized type\n"); } return size; } } } //"private" function to write non-sparse vector to file in binary static size_t vector_write_bin_ns(Vector *v, FILE *fp) { if (!v || !fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_write: null arguments.\n"); } return 0; } if (v->type != NON_SPARSE) { return vector_write_bin_fp(v, fp); } if (v->compact) { return sizeof(int)*fwrite(v->data.nsarray.compact, sizeof(int), v->dim, fp); } return sizeof(double)*fwrite(v->data.nsarray.precise, sizeof(double), v->dim, fp); } /************************************************************************* *Read a vector from a binary file. * *INPUT: filename: file to read vector from * *OUTPUT: vector in file or NULL if the file is incorrectly formatted * *TIME: * NON_SPARSE: O(c) * SPARSE_ARRAY: O(s) * SPARSE_LIST: O(s) * *WARNINGS: *1) This expects a binary file formatted as vector_write_bin does. If * the file is incorrectly formatted, this may return NULL or it may * return some weird interpretation. Check the output! 
*************************************************************************/ Vector *vector_read_bin(char *filename) { Vector *v; FILE *fp = fopen(filename, "rb"); if (!fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_write_bin: Bad file name %s.\n", filename); } return NULL; } v = vector_read_bin_fp(fp); fclose(fp); return v; } /************************************************************************* *Read a vector from a binary file. * *INPUT: fp: file to read vector from * *OUTPUT: vector in file or NULL if the file is incorrectly formatted * *TIME: * NON_SPARSE: O(c) * SPARSE_ARRAY: O(s) * SPARSE_LIST: O(s) * *WARNINGS: *1) This expects a binary file formatted as vector_write_bin does. If * the file is incorrectly formatted, this may return NULL or it may * return some weird interpretation. Check the output! *************************************************************************/ Vector *vector_read_bin_fp(FILE *fp) { Vector tmpv, *v; size_t amount_read; amount_read = fread(&tmpv, sizeof(Vector), 1, fp); if (!(amount_read)) { return NULL; } v = vector_make_size(tmpv.dim, tmpv.type, tmpv.compact, 0); if (!v) { return NULL; } v->nz = tmpv.nz; switch(v->type) { case NON_SPARSE: { vector_read_bin_ns(v, fp); return v; } case SPARSE_ARRAY: { if (v->nz && !v->data.sparray) { if (MATR_DEBUG_MODE) { fprintf (stderr, "warning: no space allocated for non-zero sparse array vector.\n"); } v->nz = 0; return v; } expanding_array_read(v->data.sparray, fp); return v; } case SPARSE_LIST: { if (v->nz && !(v->data.splist)) { if (MATR_DEBUG_MODE) { fprintf (stderr, "warning: no space allocated for non-zero sparse list vector.\n"); } v->nz = 0; return v; } v->nz = list_read(v->data.splist, fp, v->nz); return v; } default: { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_read_bin_fp: unrecognized type.\n"); } return v; } } } //private function to read a non-sparse vector from a binary file static void vector_read_bin_ns(Vector *v, FILE *fp) { size_t amount_read = 0; if (v->type 
!= NON_SPARSE) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Called vector_read_bin_ns on non-sparse vector.\n"); } return; } if (v->compact) { if (v->data.nsarray.compact) { amount_read = fread(v->data.nsarray.compact, sizeof(int), v->dim, fp); } } else { if (v->data.nsarray.precise) { amount_read = fread(v->data.nsarray.precise, sizeof(double), v->dim, fp); } } if (v->dim && !amount_read) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Warning: nothing was read into non-sparse vector.\n"); } v->dim = 0; } } /***************************************************************************** *Converts data stored at *addr into a matrix. * *INPUT: addr: a pointer to the address at which the matrix is stored. * last_addr: the last possible address that is valid. NOT necessarily where * the list ends - just the last address that has been allocated in the * chunk pointed to by *addr (ie, if *addr was taken from an mmap'd file * last_addr would be *addr + the file size). * *OUTPUT: A matrix STILL referencing the chunk of memory pointed to by *addr * although with its OWN, newly malloc'd row list. * *addr: (pass-by-reference) points to the first address AFTER the full * matrix. *WARNINGS: * 1) *addr needs to be writable. This will CHANGE VALUES stored at *addr and * will seg fault if addr is not writable. * 2) last_addr does not need to be the last address of the list * but if it is before that, either NULL will be returned or a * matrix with a NULL data value will be returned. * 3) if *addr does not contain a properly formatted vector, this function * will not seg fault, but that is the only guarantee. * 4) you MUST call matr_free on this matrix AS WELL AS freeing memory * stored at *addr. * 5) *addr CHANGES! * 6) This was one of the last functions I added to the library and one of the * likeliest to cause memory errors and seg faults. 
I've done a good bit * of testing on it, but this function and memory-mapped objects in * general (specifically I'd be most suspicious of SPARSE_LIST's and * NON_SPARSE vectors since they aren't actually mapped anywhere in the * SVM implementation although, of course, I did test them separately) * are the likeliest to break the library. I appologize. ****************************************************************************/ Vector *vector_map(void **addr, void *last_addr) { Vector *v; if (!addr || !*addr || !last_addr || *addr >= last_addr) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_map: null arguments.\n"); } return NULL; } if (*addr + sizeof(Vector) > last_addr) { return NULL; } //tmpv = (Vector *)(*addr); v = (Vector *)(*addr); *addr += sizeof(Vector); v->was_mapped = 1; switch (v->type) { case NON_SPARSE: { if (v->compact) { if (v->dim > 0 && *addr + sizeof(int)*v->dim <= last_addr) { v->data.nsarray.compact = (int *)(*addr); addr += sizeof(int)*v->dim; } else { if (v->dim && MATR_DEBUG_MODE) { fprintf (stderr, "warning: no space allocated for non-sparse vector data.\n"); } v->data.nsarray.compact = NULL; } return v; } if (v->dim > 0 && *addr + sizeof(double)*v->dim <= last_addr) { v->data.nsarray.precise = (double *)(*addr); addr += sizeof(double)*v->dim; } else { if (v->dim && MATR_DEBUG_MODE) { fprintf (stderr, "warning: no space allocated for non-sparse vector data.\n"); } v->data.nsarray.precise = NULL; } return v; return NULL; } case SPARSE_ARRAY: { v->data.sparray = expanding_array_map(addr, last_addr); if (v->nz && !v->data.sparray) { if (MATR_DEBUG_MODE) { fprintf (stderr, "warning: no space allocated for non-zero sparse array vector.\n"); } v->nz = 0; } return v; } case SPARSE_LIST: { v->data.splist = list_map(addr, last_addr, &(v->nz)); if (!v->data.splist && MATR_DEBUG_MODE) { fprintf (stderr, "warning: no space allocated for non-zero sparse list vector.\n"); } return v; } default: if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_map: 
unrecognized type.\n"); } return v; } } /*************************************************************************** *Copies a vector into a chunk of memory. to will not be completely identical * to from since pointer values will change, etc. Rather to is a contiguous * form of from in memory. to can be treated on return as a vector and * should be freed with vector_free as well as freeing the chunk of memory * it is part of. As with memmove this function does not actually "move" * anything out of from. * *INPUT: to: a block of memory with enough memory to hold the entire vector * stored in from. * from: the vector to be copied from. * *OUTPUT: A pointer to the first address AFTER the data was copied. In * other words this returns to + size(from) where size(from) is the size * in bytes of the full vector from. * *WARNINGS: * 1) to needs to be writable. This will CHANGE VALUES stored at to and * will seg fault if to is not writable. * 2) this does NOT CHECK FOR OVERFLOW. to must have enough memory * already to contain from or this can cause a seg fault. * 3) unlike with memmove, this is not a completely byte-by-byte copy. * instead, to is a copy of the vector from stored contiguously at to * with the same functionality as from. in other words, to can be * treated as a vector. * 4) you should call vector_free on to unless you are CERTAIN you * have not changed it. calling vector_free on an unchanged list * will not do anything. * 5) This was one of the last functions I added to the library and one of the * likeliest to cause memory errors and seg faults. I've done a good bit * of testing on it, but this function and memory-mapped objects in * general are the likeliest to break the library. I appologize. 
***************************************************************************/

void *vector_memmove(void *to, Vector *from) {
  void *ignored;   // swallows memmove's return value
  Vector *dst = (Vector *)to;

  // copy the vector header itself; the copy lives inside a mapped chunk
  *dst = *from;
  dst->was_mapped = 1;

  switch (from->type) {
  case NON_SPARSE:
    {
      // payload (if any) goes immediately after the header
      if (from->compact && from->data.nsarray.compact) {
	dst->data.nsarray.compact = (int *)(to + sizeof(Vector));
	ignored = memmove(to + sizeof(Vector), from->data.nsarray.compact,
			  sizeof(int)*from->dim);
	return (to + sizeof(Vector) + sizeof(int)*from->dim);
      }
      if (!(from->compact) && from->data.nsarray.precise) {
	dst->data.nsarray.precise = (double *)(to + sizeof(Vector));
	ignored = memmove(to + sizeof(Vector), from->data.nsarray.precise,
			  sizeof(double)*from->dim);
	return (to + sizeof(Vector) + sizeof(double)*from->dim);
      }
      // no data allocated: only the header was copied
      return to + sizeof(Vector);
    }
  case SPARSE_ARRAY:
    {
      if (from->data.sparray) {
	// layout: Vector header, then ExpandingArray header, then elements
	dst->data.sparray = to + sizeof(Vector);
	*((ExpandingArray *)dst->data.sparray) = *(from->data.sparray);
	dst->data.sparray->was_mapped = 1;
	if (from->compact && from->data.sparray->data.compact) {
	  ignored = memmove
	    (to + sizeof(Vector) + sizeof(ExpandingArray),
	     &(from->data.sparray->data.compact[from->data.sparray->first_elt]),
	     sizeof(CompactExpandingType)*(from->data.sparray->n_elts));
	  return (to + sizeof(Vector) + sizeof(ExpandingArray) +
		  sizeof(CompactExpandingType)*(from->data.sparray->n_elts));
	}
	if (!(from->compact) && from->data.sparray->data.precise) {
	  ignored = memmove
	    (to + sizeof(Vector) + sizeof(ExpandingArray),
	     &(from->data.sparray->data.precise[from->data.sparray->first_elt]),
	     sizeof(PreciseExpandingType)*(from->data.sparray->n_elts));
	  return (to + sizeof(Vector) + sizeof(ExpandingArray) +
		  sizeof(PreciseExpandingType)*(from->data.sparray->n_elts));
	}
	return to + sizeof(Vector) + sizeof(ExpandingArray);
      }
      return to + sizeof(Vector);
    }
  case SPARSE_LIST:
    {
      if (from->data.splist) {
	dst->data.splist = to + sizeof(Vector);
	// the list knows how to lay itself out contiguously
	return list_memmove(to + sizeof(Vector), from->data.splist);
      }
      return to + sizeof(Vector);
    }
  default:
    {
      if (MATR_DEBUG_MODE) {
	fprintf(stderr, "vector_memmove: unrecognized type.\n");
      }
      return NULL;
    }
  }
}

/***************************************************************************
 *The full size of the vector in bytes.  This is the size a binary file will
 *be or a chunk of contiguous memory needs to be to contain v.
 *
 *INPUT: v: vector from which to get the size.
 *
 *OUTPUT: The size in bytes of v.  If you want to copy v into contiguous
 * memory, for example, the following bit of code would work:
 *  copy = malloc(sizeof(v))
 *  vector_memmove(copy, v)
 ****************************************************************************/

size_t vector_size(Vector *v) {
  if (!v) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "warning: null vector in vector_size.\n");
    }
    return 0;
  }
  switch (v->type) {
  case NON_SPARSE:
    {
      // header plus one fixed-size slot per dimension, if data exists
      if (v->compact && v->data.nsarray.compact) {
	return sizeof(Vector) + v->dim*sizeof(int);
      }
      if (!(v->compact) && v->data.nsarray.precise) {
	return sizeof(Vector) + v->dim*sizeof(double);
      }
      return sizeof(Vector);
    }
  case SPARSE_ARRAY:
    {
      if (v->data.sparray) {
	// header plus array header plus one slot per stored element
	if (v->compact && v->data.sparray->data.compact) {
	  return sizeof(Vector) + sizeof(ExpandingArray) +
	    sizeof(CompactExpandingType)*v->data.sparray->n_elts;
	}
	if (!(v->compact) && v->data.sparray->data.precise) {
	  return sizeof(Vector) + sizeof(ExpandingArray) +
	    sizeof(PreciseExpandingType)*v->data.sparray->n_elts;
	}
	return sizeof(Vector) + sizeof(ExpandingArray);
      }
      return sizeof(Vector);
    }
  case SPARSE_LIST:
    {
      if (v->data.splist) {
	// header plus list header plus one node per non-zero element
	if (v->compact) {
	  return sizeof(Vector) + sizeof(SparseElementList) +
	    sizeof(CompactSparseNode)*v->nz;
	}
	return sizeof(Vector) + sizeof(SparseElementList) +
	  sizeof(PreciseSparseNode)*v->nz;
      }
      return sizeof(Vector);
    }
  default:
    {
      if (MATR_DEBUG_MODE) {
	fprintf(stderr, "vector_size: unrecognized type.\n");
      }
      return 0;
    }
  }
}

/*************************************************************************
 *Frees all memory associated with a vector
and the vector itself. * *INPUT: v: vector to free * *TIME: * NON_SPARSE: O(1) * SPARSE_ARRAY: O(1) * SPARSE_LIST: O(s) *************************************************************************/ void vector_free(Vector *v) { if (!v) { return; } switch (v->type) { case NON_SPARSE: { if (v->compact) { if (v->data.nsarray.compact && (!(v->was_mapped) || (void *)v + sizeof(Vector) != (void *)(v->data.nsarray.compact))) { free(v->data.nsarray.compact); } } else { if (v->data.nsarray.precise && (!(v->was_mapped) || (void *)v + sizeof(Vector) != (void *)(v->data.nsarray.precise))) { free(v->data.nsarray.precise); } } break; } case SPARSE_ARRAY: { if (v->was_mapped) { //if the array grew in size, it's possible that v was mapped //but the array was not expanding_array_free_data(v->data.sparray); } else { expanding_array_free(v->data.sparray); } break; } case SPARSE_LIST: { list_clear(v->data.splist); if (!(v->was_mapped)) { free(v->data.splist); } break; } default: { if (MATR_DEBUG_MODE) { fprintf(stderr, "vector_free: unrecognized type.\n"); } break; } } if (!v->was_mapped) { free(v); } } /************************************************************************** *The vector iterator class allows you to move along a vector without *knowing anything about its underlying data structure. This is very *much modeled after C++ iterators. * *All vector iterators are bi-directional. *************************************************************************/ //Static vector iterator function declarations static void vectorit_set(VectorIterator vit, double d, Vector *v); static void vectorit_insert_before(VectorIterator *vit, unsigned int c, double d, Vector *v); static void vectorit_insert_after(VectorIterator *vit, unsigned int c, double d, Vector *v); //"private" function to set an element of a vector using an iterator //this is private because where the iterator will point if d = 0 is //data structure dependent. 
to set an element of a vector, you should //use vectorit_zero_elt or vectorit_insert. static void vectorit_set(VectorIterator vit, double d, Vector *v) { SparseNode n; if (!v) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } switch (v->type) { case SPARSE_LIST: { if (!v->data.splist) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } n.is_compact = v->compact; n.precise = vit.pcurr; n.compact = vit.ccurr; if (!null_node(n)) { node_set_data(n, d); if (fabs(d) < SVM_EPSILON) { //get rid of this value list_remove_elt(v->data.splist, n); v->nz--; } } break; } case SPARSE_ARRAY: { if (!v->data.sparray) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } if (vit.nscurr >= v->data.sparray->first_elt && vit.nscurr <= v->data.sparray->last_elt) { if (v->compact && v->data.sparray->data.compact) { v->data.sparray->data.compact[vit.nscurr].s.data = (int)d; } else if (!(v->compact) && v->data.sparray->data.precise) { v->data.sparray->data.precise[vit.nscurr].s.data = d; } else { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } if (fabs(d) < SVM_EPSILON) { expanding_array_remove_elt(vit.nscurr - v->data.sparray->first_elt, v->data.sparray); v->nz--; } } break; } case NON_SPARSE: { if (vit.nscurr >= 0 && vit.nscurr < v->dim) { if (v->compact) { if (!(v->data.nsarray.compact)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } v->data.nsarray.compact[vit.nscurr] = (int)d; } else { if (!(v->data.nsarray.precise)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } v->data.nsarray.precise[vit.nscurr] = d; } } break; } default: if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set: unrecognized type.\n"); } return; } } /************************************************************************* *Set the element vit points to to zero. 
vit will then point to the *NEXT element or past_end. Note that if v is NON_SPARSE, this STILL *moves vit so as to have the same behavior with every data structure. * *INPUT: vit: the iterator * v: the vector vit is traversing * *TIME: * NON_SPARSE: O(1) * SPARSE_ARRAY: O(s) * SPARSE_LIST: O(1) * *WARNINGS: *1) Regardless of data type, vit points to the NEXT ELEMENT after the * zero'd one! *************************************************************************/ void vectorit_zero_elt(VectorIterator *vit, Vector *v) { VectorIterator tmpit; unsigned int currcol = vectorit_curr_col(*vit, v); if (!v || !vit) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_zero_elt: null arguments.\n"); } if (vit) { vit->nscurr = -1; } return; } vectorit_copy(*vit, &tmpit); vectorit_next(&tmpit, v); vectorit_set(*vit, 0, v); //we need to wind up pointing to the next element //depending on how the delete is done //this is either the same element we were pointing at (prev(tmpit)) //or the next element (tmpit) //so we check both vectorit_copy(tmpit, vit); vectorit_prev(&tmpit, v); if ((!vectorit_past_beg(tmpit, v) && vectorit_curr_col(tmpit, v) > currcol)) { vectorit_copy(tmpit, vit); } } /************************************************************************* *Set the column of the element vit points to. NOT IMPLEMENTED for *NON_SPARSE vectors. * *INPUT: vit: the iterator * c: the new column value * v: the vector vit is traversing * *TIME: * NON_SPARSE: NOT IMPLEMENTED * SPARSE_ARRAY: O(1) * SPARSE_LIST: O(1) * *WARNINGS: *1) This does NOT move elements in v around. If changing the column * number would mess up the order of the elements, then this prints * an error message and DOES NOT DO IT. So check first. 
*************************************************************************/ inline void vectorit_set_col(VectorIterator vit, unsigned int c, Vector *v) { SparseNode n; if (!v) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set_col: null vector.\n"); } return; } switch (v->type) { case NON_SPARSE: { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set_col: not implemented for non-sparse matrices.\n"); } return; } case SPARSE_LIST: { n.is_compact = v->compact; n.precise = vit.pcurr; n.compact = vit.ccurr; if (null_node(n)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set_col: Attempt to set uninitiated iterator.\n"); } return; } if (!null_node(prev_node(n)) && node_col(prev_node(n)) >= c) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set_col: invalid column number in list.\n"); } return; } if (!null_node(next_node(n)) && node_col(next_node(n)) <= c) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set_col: invalid column number in list.\n"); } return; } node_set_col(n, c); break; } case SPARSE_ARRAY: { if (!v->data.sparray || (v->compact && !(v->data.sparray->data.compact)) || (!(v->compact) && !(v->data.sparray->data.precise))) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set_col: null vector.\n"); } return; } if (vit.nscurr < v->data.sparray->first_elt || vit.nscurr > v->data.sparray->last_elt) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set_col: Attempt to set uninitiated iterator.\n"); } return; } if ((vit.nscurr-1 >= v->data.sparray->first_elt && ((v->compact && v->data.sparray->data.compact[vit.nscurr-1].s.col >= c) || (!v->compact && v->data.sparray->data.precise[vit.nscurr-1].s.col >= c))) || (vit.nscurr+1 <= v->data.sparray->last_elt && ((v->compact && v->data.sparray->data.compact[vit.nscurr+1].s.col <= c) || (!v->compact && v->data.sparray->data.precise[vit.nscurr+1].s.col <= c)))) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set_col: invalid column number in array at space %ld column number is %u and it is ", vit.nscurr, 
c); if (vit.nscurr - 1 >= v->data.sparray->first_elt) { if ((v->compact && v->data.sparray->data.compact[vit.nscurr-1].s.col >= c)) { fprintf(stderr, " less than space %ld with the column number %u before it.\n", vit.nscurr-1, v->data.sparray->data.compact[vit.nscurr-1] .s.col); } if (!(v->compact) && v->data.sparray->data.precise[vit.nscurr-1].s.col >= c) { fprintf(stderr, " less than space %ld with the column number %u before it.\n", vit.nscurr-1, v->data.sparray->data.precise[vit.nscurr-1].s.col); } } if (vit.nscurr + 1 <= v->data.sparray->last_elt) { if ((v->compact && v->data.sparray->data.compact[vit.nscurr+1].s.col <= c)) { fprintf(stderr, " less than the column number %u before it.\n", v->data.sparray->data.compact[vit.nscurr+1].s.col); } if (!(v->compact) && v->data.sparray->data.precise[vit.nscurr+1].s.col <= c) { fprintf(stderr, " greater than the column number %u before it.\n", v->data.sparray->data.precise[vit.nscurr+1].s.col); } } } return; } if (v->compact) { v->data.sparray->data.compact[vit.nscurr].s.col = c; } else { v->data.sparray->data.precise[vit.nscurr].s.col = c; } break; } default: if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_set_col: unrecognized type.\n"); } return; } } /************************************************************************* *Insert a new value or set an old value of v using the iterator. * *INPUT: vit: an iterator that serves as an initial guess as to where * the value should be inserted. * c: the column * d: the data * v: the vector vit is traversing * *OUTPUT: vit will point to the element that has been inserted or the NEXT * element if d = 0 and the vector is SPARSE. If you insert a zero element * into an already 0 spot, vit will point to EITHER the PREVIOUS or the * NEXT element if the vector is SPARSE. 
* *TIME: * NON_SPARSE: O(1) * SPARSE_ARRAY: * Generally: O(s) * Setting an element, no initial guess: amortized O(lg(s)) * Setting first/last element or good initial guess: O(1) * Inserting at front or end of vector: amortized O(1) * SPARSE_LIST: * Generally: O(s) * Good initial guess, inserting or setting first/last element: O(1) * *WARNINGS: *1) If d = 0, the iterator will point to the NEXT ELEMENT only if the * vector is SPARSE! *2) Where the vector points if d = 0 and that spot was ALREADY 0 and the * vector is SPARSE is undetermined - it will either be BEFORE or AFTER * the spot you tried to insert. This is a weird case. *************************************************************************/ void vectorit_insert(VectorIterator *vit, unsigned int c, double d, Vector *v) { if (!v || !vit) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null arguments.\n"); } if (vit) { vit->nscurr = -1; } return; } if (c < 0 || c >= v->dim) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: invalid column number %u.\n", c); } return; } if (v->type == NON_SPARSE) { //we have constant time access to elements if (v->compact) { if (!(v->data.nsarray.compact)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } v->data.nsarray.compact[c] = d; } else { if (!(v->data.nsarray.precise)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } v->data.nsarray.precise[c] = d; } vit->nscurr = c; return; } vectorit_find(vit, c, v); if (!vectorit_past_end(*vit, v) && !vectorit_past_beg(*vit, v) && (vectorit_curr_col(*vit, v) == c)) { if (fabs(d) < SVM_EPSILON && vectorit_curr_col(*vit, v) == c) { if (MATR_DEBUG_MODE >= MATR_OPS_MORE) { fprintf(stderr, "zeroing column %u type = %d\n", vectorit_curr_col(*vit, v), v->type); } vectorit_zero_elt(vit, v); return; } } if (vectorit_curr_col(*vit, v) == c) { if (MATR_DEBUG_MODE >= MATR_OPS_MORE) { fprintf(stderr, "setting column %u to be %f type = %d\n", 
vectorit_curr_col(*vit, v), d, v->type); } vectorit_set(*vit, d, v); } else if (fabs(d) > SVM_EPSILON) { if (vectorit_past_beg(*vit, v) || vectorit_curr_col(*vit, v) < c) { if (MATR_DEBUG_MODE >= MATR_OPS_MORE) { if (vectorit_past_beg(*vit, v)) { fprintf(stderr, "inserting %lf in first position.\n", d); } else { fprintf(stderr, "inserting %lf after column %u type = %d\n", d, vectorit_curr_col(*vit, v), v->type); } } vectorit_insert_after(vit, c, d, v); } else { if (MATR_DEBUG_MODE >= MATR_OPS_MORE) { fprintf(stderr, "inserting %lf before column %u type = %d\n", d, vectorit_curr_col(*vit, v), v->type); } vectorit_insert_before(vit, c, d, v); } } } /************************************************************************* *Find a column in v. * *INPUT: vit: an iterator that serves as an initial guess as to where * the column is in the data structure. * c: the column * v: the vector vit is traversing * *OUTPUT: vit will point to the element with column number c if it exists. * If it does NOT exist, vit will point to either the last element with * a column number less than c or the first element with a column number * greater than c. If c is less than all column numbers in the vector, * vit may be past_beg. Similarly, if c is greater than all column numbers, * vit may be past_end. 
* *TIME: * NON_SPARSE: O(1) * SPARSE_ARRAY: * Generally: O(lg(s)) * Before/after/equals first/last element or good initial guess: O(1) * SPARSE_LIST: * Generally: O(s) * Before/after/equals first/last element or good initial guess: O(1) *************************************************************************/ void vectorit_find(VectorIterator *vit, unsigned int c, Vector *v) { SparseNode n; if (!v || !vit) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_find: null arguments.\n"); } if (vit) { vit->nscurr = -1; } return; } switch (v->type) { case NON_SPARSE: { vit->nscurr = c; break; } case SPARSE_ARRAY: { if (!v->data.sparray) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_find: null vector.\n"); } return; } vit->nscurr = expanding_array_search(c, vit->nscurr - v->data.sparray->first_elt, v->data.sparray) + v->data.sparray->first_elt; break; } case SPARSE_LIST: { if (!v->data.splist) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_find: null vector.\n"); } return; } n.is_compact = v->compact; n.compact = vit->ccurr; n.precise = vit->pcurr; n = list_search(c, n, v->data.splist); if (null_node(n)) { if (null_node((v->data.splist->head)) || c < node_col((v->data.splist->head))) { n = v->data.splist->head; } else { n = v->data.splist->tail; } } if (v->compact) { vit->ccurr = n.compact; } else { vit->pcurr = n.precise; } break; } default: { vit->nscurr = -1; if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_find: unrecognized type.\n"); } return; } } } //always call vectorit_insert NOT these! 
//these are for the use of vectorit_insert //they assume that c fits correctly before/after vit //and they do not check for it //if you are correctly inserting before or after the iterator passed //to insert, it will recognize that and call these in O(1) time //these functions are "private" static void vectorit_insert_before(VectorIterator *vit, unsigned int c, double d, Vector *v) { PreciseSparseElement pnewelt; PreciseExpandingType pet; CompactSparseElement cnewelt; CompactExpandingType cet; ExpandingType ne; SparseElement newelt; SparseNode n; if (!v || !vit) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null arguments.\n"); } if (vit) { vit->nscurr = -1; } return; } if (v->compact) { cnewelt.data = (int)d; cnewelt.col = c; cet.s = cnewelt; ne.compact = &cet; newelt.compact = &cnewelt; } else { pnewelt.data = d; pnewelt.col = c; pet.s = pnewelt; ne.precise = &pet; newelt.precise = &pnewelt; } if (c >= v->dim || c < 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert_before: bad index.\n"); } return; } switch (v->type) { case NON_SPARSE: { vit->nscurr = c; if (v->compact) { if (!(v->data.nsarray.compact)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } v->data.nsarray.compact[c] = (int)d; } else { if (!(v->data.nsarray.precise)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } v->data.nsarray.precise[c] = d; } break; } case SPARSE_ARRAY: { if (!v->data.sparray) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } if (vit->nscurr > v->data.sparray->last_elt && v->data.sparray->n_elts) { vectorit_set_at_end(vit, v); vectorit_insert_after(vit, c, d, v); return; } vit->nscurr -= v->data.sparray->first_elt; vit->nscurr = expanding_array_insert_before(ne, vit->nscurr, v->data.sparray); v->nz++; break; } case SPARSE_LIST: { if (!v->data.splist) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } 
if (!list_is_empty(v->data.splist) && c > node_col(v->data.splist->tail)) { //this is easier vectorit_set_at_end(vit, v); vectorit_insert_after(vit, c, d, v); return; } n.is_compact = v->compact; n.compact = vit->ccurr; n.precise = vit->pcurr; if (null_node(n)) { n = (v->data.splist->head); } n = list_insert_before(newelt, n, v->data.splist); if (v->compact) { vit->ccurr = n.compact; } else { vit->pcurr = n.precise; } v->nz++; break; } default: vit->nscurr = -1; if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert_before: unrecognized type.\n"); } break; } } static void vectorit_insert_after(VectorIterator *vit, unsigned int c, double d, Vector *v) { PreciseSparseElement pnewelt; PreciseExpandingType pet; CompactSparseElement cnewelt; CompactExpandingType cet; ExpandingType et; SparseElement newelt; SparseNode n; if (!v || !vit) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null arguments.\n"); } if (vit) { vit->nscurr = -1; } return; } if (v->compact) { cnewelt.data = (int)d; cnewelt.col = c; cet.s = cnewelt; et.compact = &cet; newelt.compact = &cnewelt; } else { pnewelt.data = d; pnewelt.col = c; pet.s = pnewelt; et.precise = &pet; newelt.precise = &pnewelt; } if (c >= v->dim || c < 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert_after: bad index.\n"); } return; } switch (v->type) { case NON_SPARSE: { vit->nscurr = c; if (v->compact) { if (!(v->data.nsarray.compact)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } v->data.nsarray.compact[c] = (int)d; } else { if (!(v->data.nsarray.precise)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } v->data.nsarray.precise[c] = d; } break; } case SPARSE_ARRAY: { if (!v->data.sparray) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } if (vit->nscurr < v->data.sparray->first_elt && v->data.sparray->n_elts) { vectorit_set_at_beg(vit, v); vectorit_insert_before(vit, c, d, v); 
return; } vit->nscurr -= v->data.sparray->first_elt; vit->nscurr = expanding_array_insert_after(et, vit->nscurr, v->data.sparray); v->nz++; break; } case SPARSE_LIST: { if (!v->data.splist) { if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert: null vector.\n"); } return; } if (!list_is_empty(v->data.splist) && c < node_col(v->data.splist->head)) { //this is easier vectorit_set_at_beg(vit, v); vectorit_insert_before(vit, c, d, v); return; } n.is_compact = v->compact; n.compact = vit->ccurr; n.precise = vit->pcurr; if (null_node(n)) { n = (v->data.splist->tail); } n = list_insert_after(newelt, n, v->data.splist); if (v->compact) { vit->ccurr = n.compact; } else { vit->pcurr = n.precise; } v->nz++; break; } default: { vit->nscurr = -1; if (MATR_DEBUG_MODE) { fprintf(stderr, "vectorit_insert_after: unrecognized type.\n"); } return; } } } //This function SHOULD NOT BE USED //for anything but checking answers //If you want to multiply matrices together, implement somthing smarter //than this. void matr_multiply(Matrix *M1, Matrix *M2, Matrix *ret) { unsigned int i, j, k; double s; if (M1->rows < ret->rows) { fprintf(stderr, "matr_multiply: Attempt to multiply more rows than matrix has.\n"); return; } if (M2->cols < ret->cols) { fprintf(stderr, "matr_multiply: Attempt to multiply more columns than matrix has.\n"); return; } if (M1->cols != M2->rows) { fprintf(stderr, "matr_multiply: Mismatch in inner dimensions.\n"); return; } for (i = 0; i < ret->rows; i++) { for (j = 0; j < ret->cols; j++) { s = 0; for (k = 0; k < M1->cols; k++) { s += matr_get(M1, i, k)*matr_get(M2, k, j); } matr_set(ret, i, j, s); } } } crm114-20100106-BlameMichelson.src/cssutil.c0000644000000000017500000003275611321154266016644 0ustar rootwsy// cssutil.c - utility for munging css files, version X0.1 // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
// include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" char version[] = "1.2"; void helptext () { fprintf (stdout, "cssutil version %s - generic css file utility.\n" "Usage: cssutil [options]... css-file\n" " -b - brief; print only summary\n" " -h - print this help\n" " -q - quite mode; no warning messages\n" " -r - report then exit (no menu)\n" " -s css-size - if no css file found, create new\n" " one with this many buckets.\n" " -S css-size - same as -s, but round up to next\n" " 2^n + 1 boundary.\n" " -v - print version and exit\n" " -D - dump css file to stdout in CSV format.\n" " -R csv-file - create and restore css from CSV\n", VERSION); } int main (int argc, char **argv) { long i, k; // some random counters, when we need a loop long v; long sparse_spectrum_file_length = DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH; long user_set_css_length = 0; long hfsize; long long sum; // sum of the hits... can be _big_. 
// int hfd; int brief = 0, quiet = 0, dump = 0, restore = 0; int opt, fields; int report_only = 0; long *bcounts; long maxchain; long curchain; long totchain; long fbuckets; long nchains; long zvbins; long ofbins; long histbins; // how many bins for the histogram char cmdstr[255]; char cssfile[255]; char csvfile[255]; unsigned char cmdchr[2]; char crapchr[2]; float cmdval; int zloop, cmdloop; long learns_index, features_index; long docs_learned = -1; long features_learned = -1; // the following for crm114.h's happiness char *newinputbuf; newinputbuf = (char *) &hfsize; histbins = FEATUREBUCKET_VALUE_MAX; if (histbins > FEATUREBUCKET_HISTOGRAM_MAX) histbins = FEATUREBUCKET_HISTOGRAM_MAX; bcounts = malloc (sizeof (unsigned long) * (histbins + 2) ); { struct stat statbuf; // filestat buffer FEATUREBUCKET_STRUCT *hashes; // the text of the hash file // parse cmdline options while ((opt = getopt (argc, argv, "bDhR:rqs:S:v")) != -1) { switch (opt) { case 'b': brief = 1; // brief, no 'bin value ...' lines break; case 'D': dump = 1; // dump css file, no cmd menu break; case 'q': quiet = 1; // quiet mode, no warning messages break; case 'R': { FILE *f; unsigned long key, hash, value; // count lines to determine number of buckets and check CSV format if ((f = fopen (optarg, "rb")) != NULL) { sparse_spectrum_file_length = 0; while (!feof (f)) if (fscanf (f, "%lu;%lu;%lu\n", &key, &hash, &value) == 3) sparse_spectrum_file_length++; else { fprintf (stderr, "\n %s is not in the right CSV format.\n", optarg); exit (EXIT_FAILURE); } fclose (f); strcpy (csvfile, optarg); } else { fprintf (stderr, "\n Couldn't open csv file %s; errno=%d.\n", optarg, errno); exit (EXIT_FAILURE); } } restore = 1; // restore css file, no cmd menu break; case 'r': report_only = 1; // print stats only, no cmd menu. 
break; case 's': // set css size to option value case 'S': // same as above but round up to next 2^n+1 if (sscanf (optarg, "%ld", &sparse_spectrum_file_length)) { if (!quiet) fprintf (stderr, "\nOverride css creation length to %ld\n", sparse_spectrum_file_length); user_set_css_length = 1; } else { fprintf (stderr, "On -%c flag: Missing or incomprehensible number of buckets.\n", opt); exit (EXIT_FAILURE); } if (opt == 'S') // round up to next 2^n+1 { int k; k = (long) floor (log10 (sparse_spectrum_file_length - 1) / log10 (2.0)); while ((2 << k) + 1 < sparse_spectrum_file_length) k++; sparse_spectrum_file_length = (2 << k) + 1; user_set_css_length = 1; } break; case 'v': fprintf (stderr, " This is cssutil, version %s\n", version); fprintf (stderr, " Copyright 2001-2006 W.S.Yerazunis.\n"); fprintf (stderr, " This software is licensed under the GPL with ABSOLUTELY NO WARRANTY\n"); exit (EXIT_SUCCESS); default: helptext (); exit (EXIT_SUCCESS); break; } } if (optind < argc) strncpy (cssfile, argv[optind], sizeof (cssfile)); else { helptext (); exit (EXIT_SUCCESS); } // and stat it to get it's length k = stat (cssfile, &statbuf); // quick check- does the file even exist? if (k == 0) { if (restore) { fprintf (stderr, "\n.CSS file %s exists! Restore operation aborted.\n", cssfile); exit (EXIT_FAILURE); } hfsize = statbuf.st_size; if (!quiet && user_set_css_length) fprintf (stderr, "\n.CSS file %s exists; -s, -S options ignored.\n", cssfile); } else { // file didn't exist... 
create it if (!quiet && !restore) fprintf (stdout, "\nHad to create .CSS file %s\n", cssfile); if (crm_create_cssfile (cssfile, sparse_spectrum_file_length, 0, 0, 0) != EXIT_SUCCESS) exit (EXIT_FAILURE); k = stat (cssfile, &statbuf); hfsize = statbuf.st_size; } // // mmap the hash file into memory so we can bitwhack it hashes = (FEATUREBUCKET_STRUCT *) crm_mmap_file (cssfile, 0, hfsize, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if (hashes == MAP_FAILED) { fprintf (stderr, "\n Couldn't open RW file %s; errno=%d .\n", cssfile, errno); exit (EXIT_FAILURE); } // from now on, hfsize is buckets, not bytes. hfsize = statbuf.st_size / sizeof (FEATUREBUCKET_STRUCT); #ifdef OSB_LEARNCOUNTS // If LEARNCOUNTS is enabled, we normalize with documents-learned. // // We use the reserved h2 == 0 setup for the learncount. // { char* litf = "Learnings in this file"; char* fitf = "Features in this file"; unsigned int hcode, h1, h2; // hcode = strnhash (litf, strlen ( litf )); h1 = hcode % hfsize; h2 = 0; if (hashes[h1].hash != hcode) { // initialize the file? if (hashes[h1].hash == 0 && hashes[h1].key == 0) { hashes[h1].hash = hcode; hashes[h1].key = 0; hashes[h1].value = 1; learns_index = h1; } else { //fatalerror (" This file should have learncounts, but doesn't!", // " The slot is busy, too. It's hosed. Time to die."); //goto regcomp_failed; fprintf (stderr, "\n Minor Caution - this file has the learncount slot in use.\n This is not a problem for Markovian classification, but it will have some\n issues with an OSB classfier.\n"); }; } // fprintf (stderr, "This file has had %ld documents learned!\n", // hashes[h1].value); docs_learned = hashes[h1].value; hcode = strnhash (fitf, strlen ( fitf )); h1 = hcode % hfsize; h2 = 0; if (hashes[h1].hash != hcode) { // initialize the file? 
if (hashes[h1].hash == 0 && hashes[h1].key == 0) { hashes[h1].hash = hcode; hashes[h1].key = 0; hashes[h1].value = 1; features_index = h1; } else { //fatalerror (" This file should have learncounts, but doesn't!", // " The slot is busy, too. It's hosed. Time to die."); //goto regcomp_failed ; fprintf (stderr, "\n Minor Caution - this file has the featurecount slot in use.\n This is not a problem for Markovian classification, but it will have some\n issues with an OSB classfier.\n"); }; } //fprintf (stderr, "This file has had %ld features learned!\n", // hashes[h1].value); features_learned = hashes[h1].value; }; #endif // OSB_LEARNCOUNTS if (dump) { // dump the css file for (i = 0; i < hfsize; i++) { printf ("%u;%u;%u\n", hashes[i].key, hashes[i].hash, hashes[i].value); } } if (restore) { FILE *f; // restore the css file - note that if we DIDN'T create // it already, then this will fail. // if ((f = fopen (csvfile, "rb")) == NULL) { fprintf (stderr, "\n Couldn't open csv file %s; errno=%d.\n", csvfile, errno); exit (EXIT_FAILURE); } for (i = 0; i < hfsize; i++) { dontcare = fscanf (f, "%u;%u;%u\n", &(hashes[i].key), &(hashes[i].hash), &(hashes[i].value)); } fclose (f); } zloop = 1; while (zloop == 1 && !restore && !dump) { zloop = 0; //crm_packcss (hashes, hfsize, 1, hfsize-1); sum = 0; maxchain = 0; curchain = 0; totchain = 0; fbuckets = 0; nchains = 0; zvbins = 0; ofbins = 0; // calculate maximum overflow chain length for (i = 1; i < hfsize; i++) { if (hashes[i].key != 0) { // only count the non-special buckets for feature count sum = sum + hashes[i].value; // fbuckets++; curchain++; if (hashes[i].value == 0) zvbins++; if (hashes[i].value >= FEATUREBUCKET_VALUE_MAX) ofbins++; } else { if (curchain > 0) { nchains++; totchain += curchain; if (curchain > maxchain) maxchain = curchain; curchain = 0; } } } fprintf (stdout, "\n Sparse spectra file %s statistics: \n", cssfile); fprintf (stdout, "\n Total available buckets : %12ld ", hfsize); fprintf (stdout, "\n Total 
buckets in use : %12ld ", fbuckets); fprintf (stdout, "\n Total in-use zero-count buckets : %12ld ", zvbins); fprintf (stdout, "\n Total buckets with value >= max : %12ld ", ofbins); fprintf (stdout, "\n Total hashed datums in file : %12lld", sum); fprintf (stdout, "\n Documents learned : %12ld ", docs_learned); fprintf (stdout, "\n Features learned : %12ld ", features_learned); fprintf (stdout, "\n Average datums per bucket : %12.2f", (fbuckets > 0) ? (sum * 1.0) / (fbuckets * 1.0) : 0); fprintf (stdout, "\n Maximum length of overflow chain : %12ld ", maxchain); fprintf (stdout, "\n Average length of overflow chain : %12.2f ", (nchains > 0) ? (totchain * 1.0) / (nchains * 1.0) : 0 ); fprintf (stdout, "\n Average packing density : %12.2f\n", (fbuckets * 1.0) / (hfsize * 1.0)); // set up histograms for (i = 0; i < histbins; i++) bcounts[i] = 0; for (v = 1; v < hfsize; v++) { if (hashes[v].value < histbins) { bcounts[hashes[v].value]++; } else { bcounts[histbins]++; // note that bcounts is len(histbins+2) } } if (!brief) for (i = 0; i < histbins; i++) { if (bcounts[i] > 0) { if (i < histbins) { fprintf (stdout, "\n bin value %8ld found %9ld times", i, bcounts[i]); } else { fprintf (stdout, "\n bin value %8ld or more found %9ld times", i, bcounts[i]); } } } fprintf (stdout, "\n"); cmdloop = 1; while (!report_only && cmdloop) { // clear command buffer cmdchr[0] = '\0'; fprintf (stdout, "Options:\n"); fprintf (stdout, " Z n - zero bins at or below a value\n"); fprintf (stdout, " S n - subtract a constant from all bins\n"); fprintf (stdout, " D n - divide all bins by a constant\n"); fprintf (stdout, " R - rescan\n"); fprintf (stdout, " P - pack\n"); fprintf (stdout, " Q - quit\n"); fprintf (stdout, ">>> "); clearerr (stdin); dontcare = fscanf (stdin, "%[^\n]", cmdstr); dontcare = fscanf (stdin, "%c", crapchr); fields = sscanf (cmdstr, "%s %f", cmdchr, &cmdval); if (strlen ( (char *) cmdchr) != 1) { fprintf (stdout, "Unknown command: %s\n", cmdchr); continue; } switch 
(tolower ((int)cmdchr[0])) { case 'z': if (fields != 2) fprintf (stdout, "Z command requires a numeric argument!\n"); else { fprintf (stdout, "Working..."); for (i = 1; i < hfsize; i++) if (hashes[i].value <= cmdval) hashes[i].value = 0; fprintf (stdout, "done.\n"); } break; case 's': if (fields != 2) fprintf (stdout, "S command requires a numeric argument!\n"); else { fprintf (stdout, "Working..."); for (i = 1; i < hfsize; i++) { if (hashes[i].value > (int) cmdval) { hashes[i].value = hashes[i].value - cmdval; } else { hashes[i].value = 0; } } fprintf (stdout, "done.\n"); } break; case 'd': if (fields != 2) fprintf (stdout, "D command requires a numeric argument!\n"); else if (cmdval == 0) fprintf (stdout, "You can't divide by zero, nimrod!\n"); else { fprintf (stdout, "Working..."); for (i = 1; i < hfsize; i++) hashes[i].value = hashes[i].value / cmdval; fprintf (stdout, "done.\n"); } break; case 'r': zloop = 1; cmdloop = 0; break; case 'p': fprintf (stdout, "Working..."); crm_packcss (hashes, NULL, hfsize, 1, hfsize - 1); zloop = 1; cmdloop = 0; break; case 'q': fprintf (stdout, "Bye! \n"); cmdloop = 0; break; default: fprintf (stdout, "Unknown command: %c\n", cmdchr[0]); break; } } } crm_munmap_file ((void *) hashes); } return 0; } crm114-20100106-BlameMichelson.src/paolo_ov3.crm0000755000000000017500000000430011321154266017401 0ustar rootwsy#! /usr/bin/crm # # paolo_ov3.crm - paolo test script # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. 
window alter (:_dw:) /Return-path: \nEnvelope-to: \nDelivery-date:\nReceived: from\n by paranoia with esmtp\n id 1E24Zm-0008Pl-Bq\nReceived: from\nX-Identity-Key: id1\nDate: Mon\nFrom: International Medical Corporation\nX-Accept-Language: en-us, en\nMIME-Version: 1.0\nTo: bengen@vdst-ka.inka.de\nSubject: You have some free time?\nContent-Type: multipart\/related; boundary="------------000708090009050006030006"\nMessage-Id: \nX-CRM114-Version: 20050415.BlameTheIRS\nX-CRM114-Status: SPAM\nContent-Length: 21549\n\nThis is a multi-part message in MIME format.\n--------------000708090009050006030006\nContent-Type: text\/html; charset=us-ascii\nContent-Transfer-Encoding: 7bit\n\n<\/head>

<\/a><\/p>

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxx<\/font><\/p>

wwwwwwwwwww? xxxxx<\/font><\/p><\/body><\/html>\n\n--------------000708090009050006030006\nContent-Type: image\/gif;\n name="clonic.GIF"\nContent-Transfer-Encoding: base64\nContent-ID: \nContent-Disposition: inline;\n filename="clonic.GIF"\n\noooooooooooooooooooooooo\/\/\/\/\/oooooooooooooooooooooooooooooo\/oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo\nooooooooooooooooooooooooooooo\/oooooooooooooooooooooooooooooooooooooooooooooooooo\n...more garbage follows...\n\n--------------000708090009050006030006--\n\n\n/ { match (:: :headers: :body:) /(.*?)\n\n(.*)/ } isolate (:headers:) { match [:headers:] /^Content-Type: .* boundary="(.+)"/ \ (:x: :boundary:) output /:boundary:=:*:boundary:\n/ } isolate (:c:) /0/ { eval (:c:) /:@::*:c:+1:/ match [:body:] (:x: :headers:) /\n--:*:boundary:\n(.+?)\n\n/ output /:c:=:*:c:\n:*:headers:\n\n/ liaf } crm114-20100106-BlameMichelson.src/alternating_example_neural.crm0000755000000000017500000000432211321154266023075 0ustar rootwsy#! /usr/bin/crm # # alternating_example_neural.crm - Alternating Example Neural # Network Classifier training test # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window { output /**** Alternating Example Neural Network Classifier TRAINING\n/ # load the files in isolate (:Macbeth: :Alice:) input (:Macbeth:) [ Macbeth_Act_IV.txt 0 16000] input (:Alice:) [ Alice_In_Wonderland_Chap_1_And_2.txt 0 16000] # Now loop. isolate (:loopcontrol:) // isolate (:loopcounter:) /0/ { eval (:loopcounter:) / :@: :*:loopcounter: + 1 : / # output /Top of loop at :*:loopcounter: \n/ match [:loopcontrol:] /./ { { # Grab a good chunk of Macbeth... match (:onep:) /(....){255}.*?\n/ [:Macbeth:] match [:onep:] /.../ learn [:onep:] < neural unigram append> (m_test.css) learn [:onep:] < neural unigram refute append> (a_test.css) } alius # Set done mark { alter (:loopcontrol:) /X/ } } { { # Grab a good chunk of Alice... 
match (:twop:) /(....){255}.*?\n/ [:Alice:] match [:twop:] /.../ learn [:twop:] < neural unigram append> (a_test.css) learn [:twop:] < neural unigram refute append> (m_test.css) } alius # reset to start of Macbeth file. { alter (:loopcontrol:) /X/ } } liaf } # Now run one fromstart loop on each of the files learn [:_nl:] (m_test.css) learn [:_nl:] (a_test.css) } output /\n**** Alternating Example Neural Network Classifier RUNNING TEST\n/ isolate (:s:) isolate (:filetxt:) // { input (:filetxt:) [ Alice_In_Wonderland_Chap_1_And_2.txt 16000 4096 ] match (:t1:) [:filetxt:] /(....){255}.*?\n\n/ { classify < neural unigram> ( m_test.css | a_test.css ) (:s:)/[[:graph:]]+/ [:t1:] output / type M \n:*:s:\n/ } alius { output / type A \n:*:s:\n/ } } { isolate (:t2:) // input (:filetxt:) [ Macbeth_Act_IV.txt 16000 4096 ] match (:t2:) [:filetxt:] /(....){255}.*?\n/ { classify < neural unigram > ( m_test.css | a_test.css ) (:s:) /[[:graph:]]+/ [:t2:] output / type M \n:*:s:\n/ } alius { output / type A \n:*:s:\n/ } } crm114-20100106-BlameMichelson.src/crm_svm_matrix_util.h0000644000000000017500000002271711321154266021246 0ustar rootwsy// crm_svm_matrix_util.h - Support Vector Machine //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
#ifndef __CRM_SVM_MATR_UTIL__H #define __CRM_SVM_MATR_UTIL__H //the libraries we'll want everywhere #include #include #include #include #include /********************************************************************** *A utility library that defines some data structures, specifically an *expanding array and a linked list. This library can be used with *the matrix library or on its own. The expanding array is general, but *the linked list would need the union expanded to be general. * *The functions in util.c are commented. See them for details. *********************************************************************/ #define SVM_EPSILON 1e-10 //approximation to 0 //using doubles, 1e-10 works well enough #define SVM_INF 1.0/SVM_EPSILON //likewise, an approximation to infinity #define MAX_INT_VAL 4294967295 //2^32-1 #define SVM_MAX_ALLOC_SIZE 4e8 //Maximum amount of memory to allocate in //one chunk (used for counting sort) //these are debug settings //the higher the setting, the more output you get //these are cummulative so that level QP_DEBUG prints everything //from level SOLVER_DEBUG and QP_DEBUG #define MATR_DEBUG 1 #define MATR_OPS 6 //information about the matrix operations #define MATR_OPS_MORE 7 //even more information about the matrix //operations (ie printing out intermediate //results for vector_add and dot, etc) #ifdef DO_INLINES #define MY_INLINE __attribute__((always_inline)) static inline #else #define MY_INLINE static inline #endif int MATR_DEBUG_MODE; //the debug value //for SVM, if internal trace is on //MATR_DEBUG_MODE = SVM_INTERNAL_TRACE_LEVEL //for PCA, if internal trace is on //MATR_DEBUG_MODE = PCA_INTERNAL_TRACE_LEVEL //Sparse elements typedef struct { unsigned int col; double data; //it's worth it to use doubles for higher precision. really. } PreciseSparseElement; typedef struct { unsigned int col; int data; //small data! 
} CompactSparseElement; typedef union { PreciseSparseElement *precise; CompactSparseElement *compact; } SparseElement; //Types that can go in an expanding array typedef union { int i; long l; float f; double d; PreciseSparseElement s; } PreciseExpandingType; typedef union { unsigned int i; CompactSparseElement s; } CompactExpandingType; typedef union { PreciseExpandingType *precise; CompactExpandingType *compact; } ExpandingType; //Expanding array struct typedef struct { ExpandingType data; //Actual data int length, //Current size of the array last_elt, //Location of the last data first_elt, //Location of the first data n_elts, //Number of elements = last_elt - first_elt compact, //1 for compactness was_mapped; //1 if the data was mapped into memory } ExpandingArray; //Elements of a linked list typedef struct precise_mythical_node { PreciseSparseElement data; struct precise_mythical_node *next, *prev; } PreciseSparseNode; typedef struct compact_mythical_node { CompactSparseElement data; struct compact_mythical_node *next, *prev; } CompactSparseNode; typedef struct { PreciseSparseNode *precise; CompactSparseNode *compact; int is_compact; } SparseNode; //Linked list struct typedef struct { SparseNode head, tail; int compact; void *last_addr; } SparseElementList; //Expanding array functions ExpandingArray *make_expanding_array(int init_size, int compact); void expanding_array_insert(ExpandingType d, ExpandingArray *A); void expanding_array_set(ExpandingType d, int c, ExpandingArray *A); ExpandingType expanding_array_get(int c, ExpandingArray *A); void expanding_array_trim(ExpandingArray *A); int expanding_array_search(unsigned int c, int init, ExpandingArray *A); int expanding_array_insert_before(ExpandingType ne, int before, ExpandingArray *A); int expanding_array_insert_after(ExpandingType ne, int after, ExpandingArray *A); void expanding_array_clear(ExpandingArray *A); void expanding_array_remove_elt(int elt, ExpandingArray *A); size_t 
expanding_array_write(ExpandingArray *A, FILE *fp); void expanding_array_read(ExpandingArray *A, FILE *fp); ExpandingArray *expanding_array_map(void **addr, void *last_addr); void expanding_array_free_data(ExpandingArray *A); void expanding_array_free(ExpandingArray *A); //SparseElementList functions SparseElementList *make_list(int compact); SparseNode list_search(unsigned int c, SparseNode init, SparseElementList *l); SparseNode list_insert_before(SparseElement newelt, SparseNode before, SparseElementList *l); SparseNode list_insert_after(SparseElement ne, SparseNode after, SparseElementList *l); void list_clear(SparseElementList *l); void list_remove_elt(SparseElementList *l, SparseNode toremove); int list_is_empty(SparseElementList *l); size_t list_write(SparseElementList *l, FILE *fp); int list_read(SparseElementList *l, FILE *fp, int n_elts); SparseElementList *list_map(void **addr, void *last_addr, int *n_elts_ptr); void *list_memmove(void *to, SparseElementList *from); //Sparse Nodes MY_INLINE SparseNode make_null_node(int compact); MY_INLINE int null_node(SparseNode n); MY_INLINE double node_data(SparseNode n); MY_INLINE unsigned int node_col(SparseNode n); MY_INLINE SparseNode next_node(SparseNode n); MY_INLINE SparseNode prev_node(SparseNode n); MY_INLINE void node_set_data(SparseNode n, double d); MY_INLINE void node_set_col(SparseNode n, unsigned int c); MY_INLINE void node_free(SparseNode n); //Comparator functions for QSort int compact_expanding_type_int_compare(const void *a, const void *b); int precise_sparse_element_val_compare(const void *a, const void *b); int precise_sparse_element_col_compare(const void *a, const void *b); /***********************Sparse Node Functions***************************/ //return a node with the correct compactness and //the appropriate pointer null MY_INLINE SparseNode make_null_node(int compact) { SparseNode n; n.is_compact = compact; n.compact = NULL; n.precise = NULL; if (compact) { n.compact = NULL; } else { 
n.precise = NULL; } return n; } //returns 1 if the pointer with the correct compactness //is null MY_INLINE int null_node(SparseNode n) { if (n.is_compact) { return (n.compact == NULL); } return (n.precise == NULL); } //returns the data associated with n MY_INLINE double node_data(SparseNode n) { if (null_node(n)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "node_data: null node.\n"); } return -RAND_MAX; } if (n.is_compact) { return (double)n.compact->data.data; } return n.precise->data.data; } //returns the column number associated with n MY_INLINE unsigned int node_col(SparseNode n) { if ((n.is_compact && !(n.compact)) || (!(n.is_compact) && !(n.precise))) { if (MATR_DEBUG_MODE) { fprintf(stderr, "node_col: null node.\n"); } return MAX_INT_VAL; } if (n.is_compact && n.compact) { return n.compact->data.col; } return n.precise->data.col; } //returns a pointer to the node after //the one n points to MY_INLINE SparseNode next_node(SparseNode n) { SparseNode ret; ret.is_compact = n.is_compact; ret.compact = NULL; ret.precise = NULL; if (null_node(n)) { return make_null_node(n.is_compact); } if (n.is_compact) { ret.compact = n.compact->next; } else { ret.precise = n.precise->next; } return ret; } //returns a pointer to the node before //the one n points to MY_INLINE SparseNode prev_node(SparseNode n) { SparseNode ret; ret.is_compact = n.is_compact; ret.compact = NULL; ret.precise = NULL; if (null_node(n)) { return make_null_node(n.is_compact); } if (n.is_compact) { ret.compact = n.compact->prev; } else { ret.precise = n.precise->prev; } return ret; } //sets the data associated with node n to be d MY_INLINE void node_set_data(SparseNode n, double d) { if (null_node(n)) { if (MATR_DEBUG_MODE) { fprintf(stderr, "node_set_data: null node.\n"); } return; } if (n.is_compact) { n.compact->data.data = (int)d; } else { n.precise->data.data = d; } } //sets the column associated with node n to be c MY_INLINE void node_set_col(SparseNode n, unsigned int c) { if (null_node(n)) { if 
(MATR_DEBUG_MODE) { fprintf(stderr, "node_set_col: null node.\n"); } return; } if (n.is_compact) { n.compact->data.col = c; } else { n.precise->data.col = c; } } //frees the pointer that n has //taking into account compactness MY_INLINE void node_free(SparseNode n) { if (null_node(n)) { return; } if (n.is_compact) { free(n.compact); } else { free(n.precise); } } #endif //crm_svm_matrix_util.h crm114-20100106-BlameMichelson.src/crm_css_maintenance.c0000644000000000017500000003277111321154266021146 0ustar rootwsy// crm_css_maintenance.c - migrogrooming utilities // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" static long crm_zapcss ( FEATUREBUCKET_STRUCT *h, unsigned long hs, unsigned long start, unsigned long end ); // How to microgroom a .css file that's getting full // // NOTA BENE NOTA BENE NOTA BENE NOTA BENE // // This whole section of code is under intense develoment; right now // it "works" but not any better than nothing at all. Be warned // that any patches issued on it may well never see the light of // day, as intense testing and comparison may show that the current // algorithms are, well, suckful. // // // There are two steps to microgrooming - first, since we know we're // already too full, we execute a 'zero unity bins'. Then, we see // how the file looks, and if necessary, we get rid of some data. 
// R is the "MICROGROOM_RESCALE_FACTOR" // long crm_microgroom (FEATUREBUCKET_STRUCT *h, unsigned char *seen_features, long hs, unsigned long hindex) { long i, j, k; static long microgroom_count = 0; long steps; long packstart; // first used bucket in the chain long packlen; // # of used buckets in the chain long packend; // last used bucket in the chain // for stochastic grooming we need a place for the random... unsigned long randy; long zeroed_countdown; long actually_zeroed; long force_rescale; j = 0; k = 0; zeroed_countdown = MICROGROOM_STOP_AFTER; // i = j = k = 0; microgroom_count++; if (user_trace) { if (microgroom_count == 1) fprintf (stderr, "CSS file too full: microgrooming this css chain: "); fprintf (stderr, " %ld ", microgroom_count); }; // We have two different algorithms for amnesia - stochastic // (meaning random) and weight-distance based. // steps = 0; randy = 0; force_rescale = 0; #ifdef STOCHASTIC_AMNESIA // set our stochastic amnesia matcher - note that we add // our microgroom count so that we _eventually_ can knock out anything // even if we get a whole string of buckets with hash keys that all alias // to the same value. // // We also keep track of how many buckets we've zeroed and we stop // zeroing additional buckets after that point. NO! BUG! That // messes up the tail length, and if we don't repack the tail, then // features in the tail can become permanently inaccessible! Therefore, // we really can't stop in the middle of the tail (well, we could // stop zeroing, but we need to pass the full length of the tail in. // // Note that we can't do this "adaptively" in packcss, because zeroes // there aren't necessarily overflow chain terminators (because -we- // might have inserted them here. // // start at initial chain start, move to back of // chain that overflowed, then scale just that chain. 
// i = j = hindex % hs; if (i == 0) i = 1; while (h[i].hash != 0) { i--; if (i < 1) i = hs - 1; if (i == j) break; // don't hang if we have a 100% full .css file // fprintf (stderr, "-"); } // now, move our index to point to the first bucket in this chain. i++; if (i >= hs) i = 1; packstart = i; steps = 0; force_rescale = 0; while (h[i].value != 0 ) { // fprintf (stderr, "="); randy = rand() + microgroom_count; if ( ( h[i].key != 0 ) // hash keys == 0 are SPECIALS like #learns, // and must never be deleted. && (force_rescale || (( h[i].key + randy ) & MICROGROOM_STOCHASTIC_MASK ) == MICROGROOM_STOCHASTIC_KEY )) { h[i].value = h[i].value * MICROGROOM_RESCALE_FACTOR; }; if (h[i].value == 0) zeroed_countdown--; i++; if (i >= hs ) i = 1; steps++; } packlen = steps; #endif // STOCHASTIC_AMNESIA #ifdef WEIGHT_DISTANCE_AMNESIA // // Weight-Distance Amnesia is an improvement by Fidelis Assis // over Stochastic amnesia in that it doesn't delete information // randomly; instead it uses the heuristic that low-count buckets // at or near their original insert point are likely to be old and // stale so expire those first. // // i = j = k = hindex % hs; if (i == 0) i = j = k = 1; while (h[i].hash != 0) { i--; if (i < 1) i = hs - 1; if (i == j) break; // don't hang if we have a 100% full .css file // fprintf (stderr, "-"); } // now, move our index to point to the first _used_ bucket in // this chain. i++; if (i >= hs) i = 1; packstart = i; // Now find the _end_ of the bucket chain. // while (h[j].hash != 0) { j++; if (j >= hs) j = 1; if (j == k) break; // don't hang on 100% full .css file } j--; if (j == 0) j = hs - 1; // j is now the _last_ _used_ bucket. packend = j; // Now we have the start and end of the bucket chain. // // An advanced version of this algorithm would make just two passes; // one to find the lowest-ranked buckets, and another to zero them. 
// However, Fidelis' instrumentation indicates that an in-place, // multisweep algorithm may be as fast, or even faster, in the most // common situations. So for now, we'll do a multisweep. // // // Normal Case: hs=10, packstart = 4, packend = 7 // buck# 0 1 2 3 4 5 6 7 8 9 // R 0 0 0 X X X X 0 0 // so packlen = 4 ( == 7 - 4 + 1) // // fixup for wraparound - note the 0th bucket is RESERVED: // example hs = 10, packstart = 8, packend = 2 // buck# 0 1 2 3 4 5 6 7 8 9 // R X X 0 0 0 0 0 X X // and so packlen = 4 (10 - 8 + 2) if (packstart < packend ) { packlen = packend - packstart + 1; } else { packlen = ( hs - packstart ) + packend; }; // And now zap some buckets - are we in wraparound? // if ( packstart < packend ) { // fprintf (stderr, "z"); actually_zeroed = crm_zapcss ( h, hs, packstart, packend); } else { //fprintf (stderr, "Z"); actually_zeroed = crm_zapcss (h, hs, packstart, hs -1 ); actually_zeroed = actually_zeroed + crm_zapcss (h, hs, 1, (packlen - (hs - packstart))); }; #endif // WEIGHT_DISTANCE_AMNESIA // now we pack the buckets crm_packcss (h, seen_features, hs, packstart, packlen); return (actually_zeroed); } //////////////////////////////////////////////// // // crm_zapcss - the distance-heuristic microgroomer core. static long crm_zapcss ( FEATUREBUCKET_STRUCT *h, unsigned long hs, unsigned long start, unsigned long end ) { // A question- what's the ratio deprecation ratio between // "distance from original" vs. low point value? The original // Fidelis code did a 1:1 equivalence (being 1 place off is exactly as // bad as having a count of just 1). // // In reality, because of Zipf's law, most of the buckets // stay at a value of 1 forever; they provide scant evidence // no matter what. Therefore, we will allow separate weights // for V (value) and D (distance). Note that a D of zero // means "don't use distance, only value", and a V of zero // means "don't use value, only distance. 
Mixed values will // give intermediate tradeoffs between distance( ~~ age) and // value. // // Similarly, VWEIGHT2 and DWEIGHT2 go with the _square_ of // the value and distance. #define VWEIGHT 1.0 #define VWEIGHT2 0.0 #define DWEIGHT 1.0 #define DWEIGHT2 0.0 long vcut; long zcountdown; unsigned long packlen; unsigned long k; long actually_zeroed; vcut = 1; packlen = end - start; // fprintf (stderr, " S: %ld, E: %ld, L: %ld ", start, end, packlen ); zcountdown = packlen / 32.0 ; // get rid of about 3% of the data actually_zeroed = 0; while (zcountdown > 0) { // fprintf (stderr, " %ld ", vcut); for (k = start; k <= end; k++) { if (h[k].key != 0 ) // key == 0 means "special- don't zero!" { // fprintf (stderr, "a"); if (h[k].value > 0) // can't zero it if it's already zeroed { // fprintf (stderr, "b"); if ((VWEIGHT * h[k].value) + (VWEIGHT2 * h[k].value * h[k].value ) + (DWEIGHT * (k - h[k].hash % hs)) + (DWEIGHT2 * (k - h[k].hash % hs) * (k - h[k].hash % hs)) <= vcut) { // fprintf (stderr, "*"); h[k].value = 0; zcountdown--; actually_zeroed++; }; }; }; }; vcut++; }; return (actually_zeroed); } void crm_packcss (FEATUREBUCKET_STRUCT *h, unsigned char *seen_features, long hs, long packstart, long packlen) { // How we pack... // // We look at each bucket, and attempt to reinsert it at the "best" // place. We know at worst it will end up where it already is, and // at best it will end up lower (at a lower index) in the file, except // if it's in wraparound mode, in which case we know it will not get // back up past us (since the file must contain at least one empty) // and so it's still below us in the file. 
//fprintf (stderr, "Packing %ld len %ld total %ld", // packstart, packlen, packstart+packlen); // if (packstart+packlen >= hs) // fprintf (stderr, " BLORTTTTTT "); if (packstart+packlen <= hs) // no wraparound in this case { crm_packseg (h, seen_features, hs, packstart, packlen); } else // wraparound mode - do it as two separate repacks { crm_packseg (h, seen_features, hs, packstart, (hs - packstart)); crm_packseg (h, seen_features, hs, 1, (packlen - (hs - packstart))); }; } void crm_packseg (FEATUREBUCKET_STRUCT *h, unsigned char *seen_features, long hs, long packstart, long packlen) { unsigned long ifrom, ito; unsigned long thash, tkey, tvalue, tseen; // keep the compiler quiet - tseen is used only if seen_features // is non-null, but the compiler isn't smart enough to know that. tseen = 0; if (internal_trace) fprintf (stderr, " < %ld %ld >", packstart, packlen); for (ifrom = packstart; ifrom < packstart + packlen; ifrom++) { // Is it an empty bucket? (remember, we're compressing out // all placeholder buckets, so any bucket that's zero-valued // is a valid target.) if ( h[ifrom].value == 0) { // Empty bucket - turn it from marker to empty if (internal_trace) fprintf (stderr, "x"); h[ifrom].key = 0; h[ifrom].hash = 0; if (seen_features) seen_features[ifrom] = 0; } else { if (internal_trace) fprintf (stderr, "-");}; } // Our slot values are now somewhat in disorder because empty // buckets may now have been inserted into a chain where there used // to be placeholder buckets. We need to re-insert slot data in a // bucket where it will be found. // ito = 0; for (ifrom = packstart; ifrom < packstart+packlen; ifrom++) { // Now find the next bucket to place somewhere // thash = h[ifrom].hash; tkey = h[ifrom].key; tvalue = h[ifrom].value; if (seen_features) tseen = seen_features[ifrom]; if (tvalue == 0) { if (internal_trace) fprintf (stderr, "X"); } else { ito = thash % hs; if (ito == 0) ito = 1; // fprintf (stderr, "a %ld", ito); while ( ! 
( (h[ito].value == 0) || ( h[ito].hash == thash && h[ito].key == tkey ))) { ito++; if (ito >= hs) ito = 1; // fprintf (stderr, "a %ld", ito); }; // // found an empty slot, put this value there, and zero the // original one. Sometimes this is a noop. We don't care. if (internal_trace) { if ( ifrom == ito ) fprintf (stderr, "="); if ( ito < ifrom) fprintf (stderr, "<"); if ( ito > ifrom ) fprintf (stderr, ">"); }; h[ifrom].hash = 0; h[ifrom].key = 0; h[ifrom].value = 0; if (seen_features) seen_features[ifrom] = 0; h[ito].hash = thash; h[ito].key = tkey; h[ito].value = tvalue; if (seen_features) seen_features[ito] = tseen; }; }; } int crm_create_cssfile(char *cssfile, long buckets, long major, long minor, long spectrum_start) { FILE *f; long i; FEATUREBUCKET_STRUCT feature = {0, 0, 0}; if (user_trace) fprintf (stderr, "Opening file %s for writing.\n", cssfile); f = fopen (cssfile, "wb"); if (!f) { fprintf (stderr, "\n Couldn't open file %s for writing; errno=%d .\n", cssfile, errno); return (EXIT_FAILURE); } // Initialize CSS file - zero all buckets feature.hash = major; feature.key = minor; feature.value = spectrum_start; for (i=0; i < buckets; i++) { if (fwrite(&feature, sizeof(feature), 1, f) != 1) { fprintf (stderr, "\n Couldn't initialize .CSS file %s, " "errno=%d.\n", cssfile, errno); return (EXIT_FAILURE); } // // HACK ALERT HACK ALERT HACK ALERT // // yeah,there's more efficient ways to do this, but this will // stay in cache; an IF-statement will need at least three ops as // well. Probably six of one... feature.hash = 0; feature.key = 0; feature.value = 0; } fclose (f); return (EXIT_SUCCESS); } crm114-20100106-BlameMichelson.src/mailtrainer.crm0000755000000000017500000011200111321154266020005 0ustar rootwsy#! 
/usr/bin/crm # --(spam good repeat streak random worst verbose validate thick reload collapse report_header rfile goodcss spamcss config fileprefix) # # mailtrainer.crm - a TUNE type mailtrainer # # Note to SunOS and FreeBSD users - do not place command arguments of # "-([arguments])" format on the first line of this program # or you will not get what you expect. This is due to a kernel # difference in how a bangline should be dealt with. # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # A TUNE-type mailtrainer; repeat is the maximum number of # repeated executions. This filter uses the same config as # mailfilter.crm. Good and spam are trained alternately until # each file has been examined and possibly trained at least REPEAT # times, or until a run of at least STREAK length has been correctly # classified. If RANDOM is set, then randomize the order of the # files being trained on each pass. # # Worst means to rerun the entire set, and then retrain only the # N worst errors. In the limit, this is the single worst error. # This is slow but yields very "tight" and accurate files. # # Copyright (C) 2002-2006 William S. Yerazunis; licensed under the # GNU Public License (GPL) version 2. A copy of this license is included # in the distribution media, or obtain one from www.fsf.org . # # Note to BSD users - you MUST remove EVERYTHING on the first line # of this program from the first "-" to the end of the first line # (including the "-" sign itself) or you will not get what you # expect. This is due to a bug in the BASH code on BSD. # # # --->>> Basic Design Philosophy ( do these IN ORDER ) - note # that this is different for the worst strategy. 
# # 1) get directory listing of the good directory # # 2) get directory listing of the bad directory # # 3) for repcount < repeat (N passes through entire set) # and cleanrun < streak (M tests without a single error) # # 3a) cleanrun++, # test one from good ; if less than thresh, learn good, cleanrun=0. # # 3b) cleanrun++, # test one from spam; if more than -thresh, learn spam, cleanrun=0 # # 3c) repcount++ # # 4) email results to spam results and out to stdout as well. # ############################################################## # window # # --- uncomment this if you want to include a "forced" # configuration file --- # insert mailfilterconfig.crm # # # --- These vars must have a value, or else we'll get errors ---- # isolate (:classifier_reason:) /no reason yet/ # isolate (:classify_status:) // # isolate (:our_exit_code:) /0/ # isolate (:stats:) / pR: 0.000000 / # isolate (:pr:) / pR: 0.00000/ # isolate (:subj_text:) / (None) / # isolate (:add_extra_stuff:) // # isolate (:decision_length:) /4096/ # # Isolate these email addresses, and give them values, # in case the user doesn't. isolate (:reject_address:) // isolate (:fail_priority_mail_to:) // isolate (:fail_blacklist_mail_to:) // isolate (:fail_SSM_mail_to:) // isolate (:log_rejections:) // # # this ISOLATE will guarantee that :fileprefix: exists, and keep it's # prior (commandline) value if it does, and an empty string if it doesnt isolate (:fileprefix:) // isolate (:spamcss:) /spam.css/ isolate (:goodcss:) /nonspam.css/ # # This ISOLATE will guarantee that :force: will exist, and keep the # commandline value ("SET") , or the null string if the user doesn't # use --force on the command line. isolate (:force:) # # This ISOLATE will guarantee that :unlearn: will exist, and will keep # the commandline value ("SET") or the null string if the user doesn't # use --unlearn on the command line. 
isolate (:unlearn:) # # now, :clf: is the classify & learn flags; note that we have two # separate flags here in a bizarre chain. The reason is that :unlearn: # can have the value "SET", whereas :rft: needs "refute" isolate (:clf:) // # { isolate (:_arg2:) match [:_arg2:] /--help/ output / This is CRM114's mailtrainer, which builds .css statistics \n/ output / files. It uses DSTTTR and mailfilter.cf configuration setup. \n/ output / You *must* supply at least --good and --spam to run.\n/ output / Command Format: \n/ output / .\/mailtrainer.crm [options]* \n / output / Required options: \n/ output / --spam=\/spam\/directory\/ (one msg per file) \n/ output / --good=\/good\/directory\/ (one msg per file) \n/ output / Optional options: \n/ output / --spamcss=spam_statistics.css \n/ output / --goodcss=good_statistics.css \n/ output / --repeat=N (limit how many passes, default 5) \n/ output / --streak=N (exit on N perfect, default 10000) \n/ output / --random (train in random order, default not) \n/ output / --worst=N (train only the N worst errors per pass. SLOW!)\n/ output / --verbose (tell me more.) \n/ output / --validate=regex (Don't train any filename matching regex;\n/ output / instead, hold them back and make a final test \n/ output / pass with those hold-backs at the end.) \n/ output / --thick=N.N (TTT value; default 10.0 for OSB is good)\n/ output / --reload (if not randomizing, whenever one set of files \n/ output / is exhausted, reload it. Default- don't.)\n/ output / --collaspe (collapse intermediate reporting lines)\n/ output / --report_header='string' (include string in the header)\n/ output / --config=file (set config file. Default is mailfilter.cf)\n/ output / --fileprefix=dir (expect all files in "fileprefix")\n/ output /\n That's all! Enjoy. \n/ exit } ##################################################################### # # This is the code to read the per-user configuration. 
Note # that because this happens during the run, it will _override_ # any comand line arguments that get set. { isolate (:option_txt:) isolate (:ev:) isolate (:verbose_startup:) isolate (:config:) # # Part 1 - read in the options/configuration file # { { match [:config:] /.+/ input [:*:config:] (:option_txt:) } alius { # read in the standard mail filter configuration file. input [:*:fileprefix:mailfilter.cf] (:option_txt:) } } # # # reset loop for matching to start of :option_txt: match [:option_txt:] // # and loop till there are no more options. { # find a line that looks like a parameter setting... match < fromend nomultiline > (:line: :name: :value:) \ [:option_txt:] /^[ ]*(:[[:graph:]]+:)[ \t]+\/(.*)\// { # don't execute the assign if there's # a # at the start of the line. match [:name:] /^\x23/ { # Verbose startup? match [:verbose_startup:] /SET/ output / :*:name:\n :*:value:\n/ } isolate (:*:name:) /:*:value:/ } liaf } } # # Do a quick check- has the password been changed or not? If it's # still the default, put in something that will be well-nigh unguessable # (esp. since it will contain recieved headers that the sender cannot # see nor control.) { match [:spw:] /DEFAULT_PASSWORD/ # yes, it's the same as default. So we scramble it just so # nobody can hack in hash (:spw:) /:*:_env_string::*:_dw:/ } ############################################################# # # Set up the addresses that we might need to mail to # isolate (:reject_address:) /:*:general_fails_to:/ { match [:fail_priority_mail_to:] /[[:graph:]]/ alter (:fail_priority_mail_to:) /:*:general_fails_to:/ } { match [:fail_blacklist_mail_to:] /[[:graph:]]/ alter (:fail_blacklist_mail_to:) /:*:general_fails_to:/ } { match [:fail_SSM_mail_to:] /[[:graph:]]/ alter (:fail_SSM_mail_to:) /:*:general_fails_to:/ } # # Does the user want us to log all incoming mail? This is handy for # testing and auditing purposes. 
{ match [:log_to_allmail.txt:] /yes/ output [:*:fileprefix:allmail.txt] /:*:_dw:/ } ########################################################### # Set up defaults for mail training... # isolate (:spam:) /ERROR!!!/ isolate (:good:) /ERROR!!!/ isolate (:repeat:) /1/ isolate (:streak:) /10000/ isolate (:random:) /no/ isolate (:worst:) /no/ isolate (:verbose:) /no/ isolate (:validate:) // # note that this is a bit tricky isolate (:reload:) /no/ isolate (:collapse:) // isolate (:report_header:) // isolate (:rfile:) // ##### if --thick is specified, it overrides the :thick_threshold from *.cf isolate (:thick:) /no/ { match [:thick:] /^no$/ alter (:thick_threshold:) /:*:thick:/ } # and set up our bookkeeping variables # isolate (:throughall:) /0/ isolate (:cleanrun:) /0/ isolate (:spamfiles:) // isolate (:goodfiles:) // isolate (:filename:) // isolate (:lfilename:) // isolate (:worst_results:) // isolate (:worst_retrains:) // isolate (:z:) // isolate (:exp_text: :a: :b: :c: :h:) isolate (:m_text: :b_text: :i_text: :comment: :commentbin: :rewrites:) ########################################################### # # set gooddir and spamdir to the directory parts of the spec # match [:good:] (:gooddir:) /^.*\// match [:spam:] (:spamdir:) /^.*\// #############################################################\ # # Start our report: # isolate (:report:) / MailTrainer Report \n:*:report_header:\n\n/ # alter (:report:) /:*:report: Commanded on: \n/ alter (:report:) /:*:report: spam source directory: :*:spamdir: \n/ alter (:report:) /:*:report: good source directory: :*:gooddir: \n/ alter (:report:) /:*:report: classifier config: :*:clf: \n/ alter (:report:) /:*:report: threshold thickness: :*:thick_threshold: \n/ alter (:report:) /:*:report: max repetitions: :*:repeat: \n/ alter (:report:) /:*:report: stop when a streak of: :*:streak: \n/ alter (:report:) /:*:report: randomization is: :*:random: \n/ alter (:report:) /:*:report: worst is: :*:worst: \n/ alter (:report:) /:*:report: 
verbose is: :*:verbose: \n/ alter (:report:) /:*:report: auto-reload: :*:reload: \n/ alter (:report:) /:*:report: concise log file: :*:rfile: \n/ { { match [:validate:] /./ alter (:report:) /:*:report: validation regex: :*:validate: \n/ } alius { alter (:report:) /:*:report: validation regex: (none) \n/ alter (:validate:) /[^\x00-\xFF]/ # this regex never matches. } } # { # do we do an output report at the top? match [:collapse:] /SET/ output /:*:report:/ } { # do we output the report header anyway match [:report_header:] /SET/ output /:*:report:/ { # do we have an rfile? match [:rfile:] /./ output [:*:rfile:] /:*:report:/ } } ########################################################### # # ############################################################ # # Get the good directory and the spam directory files # { syscall /ls :*:spam: / () (:spamfiles:) # output /spamfiles: ':*:spamfiles:'\n/ trap /.*/ (:reason:) { output / :*:reason:/ output /Unable to read your spamdir at :*:spamdir: \n/ alter (:report:) /:*:report: Unable to read your spamdir at :*:spamdir: \n/ goto /:error_exit:/ } } { syscall /ls :*:good: / () (:goodfiles:) # output /goodfiles: ':*:goodfiles:'\n/ trap /.*/ (:reason:) { output /:*:reason:/ output /Unable to read your gooddir at :*:gooddir: \n/ alter (:report:) /:*:report: Unable to read your gooddir at :*:gooddir: \n/ goto /:error_exit:/ } } ################################################################# # # If --random, then we create the randomized interleaved list. 
# The list is the full filenames of the spam and good files, # each line is prefixed by S for Spam and G for Good { match [:random:] /SET/ isolate (:randfiles:) // # put the full filename for the spam files first match [:spamfiles:] // { match [:spamfiles:] /[[:graph:]]+/ (:f:) alter (:randfiles:) /:*:randfiles:S:*:spam::*:f:\n/ liaf } # and the full filename of the good files next match [:goodfiles:] // { match [:goodfiles:] /[[:graph:]]+/ (:f:) alter (:randfiles:) /:*:randfiles:G:*:good::*:f:\n/ liaf } # output / Full set of files, before sort-randomization: \n:*:randfiles:\n/ # now randomize the files. NOTE that this requires a shuffler # command somewhere. syscall (:*:randfiles:) (:randfiles:) /:*:fileprefix::*:trainer_randomizer_command: / # output /\n Full set of files, after randomize: \n:*:randfiles:\n/ } ################################################################# # # Create spam.css and nonspam.css if they don't exist. # (just learn a newline into each one) learn [:_nl:] <:*:clf:> /:*:lcr:/ \ (:*:fileprefix::*:goodcss:) learn [:_nl:] <:*:clf:> /:*:lcr:/ \ (:*:fileprefix::*:spamcss:) #################################################################### # # Top of the "entire directory" loop :directory_loop_top: # ################################################################### { { eval /:@: :*:repeat: > :*:throughall: :/ alter (:report:) /:*:report: \n\n Running all files\n/ output / \nRunning all files \n/ } alius { alter (:report:) /:*:report:\n\nFinished :*:repeat: passes \n/ output / \n Finished :*:repeat: passes. \n/ goto /:good_exit:/ } } ######################################################################## # # Set up training lists for this pass. We'll be chopping them up # presently. 
# # the spam training list for this pass: isolate (:stl:) /:*:spamfiles:/ # # the good training list for this pass: isolate (:gtl:) /:*:goodfiles:/ # # the random training list for this pass: { match [:random:] /SET/ isolate (:rtl:) /:*:randfiles:/ } # { match [:worst:] /no/ output /\nTake a break. This will require some time to finish. \n\n/ { match [:worst:] /SET/ # if no count set, default is 10 alter (:worst:) /10/ } goto /:worst_training:/ } ################################################################### # the top of the one-file-pair loop ###################################################################3 :one_file_pair_top: ############################################################## # Are we in "alternate one each" mode, or random shuffled mode? # { { match [:random:] /SET/ # get a filename # Remember that random-mode filenames are full-length already. call /:clip_filename:/ [:rtl:] (:lfilename:) match [:lfilename:] /(.)(.*)/ (:: :ftype: :filename:) # Don't run it if it's in the "validate" set match [:filename:] /:*:validate:/ { match [:ftype:] /G/ output / \nGood :*:filename: / { # Maybe it's a qualified name, maybe it's not input [:*:filename: 0 :*:decision_length:] trap /unable to read-open/ output /\n COULDN'T READ THE GOOD FILE ':*:filename:' \n/ alter (:_dw:) /:*:_nl:/ } call /:do_mutilations:/ # leaves the result in :m_text: # output / M_TEXT: :*:m_text:\n/ call /:test_train_good:/ } alius { output / \nSpam :*:filename: / { # Maybe it's a qualified name, maybe it's not input [:*:filename: 0 :*:decision_length:] trap /unable to read-open/ output /\n COULDN'T READ THE SPAM FILE ':*:filename:'\n/ alter (:_dw:) /:*:_nl:/ } call /:do_mutilations:/ # leaves the result in :m_text: # output / M_TEXT: :*:m_text:\n/ call /:test_train_spam:/ } } alius { match [:random:] /SET/ # No, we are in alternate mode. Do 1 good, then 1 spam file. # # Get the first good file. 
Note that if no files left, the match # just falls through and we don't do anything with this. { call /:clip_filename:/ [:gtl:] (:filename:) match [:filename:] /./ # check - is this file in the validate set ? If no, proceed. match [:filename:] /:*:validate:/ output / \nGood file :*:filename: / { # Maybe it's a qualified name, maybe it's not input [:*:gooddir::*:filename: 0 :*:decision_length:] trap /unable to read-open/ input [:*:filename: 0 :*:decision_length:] trap /unable to read-open/ output /\n COULDN'T READ THE GOOD FILE ':*:filename:'\n/ alter (:_dw:) /:*:_nl:/ } call /:do_mutilations:/ # leaves the result in :m_text: # output / M_TEXT: :*:m_text:\n/ call /:test_train_good:/ } # repeat for the first spam file. Again, if the match fails, there # were no filenames left, so we just fall through. { call /:clip_filename:/ [:stl:] (:filename:) match [:filename:] /./ # check - is this file in the validate set ? If no, proceed. match [:filename:] /:*:validate:/ output / \nSpam file :*:filename: / { # maybe it's a qualified name, maybe it's not input [:*:spamdir::*:filename: 0 :*:decision_length:] trap /unable to read-open/ input [:*:filename: 0 :*:decision_length:] trap /unable to read-open/ output /\n COULDN'T READ THE SPAM FILE ':*:filename:'\n/ alter (:_dw:) /:*:_nl:/ } call /:do_mutilations:/ # leaves the result in :m_text: call /:test_train_spam:/ } } } # Do we exit, or go 'round again? # # First check on a long-enough streak of good classifications. { eval /:@: :*:cleanrun: > :*:streak: : / alter (:report:) /:*:report: \n Got a clean run of :*:cleanrun: \n Exiting now. \n\n/ output /\nExcellent! Got a streak of :*:cleanrun: without errors. \n/ output / Finishing up. \n/ goto /:good_exit:/ } # did we get through all of the file names? 
# { # Most common case - neither fileset is empty { match [:random:] /SET/ { match [:rtl:] /./ goto /:one_file_pair_top:/ } } alius { match [:gtl:] /./ match [:stl:] /./ goto /:one_file_pair_top:/ } } # # If a fileset is empty, and reload is set, we reload that fileset # independently and immediately { match [:reload:] /SET/ { match [:gtl:] /./ eval (:throughall:) /:@: :*:throughall: + 0.5 :/ { eval /:@: :*:throughall: < :*:repeat: : / isolate (:gtl:) /:*:goodfiles:/ alter (:report:) /:*:report: \n\n Repeating good files\n/ output / \nRepeating good files \n/ goto /:one_file_pair_top:/ } alius { goto /:good_exit:/ } } { match [:stl:] /./ eval (:throughall:) /:@: :*:throughall: + 0.5 :/ { eval /:@: :*:throughall: < :*:repeat: : / isolate (:stl:) /:*:spamfiles:/ alter (:report:) /:*:report: \n\n Repeating spam files\n/ output / \nRepeating spam files \n/ goto /:one_file_pair_top:/ } alius { goto /:good_exit:/ } } } # yep; through all the filenames. Increment the :throughall: counter # and maybe go through it all again. { eval (:throughall:) /:@: :*:throughall: + 1 :/ goto /:directory_loop_top:/ } # All done now with repeats through the loop. # Now we update the report and we're done. alter (:report:) /:*:report: \n\n Finished with :*:repeat: passes. \n\n Training complete! \n\n\n / goto /:good_exit:/ ############################################################## # # Grab the text that we're going to actually work with. # :do_mutilations: # # We copy this into m_text - the "mutilated text". It # will become an annotated _copy_ of the incoming text, # with whatever changes we think will help us classify better. # # We clip m_text to be the first :decision_length: characters of # the incoming mail. # match (:m_text:) [:_dw: 0 :*:decision_length:] /.*/ isolate (:m_text:) # # :b_text: is the text with base64's expanded. isolate (:b_text:) /:*:m_text:/ # # :i_text: is the text with Hypertextus Interruptus removed. 
isolate (:i_text:) /:*:m_text:/ # # # # do we do any expansions? { # expansion 1: - do we perform base64 expansions? { { match [:do_base64:] /yes/ { # yes, expand base64's if there are any # # Note: some spams don't even bother to use # a 'Content-Transfer-Encoding marker, # and even fewer use Content-Type: text/whatever # so we have to sort of wing it, when to expand # what _might_ be base64 and when to ignore it. # For now, if it says it's a base64, it gets # expanded, no matter what the type. Maybe # someday someone will put in a lockout for # things like .jpg files, .doc files, etc. # #isolate (:exp_text: :a: :b: :c: :h:) match [:b_text:] (:a: :h: :b:) \ /(Content-Transfer-Encoding): base64(.*)/ #match (:a:) \ # /Content-Transfer-Encoding: base64((.)*)/ #match [:a:] (:h: :b:) \ # /base64(.*)/ match (:c:) [:b:] \ /([a-zA-Z0-9+=!\/]+:*:_nl:){2,200}/ # syscall (:*:c:) (:exp_text:) /:*:mime_decoder: / # and stuff the result back into b_text for # classification right in context. alter (:c:) /:*:exp_text:/ # and mark this piece of mime as "prior". alter (:h:) /Content-Transfer-Prior-Encoding/ # repeat till no more Mime base64 encodings liaf } } alius { # if no base64 expansions enabled, empty out :b_text: # alter (:b_text:) // } } # # If we had expansions, bust the html contents out of them, otherwise # ignore b_text as it's redundant { { match [:b_text:] /Content-Transfer-Prior-Encoding/ alter (:i_text:) /:*:b_text:/ } alius { # if :b_text: _didn't_ have a base64, it's useless alter (:b_text:) // } } # expansion 2 : do we bust HTML comments ( a.k.a. # hypertextus interruptus) out? { match [:undo_interruptus:] /yes/ alter (:commentbin:) // { match [:i_text:] (:comment:) // alter (:commentbin:) /:*:commentbin: :*:comment:/ alter ( :comment: ) // liaf } # if we had at least 80 characters worth of comments, then # it's worth using the decommented text, else not. 
# (this my personal judgement call) { { match [:commentbin:] /(.){80,}/ } alius { alter (:i_text:) // } } } } # and reassemble the mucked-over text into the :m_text: var, always # with the base64's expanded, then a second decommented copy # { alter (:m_text:) \ /:*:m_text: :*:_nl: :*:b_text: :*:_nl: :*:i_text: :*:_nl:/ } ######################################################### # # Do we want to do any rewrites before running? # { match [:rewrites_enabled:] /yes/ # # NOTE CHANGE THIS ONE TO ISOLATE AND THE PROGRAM FAILS! # isolate (:rewrites:) // alter (:rewrites:) // input (:rewrites:) [:*:fileprefix:rewrites.mfp] # reset matching on rewrites to start of string - if no string, no more # processing of rewrites !! match [:rewrites:] // # # { # Grab the next regex; turn the one-per-line patterns into a # regex and a replacement string. # First, do the line-spanning regexes. match (:ch: :fr1: :to:) [:rewrites:] /(.+)>-->(.*)/ # see if the "fr" regex matches anywhere { match [:m_text:] (:place:) /:*:fr1:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ liaf } # Nope, didn't match... grab the next regex and try again, liaf } # # reset back to the start of the rewrites. # match [:rewrites:] // # # and do it again for non-line-spanners { # Go through and do it again, except this time do it for # the non-line-spanning regexes. match (:ch: :fr2: :to:) [:rewrites:] /(.+)>->(.*)/ # see if the "fr" regex matches anywhere { match [:m_text:] (:place:) /:*:fr2:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ liaf } # Nope, didn't match... grab the next regex and try again, liaf } } # done with rewrites. # all done; m_text now has the fully mutilated text. return ############################################################# # Get a filename off the front of a list, whacking the list. 
############################################################# # :clip_filename: (:namelist:) { { isolate (:filename:) // # start with an empty filename match [:*:namelist:] /([[:print:]]+)\n/ (:wholeline: :filename:) match [:filename:] /./ # assure filename is non-NULL isolate (:filename:) alter (:wholeline:) // # surgically delete the found filename # output /Got the filename :*:filename:/ } alius { alter (:filename:) // } } return /:*:filename:/ ############################################################# # Get the pR of whatever (passed in) ############################################################# # :get_pr: (:text:) { { #output /Good css: ':*:fileprefix::*:goodcss:'\n/ #output /spam css: ':*:fileprefix::*:spamcss:' \n/ #output /lcr: ':*:lcr:' \n/ #output /clf: ':*:clf:' \n/ #output /text: ':*:text:' \n/ classify <:*:clf:> [:*:text:] /:*:lcr:/ \ (:*:fileprefix::*:goodcss: :*:fileprefix::*:spamcss: ) \ (:classify_status:) } ### output /\n:*:classify_status:\n/ match [:classify_status:] \ /^#0.* pR: ([-. 0-9]+)/ ( :: :pr:) } return /:*:pr:/ ############################################################## # The actual code that does a CLASSIFY and maybe a LEARN # # This assumes the input text is in :m_text: # ############################################################## # :test_train_good: { # Classify the text call /:get_pr:/ [:m_text:] { eval /:@: :*:pr: < :*:thick_threshold: : / { { eval /:@: :*:pr: > 0 :/ output / -- (:*:pr:) train / } alius { output / ER (:*:pr:) train / } } alter (:report:) /:*:report: Training GOOD on :*:filename: (pR was :*:pr:) \n / learn [:m_text:] <:*:clf:> /:*:lcr:/ \ (:*:fileprefix::*:goodcss:) alter (:cleanrun:) /0/ # REclassify to see if we're now "good"; if not, refute! 
call /:get_pr:/ [:m_text:] { eval /:@: :*:pr: < :*:thick_threshold: : / { match [:clf:] /hyperspace/ output /& refute / alter (:report:) /:*:report: & refute/ learn [:m_text:] <:*:clf: refute> /:*:lcr:/ \ (:*:fileprefix::*:spamcss:) } alius { output /& repeat / alter (:report:) /:*:report: & repeat/ learn [:m_text:] <:*:clf:> /:*:lcr:/ \ (:*:fileprefix::*:goodcss:) } } { match [:collapse:] /SET/ output /\n/ } } alius { eval (:cleanrun:) /:@: :*:cleanrun: + 1:/ } } return :test_train_spam: { # Classify the text call /:get_pr:/ [:m_text:] { eval /:@: :*:pr: > (0 - :*:thick_threshold:) : / { { eval /:@: :*:pr: < 0 :/ output / -- (:*:pr:) train / } alius { output / ER (:*:pr:) train / } } alter (:report:) /:*:report: Training SPAM on :*:filename: (pR was :*:pr:) \n / learn [:m_text:] <:*:clf:> /:*:lcr:/ \ (:*:fileprefix::*:spamcss:) alter (:cleanrun:) /0/ # REclassify to see if we're now "good"; if not, refute! call /:get_pr:/ [:m_text:] { eval /:@: :*:pr: > (0 - :*:thick_threshold:) : / { match [:clf:] /hyperspace/ output /& refute / alter (:report:) /:*:report: & refute/ learn [:m_text:] <:*:clf: refute> /:*:lcr:/ \ (:*:fileprefix::*:goodcss:) } alius { output /& repeat / alter (:report:) /:*:report: & repeat/ learn [:m_text:] <:*:clf:> /:*:lcr:/ \ (:*:fileprefix::*:spamcss:) } } { match [:collapse:] /SET/ output /\n/ } } alius { eval (:cleanrun:) /:@: :*:cleanrun: + 1:/ } } return ################################################### :error_exit: { output /\n\n Something went very wrong. You might want to debug.\n\n/ exit /1/ } ##################################################### :good_exit: { # did the user ask for a validation pattern? 
# ( this is the never-match pattern ) match [:validate:] /[^\x00-\xFF]/ isolate (:spamtested:) /0/ isolate (:spampassed:) /0/ isolate (:goodtested:) /0/ isolate (:goodpassed:) /0/ isolate (:tottested:) /0/ isolate (:totpassed:) /0/ isolate (:overall:) /0/ isolate (:stl:) /:*:spamfiles:/ isolate (:gtl:) /:*:goodfiles:/ output /\n Starting validation run, pattern ':*:validate:' \n/ { call /:clip_filename:/ [:gtl:] (:filename:) match [:filename:] /./ { # check - is this file in the validate set ? If YES, proceed match [:filename:] /:*:validate:/ input [:*:gooddir::*:filename: 0 :*:decision_length:] call /:do_mutilations:/ call /:get_pr:/ [:m_text:] # Keep track of our statistics { # eval (:pr:) /:@: 0 - :*:pr: :/ eval (:goodtested:) /:@: :*:goodtested: + 1 :/ eval (:goodpassed:) /:@: :*:goodpassed: + ( :*:pr: > 0 ) :/ } output /\n:*:gooddir::*:filename: G (:*:goodpassed:) :*:pr: / } liaf } { call /:clip_filename:/ [:stl:] (:filename:) match [:filename:] /./ { # check - is this file in the validate set ? 
If YES, proceed match [:filename:] /:*:validate:/ input [:*:spamdir::*:filename: 0 :*:decision_length:] call /:do_mutilations:/ call /:get_pr:/ [:m_text:] # Keep track of our statistics { eval (:pr:) /:@: 0 - :*:pr: :/ eval (:spamtested:) /:@: :*:spamtested: + 1 :/ eval (:spampassed:) /:@: :*:spampassed: + ( :*:pr: > 0 ) :/ } output /\n:*:spamdir::*:filename: G (:*:spampassed:) :*:pr: / } liaf } output /\n Summary of validation on :*:validate::/ alter (:report:) /:*:report: \n\n Summary of validation:\n/ eval (:overall:) /:@: 100 * :*:goodpassed: \/ :*:goodtested: :/ output /\nGood files: :*:goodtested: correct: :*:goodpassed: accuracy: :*:overall:/ alter (:report:) /:*:report: \nGood files: :*:goodtested: correct: :*:goodpassed: accuracy: :*:overall:/ { match [:rfile:] /./ output [:*:rfile:] /\n:*:overall:/ } eval (:overall:) /:@: 100 * :*:spampassed: \/ :*:spamtested: :/ output /\nSpam files: :*:spamtested: correct: :*:spampassed: accuracy: :*:overall:/ alter (:report:) /:*:report: \nSpam files: :*:spamtested: correct: :*:spampassed: accuracy: :*:overall:/ { match [:rfile:] /./ output [:*:rfile:] /\n:*:overall:/ } eval (:tottested:) /:@: :*:goodtested: + :*:spamtested: : / eval (:totpassed:) /:@: :*:goodpassed: + :*:spampassed: : / eval (:overall:) /:@: 100 * (:*:totpassed: \/ :*:tottested: ) :/ output /\nOverall: :*:tottested: correct: :*:totpassed: accuracy: :*:overall:\n/ alter (:report:) /:*:report: \nOverall: :*:tottested: correct: :*:totpassed: accuracy: :*:overall: \n/ { match [:rfile:] /./ output [:*:rfile:] /\n:*:overall:\n/ } } # output /:*:report:/ exit /0/ :nada: return ########################################################### # # Worst training - similar to an SVM, but without the # grace and beauty. We train only the minimal set of # features that gain us the maximal response. # # Algorithm # 1 - train a *single* example into each class. # 2 - evaluate all other examples. 
# 3 - pick the "worst error(s)" in each class # 4 - Are the worst errors close enough? If so, stop # 5 - train those worst errors # 6 - go to 2 :worst_training: ##### Step 1 - train a single example in each class, to get "off center" { match [:gtl:] /([[:print:]]+)\n/ (:wholeline: :filename:) match [:filename:] /./ isolate (:filename:) alter (:wholeline:) // input [:*:gooddir::*:filename: 0 :*:decision_length:] call /:do_mutilations:/ { # if this is a ZERO :pr:, train something call /:get_pr:/ [:m_text:] eval /:@: :*:pr: = 0 :/ output /\n Learning :*:filename: as initial goodmail seed.\n/ learn [:m_text:] <:*:clf:> /:*:lcr:/ \ (:*:fileprefix::*:goodcss:) } } { match [:stl:] /([[:print:]]+)\n/ (:wholeline: :filename:) match [:filename:] /./ isolate (:filename:) alter (:wholeline:) // input [:*:spamdir::*:filename: 0 :*:decision_length:] call /:do_mutilations:/ { # if this is a ZERO :pr:, train something call /:get_pr:/ [:m_text:] eval /:@: :*:pr: = 0 :/ output /\n Learning :*:filename: as initial spam seed.\n/ learn [:m_text:] <:*:clf:> /:*:lcr:/ \ (:*:fileprefix::*:spamcss:) } } output /\n/ # make a little space before the "sputter" display. ##### Step 2 - Test each of the files, and keep track of the :worst: worst ##### error cases. To make our life easy, we just call out to ##### the "sort" utility via syscall. ##### Note that there's no need to do this alternating, because ##### sort()ing will fix order anyway. :worst_loop: # load up the good files and spam files isolate (:gtl:) /:*:goodfiles:/ isolate (:stl:) /:*:spamfiles:/ { alter (:worst_results:) // { call /:clip_filename:/ [:stl:] (:filename:) match [:filename:] /./ { # check- is this file in the validate set ? If no, proceed. 
match [:filename:] /:*:validate:/ input [:*:spamdir::*:filename: 0 :*:decision_length:] call /:do_mutilations:/ call /:get_pr:/ [:m_text:] eval (:pr:) /:@: 0 - :*:pr: f 10.4:/ alter (:worst_results:) \ /:*:worst_results:\n:*:pr: S :*:spamdir::*:filename:/ output \ /\n:*:pr: S :*:spamdir::*:filename:/ } liaf } { call /:clip_filename:/ [:gtl:] (:filename:) match [:filename:] /./ { # check- is this file in the validate set ? If no, proceed. match [:filename:] /:*:validate:/ input [:*:gooddir::*:filename: 0 :*:decision_length:] call /:do_mutilations:/ call /:get_pr:/ [:m_text:] eval (:pr:) /:@: :*:pr: f 10.4 :/ alter (:worst_results:) \ /:*:worst_results:\n:*:pr: G :*:gooddir::*:filename:/ output \ /\n:*:pr: G :*:gooddir::*:filename:/ } liaf } } ##### Step 3 --- Sort the worst results, to get the N worst errors ##### Note that this is a "numeric" sort, and minimum values ##### are "worst" in either direction. { syscall /sort -n / (:*:worst_results:) (:worst_results:) match /([^\n]+\n){1,:*:worst:}/ [:worst_results:] \ (:worst_retrains:) output /\n\n Worst Training Candidates: \n\n/ output /:*:worst_retrains:/ } ##### Step 4 --- Is the worst of the worst good enough? ##### Note that we test here, before we retrain anything. ##### (note the .00001 adder - that's to prevent getting exactly stuck) { match (:pr:) [:worst_retrains:] /[[:graph:]]+/ eval /:@: ( :*:pr: + .00001 ) > :*:thick_threshold: :/ output / \n Looks good... exiting! \n/ goto /:good_exit:/ } ##### Step 5 --- Train only those worst errors ##### Easily done, as the full filename is "coded" into ##### the report, as well as what to train it as. 
{ match (:nextline: :pr: :type: :filename:) [:worst_retrains:] \ /([[:graph:]]+) +([[:graph:]]+) +([[:graph:]]+)\n/ input [:*:filename: 0 :*:decision_length:] call /:do_mutilations:/ isolate (:m_text:) /:*:_dw:/ { { match [:type:] /G/ output /\nConsidering :*:filename: as good\n/ call /:test_train_good:/ } alius { output /\nConsidering :*:filename: as spam\n/ call /:test_train_spam:/ } } alter (:nextline:) // liaf } output /\n/ ###### Step 6 - otherwise, go to 2 ###### goto /:worst_loop:/ ######################################################3 #trap (:broken_program_message:) /.*/ { output /:*:_nl: Aw, crud. mailtrainer.crm broke. Here's the error: :*:_nl:/ output /:*:broken_program_message:/ output [stderr] /:*:_nl: ERROR: mailtrainer.crm broke. Here's the error: :*:_nl:/ output [stderr] /ERROR: :*:broken_program_message:/ } exit /:*:program_fault_exit_code:/ crm114-20100106-BlameMichelson.src/whitelist.mfp.example0000644000000000017500000000010211321154266021140 0ustar rootwsyCRM114 crm114 harvard.edu tufts.edu dartmouth.edu mit.edu rpi.edu crm114-20100106-BlameMichelson.src/crm_str_funcs.c0000644000000000017500000006734611321154266020030 0ustar rootwsy// crm_str_funcs.c - string handling functions // Copyright 2004 Fidelis Assis // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
// include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // strnhash - generate the hash of a string of length N // goals - fast, works well with short vars includng // letter pairs and palindromes, not crypto strong, generates // hashes that tend toward relative primality against common // hash table lengths (so taking the output of this function // modulo the hash table length gives a relatively uniform distribution // // In timing tests, this hash function can hash over 10 megabytes // per second (using as text the full 2.4.9 linux kernel source) // hashing individual whitespace-delimited tokens, on a Transmeta // 666 MHz. // This is a more portable hash function, compatible with the original. // It should return the same value both on 32 and 64 bit architectures. // The return type was changed to unsigned long hashes, and the other // parts of the code updated accordingly. // -- Fidelis // // // unsigned long -> unsigned int, following Bill's idea that int is // likely to be 32 bits. // int32_t -> uint32_t, to get logical >> instead of arithmetic, // and an and and in case some compiler takes the loophole that allows // it not to implement logical right shift on processors that don't // have that instruction. // -- Kurt Hackenberg unsigned int strnhash (char *str, long len) { long i; uint32_t hval; unsigned int tmp; // initialize hval hval = len; // for each character in the incoming text: for ( i = 0; i < len; i++) { // xor in the current byte against each byte of hval // (which alone guarantees that every bit of input will have // an effect on the output) tmp = str[i] & 0xFF; tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24); hval ^= tmp; // add some bits out of the middle as low order bits. 
hval = hval + (( hval >> 12) & 0x0000ffff) ; // swap most and min significative bytes tmp = (hval << 24) | ((hval >> 24) & 0xff); hval &= 0x00ffff00; // zero most and min significative bytes of hval hval |= tmp; // OR with swapped bytes // rotate hval 3 bits to the left (thereby making the // 3rd msb of the above mess the msb of the output hash) hval = (hval << 3) | ((hval >> 29) & 0x7); } return (hval); } //////////////////////////////////////////////////////////////////////////// // // Cached mmap stuff. Adapted from Win32 compatibility code from // Barry Jaspan. Altered to not reveal the difference between a // mapped file pointer and one of Barry's 'map' structs. In this // code (unlike Barry's patches), all that is ever seen are // pointers to memory (i.e. crm_mmap and crm_munmap have the same // API and semantics as with the libc mmap() and munmap() calls), // no structs are ever seen by the callers of this code. // // Bugs in the POSIX code are my fault. Bugs in the CRM_WINDOWS code are // either mine or his. So there. // /////////////////////////////////////////////////////////////////////////// // // This code section (from this line to the line below that states // that it is the end of the dual-licensed code section) is // copyright and owned by William S. Yerazunis. In return for // addition of significant derivative work, Barry Jaspan is hereby // granted a full unlimited license to use this code section, // including license to relicense under other licenses. // //////////////////////////////////////////////////////////////////////////// // An mmap cell. This is how we cache. 
// typedef struct prototype_crm_mmap_cell { char *name; long start; long requested_len; long actual_len; time_t modification_time; // st_mtime - time last modified void *addr; long prot; // prot flags to be used, in the mmap() form // that is, PROT_*, rather than O_* long mode; // Mode is things like MAP_SHARED or MAP_LOCKED int unmap_count; // counter - unmap this after UNMAP_COUNT_MAX struct prototype_crm_mmap_cell *next, *prev; #ifndef CRM_WINDOWS int fd; #else // CRM_WINDOWS HANDLE fd, mapping; #endif // CRM_WINDOWS } CRM_MMAP_CELL; // We want these to hang around but not be visible outside this file. static CRM_MMAP_CELL *cache = NULL; // "volatile" for W32 compile bug ////////////////////////////////////// // // Force an unmap (don't look at the unmap_count, just do it) // Watch out tho- this takes a CRM_MMAP_CELL, not a *ptr, so don't // call it from anywhere except inside this file. // static void crm_unmap_file_internal ( CRM_MMAP_CELL *map) { long munmap_status; #ifndef CRM_WINDOWS if (map->prot & PROT_WRITE) msync (map->addr, map->actual_len, MS_ASYNC | MS_INVALIDATE); munmap_status = munmap (map->addr, map->actual_len); // fprintf (stderr, "Munmap_status is %ld\n", munmap_status); // Because mmap/munmap doesn't set atime, nor set the "modified" // flag, some network filesystems will fail to mark the file as // modified and so their cacheing will make a mistake. // // The fix is that for files that were mmapped writably, to do // a trivial read/write on the mapped file, to force the // filesystem to repropagate it's caches. // if (map->prot & PROT_WRITE) { FEATURE_HEADER_STRUCT foo; lseek (map->fd, 0, SEEK_SET); dontcare = read (map->fd, &foo, sizeof(foo)); lseek (map->fd, 0, SEEK_SET); dontcare = write (map->fd, &foo, sizeof(foo)); } // Although the docs say we can close the fd right after mmap, // while leaving the mmap outstanding even though the fd is closed, // actual testing versus several kernels shows this leads to // broken behavior. 
So, we close here instead. // close (map->fd); // fprintf (stderr, "U"); #else // CRM_WINDOWS FlushViewOfFile(map->addr, 0); UnmapViewOfFile(map->addr); CloseHandle(map->mapping); CloseHandle(map->fd); #endif } ///////////////////////////////////////////////////// // // Hard-unmap by filename. Do this ONLY if you // have changed the file by some means outside of // the mmap system (i.e. by writing via fopen/fwrite/fclose). // void crm_force_munmap_filename (char *filename) { CRM_MMAP_CELL *p; // Search for the file - if it's already mmaped, unmap it. // Note that this is a while loop and traverses the list. for (p = cache; p != NULL; p = p->next) { if (strcmp(p->name, filename) == 0) { // found it... force an munmap. crm_force_munmap_addr (p->addr); //This was commented out and I uncommented it. //I'm not sure why it was commented out, //but it was definitely creating a seg fault //during some testing. I hope that's ok. -JB break; // because p may be clobbered during unmap. } } } ////////////////////////////////////////////////////// // // Hard-unmap by address. Do this ONLY if you // have changed the file by some means outside of // the mmap system (i.e. by writing via fopen/fwrite/fclose). // void crm_force_munmap_addr (void *addr) { CRM_MMAP_CELL *p; // step 1- search the mmap cache to see if we actually have this // mmapped // p = cache; while ( p != NULL && p->addr != addr) p = p->next; if ( ! p ) { nonfatalerror5 ("Internal fault - this code has tried to force unmap memory " "that it never mapped in the first place. ", "Please file a bug report. ", CRM_ENGINE_HERE); return; } // Step 2: we have the mmap cell of interest. Mark it for real unmapping. // p->unmap_count = UNMAP_COUNT_MAX + 1; // Step 3: use the standard munmap to complete the unmapping crm_munmap_file (addr); return; } ////////////////////////////////////////////////////// // // This is the wrapper around the "traditional" file unmap, but // does cacheing. 
It keeps count of unmappings and only unmaps // when it needs to. // void crm_munmap_file (void *addr) { CRM_MMAP_CELL *p; struct stat statbuf; // step 1- search the mmap cache to see if we actually have this // mmapped // p = cache; while ( p != NULL && p->addr != addr) p = p->next; if ( ! p ) { nonfatalerror5 ("Internal fault - this code has tried to unmap memory " "that either was never mapped in the first place, or " "has already been unmapped. ", "Please file a bug report. ", CRM_ENGINE_HERE); return; } // Step 2: we have the mmap cell of interest. Do the right thing. // p->unmap_count = (p->unmap_count) + 1; if (p->unmap_count > UNMAP_COUNT_MAX) { crm_unmap_file_internal (p); // // File now unmapped, take the mmap_cell out of the cache // list as well. // if (p->prev != NULL) p->prev->next = p->next; else cache = p->next; if (p->next != NULL) p->next->prev = p->prev; free(p->name); free(p); } else { if (p->prot & PROT_WRITE) { #ifndef CRM_WINDOWS msync (p->addr, p->actual_len, MS_ASYNC | MS_INVALIDATE); stat(p->name, &statbuf); //Since WE did this update, update the modification time //What we have in memory is still correct! -JB p->modification_time = statbuf.st_mtime; #else // CRM_WINDOWS //unmap our view of the file, which will lazily write any //changes back to the file UnmapViewOfFile(p->addr); //and remap so we still have it open p->addr = MapViewOfFile(p->mapping, (p->mode & MAP_PRIVATE)?FILE_MAP_COPY:((p->prot & PROT_WRITE)?FILE_MAP_WRITE:FILE_MAP_READ), 0, 0, 0); //if the remap failed for some reason, just free everything // and get rid of this cached mmap entry. 
if (p->addr == NULL) { CloseHandle(p->mapping); CloseHandle(p->fd); if (p->prev != NULL) p->prev->next = p->next; else cache = p->next; if (p->next != NULL) p->next->prev = p->prev; free(p->name); free(p); } #endif } } } ///////////////////////////////////////////////////////// // // Force an Unmap on every mmapped memory area we know about void crm_munmap_all() { while (cache != NULL) { cache->unmap_count = UNMAP_COUNT_MAX + 1; crm_munmap_file (cache->addr); } } ////////////////////////////////////////////////////////// // // MMap a file in (or get the map from the cache, if possible) // (length is how many bytes to get mapped, remember!) // // prot flags are in the mmap() format - that is, PROT_, not O_ like open. // If you want the full file, pass -1 as requested_len, the result is in // actual_len. void *crm_mmap_file (char *filename, long start, long requested_len, long prot, long mode, long *actual_len) { CRM_MMAP_CELL *p; long pagesize = 0, k; struct stat statbuf; #ifndef CRM_WINDOWS mode_t open_flags; #else // CRM_WINDOWS DWORD open_flags = 0; DWORD createmap_flags = 0; DWORD openmap_flags = 0; #endif // CRM_WINDOWS pagesize = 0; // Search for the file - if it's already mmaped, just return it. for (p = cache; p != NULL; p = p->next) { if (strcmp(p->name, filename) == 0 && p->prot == prot && p->mode == mode && p->start == start && p->requested_len == requested_len) { // check the mtime; if this differs between cache and stat // val, then someone outside our process has played with the // file and we need to unmap it and remap it again. int k; struct stat statbuf; k = stat (filename, &statbuf); if (k != 0 || p->modification_time < statbuf.st_mtime) { // yep, someone played with it. unmap and remap crm_force_munmap_filename (filename); } else { // nope, it looks clean. We'll reuse it. 
if (actual_len) *actual_len = p->actual_len; return (p->addr); } } } // No luck - we couldn't find the matching file/start/len/prot/mode // We need to add an mmap cache cell, and mmap the file. // p = (void *) malloc( sizeof ( CRM_MMAP_CELL) ); if (p == NULL) { untrappableerror5(" Unable to malloc enough memory for mmap cache. ", " This is unrecoverable. Sorry.", CRM_ENGINE_HERE); return MAP_FAILED; } p->name = strdup(filename); p->start = start; p->requested_len = requested_len; p->prot = prot; p->mode = mode; #ifndef CRM_WINDOWS open_flags = O_RDWR; if ( ! (p->prot & PROT_WRITE) && (p->prot & PROT_READ) ) open_flags = O_RDONLY; if ( (p->prot & PROT_WRITE) && !(p->prot & PROT_READ)) open_flags = O_WRONLY; if (internal_trace) fprintf (stderr, "MMAP file open mode: %ld\n", (long) open_flags); //I changed all this so that the modification time would be //correct. -JB k = stat (p->name, &statbuf); if ( k != 0 ) { free (p->name); free (p); if (actual_len) *actual_len = 0; return (MAP_FAILED); } if (user_trace) fprintf (stderr, "MMAPping file %s for direct memory access.\n", filename); p->fd = open (filename, open_flags); if (p->fd < 0) { close (p->fd); free(p->name); free(p); if (actual_len) *actual_len = 0; return MAP_FAILED; } p->actual_len = p->requested_len; // If we didn't get a length, fill in the max possible length via statbuf if (p->actual_len < 0) p->actual_len = statbuf.st_size - p->start; p->addr = mmap (NULL, p->actual_len, p->prot, p->mode, p->fd, p->start); //We want the modification time to be AFTER the mmap since that //could change it (I assume) if we have a PROT_WRITE. So we need //to stat the file again k = stat (p->name, &statbuf); p->modification_time = statbuf.st_mtime; //fprintf (stderr, "M"); // we can't close the fd now (the docs say yes, testing says no, // we need to wait till we're really done with the mmap.) 
//close(p->fd); if (p->addr == MAP_FAILED) { close (p->fd); free(p->name); free(p); if (actual_len) *actual_len = 0; return MAP_FAILED; } #else // CRM_WINDOWS if (p->mode & MAP_PRIVATE) { open_flags = GENERIC_READ; createmap_flags = PAGE_WRITECOPY; openmap_flags = FILE_MAP_COPY; } else { if (p->prot & PROT_WRITE) { open_flags = GENERIC_WRITE; createmap_flags = PAGE_READWRITE; openmap_flags = FILE_MAP_WRITE; } if (p->prot & PROT_READ) { open_flags |= GENERIC_READ; if (!(p->prot & PROT_WRITE)) { createmap_flags = PAGE_READONLY; openmap_flags = FILE_MAP_READ; } } } if (internal_trace) fprintf (stderr, "MMAP file open mode: %ld\n", (long) open_flags); //GROT GROT GROT // this section was wrong under non-windows and the result was that // the modification time was messed up. I don't change code I can't // test, but someone with windows should fix this. Specifically, I // see no place the modification time is update, which seems like a // bug. -JB // If we need to, we stat the file. if (p->requested_len < 0) { long k; k = stat (p->name, &statbuf); if (k != 0) { free (p->name); free (p); if (actual_len) *actual_len = 0; return (MAP_FAILED); }; }; if (user_trace) fprintf (stderr, "MMAPping file %s for direct memory access.\n", filename); p->fd = CreateFile(filename, open_flags, 0, NULL, OPEN_EXISTING, 0, NULL); if (p->fd == INVALID_HANDLE_VALUE) { free(p->name); free(p); return NULL; } p->actual_len = p->requested_len; if (p->actual_len < 0) p->actual_len = statbuf.st_size - p->start; p->mapping = CreateFileMapping(p->fd, NULL, createmap_flags, 0, requested_len, NULL); if (p->mapping == NULL) { CloseHandle(p->fd); free(p->name); free(p); return NULL; } p->addr = MapViewOfFile(p->mapping, openmap_flags, 0, 0, 0); if (p->addr == NULL) { CloseHandle(p->mapping); CloseHandle(p->fd); free(p->name); free(p); return NULL; } { SYSTEM_INFO info; GetSystemInfo(&info); pagesize = info.dwPageSize; } // Jaspan-san says force-loading every page is a good thing // under Windows. 
I know it's a bad thing under Linux, // so we'll only do it under Windows. { char one_byte; char *addr = (char *) p->addr; long i; for (i = 0; i < p->actual_len; i += pagesize) one_byte = addr[i]; } #endif // CRM_WINDOWS // If the caller asked for the length to be passed back, pass it. if (actual_len) *actual_len = p->actual_len; // Now, insert this fresh mmap into the cache list // p->unmap_count = 0; p->prev = NULL; p->next = cache; if (cache != NULL) cache->prev = p; cache = p; return p->addr; } /////////////////////////////////////////////////////////////////////// // // End of section of code dual-licensed to Yerazunis and Jaspan // /////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////// // // strntrn - translate characters of a string. // // Original spec by Bill Yerazunis, original code by Raul Miller, // recode for CRM114 use by Bill Yerazunis. // // This code section (crm_strntrn and subsidiary routines) is // dual-licensed to both William S. Yerazunis and Raul Miller, // including the right to reuse this code in any way desired, // including the right to relicense it under any other terms as // desired. // ////////////////////////////////////////////////////////////////////// // // We start out with two helper routines - one to invert a string, // and the other to expand string ranges. // ////////////////////////////////////////////////////////////////////// // // Given a string of characters, invert it - that is, the string // that was originally 0x00 to 0xFF but with all characters that // were in the incoming string omitted and the string repacked. // // Returns a pointer to the fresh inversion, or NULL (on error) // // The old string is unharmed. Be careful of it. // // REMEMBER TO FREE() THE RESULT OR ELSE YOU WILL LEAK MEMORY!!! 
unsigned char * crm_strntrn_invert_string (unsigned char *str, long len, long *rlen) { unsigned char *outstr; long i, j; // create our output string space. It will never be more than 256 // characters. It might be less. But we don't care. outstr = malloc (256); // error out if there's a problem with MALLOC if (!outstr) { untrappableerror5 ("Can't allocate memory to invert strings for strntrn", "", CRM_ENGINE_HERE); } // The string of all characters is the inverse of "" (the empty // string), so a mainline string of "^" inverts here to the string // of all characters from 0x00 to 0xff. // // The string "^" (equivalent to total overall string "^^") is the // string of all characters *except* ^; the mainline code suffices // for that situation as well. // // BUT THEN how does one specify the string of a single "^"? Well, // it's NOT of NOT of "NOT" ("^"), so "^^^" in the original, or // "^^" here, is taken as just a literal "^" (one carat character). // if (len == 2 && strncmp ((char *)str, "^^", 2) == 0) { outstr[0] = '^'; *rlen = 1; return (outstr); }; // No such luck. Fill our map with "character present". // fill it with 1's ( :== "character present") // for (i=0; i < 256; i++) outstr[i] = 1; // for each character present in the input string, zero the output string. for (i = 0; i < len; i++) outstr [ str [i]] = 0; // outstr now is a map of the characters that should be present in the // final output string. Since at most this is 1:1 with the map (which may // have zeros) we can just reuse outstr. // for (i = 0, j = 0 ; i < 256; i++) if (outstr[i]) { outstr[j] = i; j++; }; // The final string length is j characters long, in outstr. // Don't forget to free() it later. :-) // printf ("Inversion: '%s' RLEN: %d\n", outstr, *rlen); *rlen = j; return (outstr); } // expand those hyphenated string ranges - input is str, of length len. // We return the new string, and the new length in rlen. 
// unsigned char * crm_strntrn_expand_hyphens(unsigned char *str, long len, long *rlen) { long j, k, adj; unsigned char* r; // How much space do we need for the expanded-hyphens string // (note that the string might be longer than 256 characters, if // the user specified overlapping ranges, either intentionally // or unintentionally. // // On the other hand, if the user used a ^ (invert) as the first // character, then the result is gauranteed to be no longer than // 255 characters. // for (j= 1, adj=0; j < len-1; j++) { if ('-' == str[j]) { adj+= abs(str[j+1]-str[j-1])-2; } } // Get the string length for our expanded strings // *rlen = adj + len; // Get the space for our expanded string. r = malloc ( 1 + *rlen); /* 1 + to avoid empty problems */ if (!r) { untrappableerror5( "Can't allocate memory to expand hyphens for strstrn", "", CRM_ENGINE_HERE); } // Now expand the string, from "str" into "r" // for (j= 0, k=0; j < len; j++) { r[k]= str[j]; // are we in a hyphen expression? Check edge conditions too! if ('-' == str[j] && j > 0 && j < len-1) { // we're in a hyphen expansion if (j && j < len) { int delta; int m = str[j-1]; int n = str[j+1]; int c; // is this an increasing or decreasing range? delta = m < n ? 1 : -1; // run through the hyphen range. if (m != n) { for (c= m+delta; c != n; c+= delta) { r[k++]= (unsigned char) c; }; r[k++]= n; } j+= 1; } } else { // It's not a range, so we just move along. Move along! k++; } }; // fprintf (stderr, "Resulting range string: %s \n", r); // return the char *string. return (r); } // strntrn - translate a string, like tr() but more fun. // This new, improved version not only allows inverted ranges // like 9-0 --> 9876543210 but also negation of strings and literals // // flag of CRM_UNIQUE means "uniquify the incoming string" // // flag of CRM_LITERAL means "don't interpret the alteration string" // so "^" and "-" regain their literal meaning // // The modification is "in place", and datastrlen gets modified. 
// This routine returns a long >=0 strlen on success, // and a negative number on failure. long strntrn ( unsigned char *datastr, long *datastrlen, long maxdatastrlen, unsigned char *fromstr, long fromstrlen, unsigned char *tostr, long tostrlen, long flags) { long len= *datastrlen; long flen, tlen; unsigned char map[256]; unsigned char *from = NULL; unsigned char *to = NULL; long j, k, last; // If tostrlen == 0, we're deleting, except if // ASLO fromstrlen == 0, in which case we're possibly // just uniquing or maybe not even that. // int replace = tostrlen; // Minor optimization - if we're just uniquing, we don't need // to do any of the other stuff. We can just return now. // if (tostrlen == 0 && fromstrlen == 0) { // fprintf (stderr, "Fast exit from strntrn \n"); *datastrlen = len; return (len); }; // If CRM_LITERAL, the strings are ready, otherwise build the // expanded from-string and to-string. // if (CRM_LITERAL & flags) { // Else - we're in literal mode; just copy the // strings. 
from = malloc (fromstrlen); strncpy ( (char *)from, (char *)fromstr, fromstrlen); flen = fromstrlen; to = malloc (tostrlen); strncpy ((char *) to, (char *)tostr, tostrlen); tlen = tostrlen; if (from == NULL || to == NULL) return (-1); } else { // Build the expanded from-string if (fromstr[0] != '^') { from = crm_strntrn_expand_hyphens(fromstr, fromstrlen, &flen); if (!from) return (-1); } else { unsigned char *temp; long templen; temp = crm_strntrn_expand_hyphens(fromstr+1, fromstrlen-1, &templen); if (!temp) return (-1); from = crm_strntrn_invert_string (temp, templen, &flen); if (!from) return (-1); free (temp); }; // Build the expanded to-string // if (tostr[0] != '^') { to = crm_strntrn_expand_hyphens(tostr, tostrlen, &tlen); if (!to) return (-1); } else { unsigned char *temp; long templen; temp = crm_strntrn_expand_hyphens(tostr+1, tostrlen-1, &templen); if (!temp) return (-1); to = crm_strntrn_invert_string (temp, templen, &tlen); if (!to) return (-1); free (temp); }; }; // If we're in mode, squish out any duplicated // characters in the input data first. We can do this as an in-place // scan of the input string, and we always do it if is // specified. // if (CRM_UNIQUE & flags) { unsigned char unique_map [256]; // build the map of the uniqueable characters // for (j = 0; j < 256; j++) unique_map[j] = 1; // all characters are keepers at first... for (j = 0; j < flen; j++) unique_map[from[j]] = 0; // but some need to be uniqued. // If the character has a 0 the unique map, // and it's the same as the prior character, // don't copy it. Just move along. for (j= 0, k= 0, last= -1; j < len; j++) { if (datastr[j] != last || unique_map[datastr[j]] ) { last= datastr[k++]= datastr[j]; }; }; len= k; }; // Minor optimization - if we're just uniquing, we don't need // Build the mapping array // if (replace) { // This is replacement mode (not deletion mode) so we need // to build the character map. We // initialize the map as each character maps to itself. 
// for (j= 0; j < 256; j++) { map[j]= (unsigned char)j; } // go through and mod each character in the from-string to // map into the corresponding character in the to-string // (and start over in to-string if we run out) // for (j= 0, k=0; j < flen; j++) { map[from[j]]= to[k]; // check- did we run out of characters in to-string, so // that we need to start over in to-string? k++; if (k >= tlen) { k= 0; } } // Finally, the map is ready. We go thorugh the // datastring translating one character at a time. // for (j= 0; j < len; j++) { datastr[j]= map[datastr[j]]; } } else { // No, we are not in replace mode, rather we are in delete mode // so the map now says whether we're keeping the character or // deleting the character. for (j= 0; j < 256; j++) { map[j]= 1; } for (j= 0; j < flen; j++) { map[from[j]] = 0; } for (j= 0, k= 0; j < len; j++) { if (map[datastr[j]]) { datastr[k++]= datastr[j]; } } len= k; } // drop the storage that we allocated // free(from); free(to); *datastrlen = len; return (len); } ///////////////////////////////////////////////////////////////// // // END of strntrn code (dual-licensed to both Yerazunis // and Miller // ////////////////////////////////////////////////////////////////// crm114-20100106-BlameMichelson.src/eval_infiniteloop.crm0000755000000000017500000000126511321154266021215 0ustar rootwsy#! /usr/bin/crm # # eval_infiniteloop.com - test that EVAL properly detects blowups # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # stress-out EVAL by an infinitely looping evaluation. This SHOULD # cause an nonfatal error. # window output / \n\n/ output / This test checks to see that EVAL properly detects blowups\n/ output / You should see an error message as the next thing,\n/ output / with a large string of Z's. \n/ { isolate (:e:) /:*/ isolate (:a:) /:*:e::a:Z/ eval (:b:) /:*:a:/ output /:*:a:/ } trap (:my_err:) /*/ { output /We should get an error here, due to the (intentional!) 
loop\n/ output /:*:my_err:/ } crm114-20100106-BlameMichelson.src/CLASSIFY_DETAILS.txt0000644000000000017500000010714211321154266020125 0ustar rootwsy# # classify_details.txt - How CRM114's LEARN and CLASSIFY really work. # # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. How CRM114's LEARN and CLASSIFY really work. This document describes the internal workings of the CRM114 LEARN and CLASSIFY functions. You do _not_ need to know this to use CRM114 effectively; this is to satisfy the curiosity of those who really want deep knowledge of the tools they use. (NOTE: since CRM114 now has multiple classifiers available, please read this whole document. Some of the classifiers are interoperable, and some are not.) The current distribution builds in this set of classifiers. The classifiers are: 1) SBPH Markovian (the default) This is an extension of Bayesian classification, mapping features in the input text into a Markov Random Field. This turns each token in the input into 2^(N-1) features, which gives high accuracy but at high computation and memory cost. (note- you can get plain old Bayesian by specifying the flag < unigram > ) 2) OSB Markovian - This is a version of the Markovian that uses an orthogonal sparse bigram (OSB) feature set, instead of the SBPH features. OSB seems to be neck-and-neck in accuracy versus SBPH but it's considerably faster and uses less memory for the same amount of detail. Because OSB Markovian is a subset of SBPH Markovian, you can "sort of" intermix .css files generated by SBPH Markovian and OSB Markovian, although there will be some loss of accuracy. Fidelis Assis contributed the idea of using OSB instead of full SBPH feature sets and showed that OSB actually had advantages. 3) OSB Winnow - This classifier uses the same feature set as OSB-Markovian but doesn't use a probabalistic estimation at all. Instead, it uses the Winnow algorithm. 
The data files aren't compatible, although a good hacker could probably come up with a way to get an approximate conversion back and forth from the Markovian models. Like Markovian, you can specify < unigram > to get single-word features instead of the OSB feature set; this decreases both disk space used and accuracy. 4) Correlator classification - This classifier doesn't do tokenization at all. Instead, it slides the example and unknown texts across each other and measures the cross-correllation. The final scores go with the square of the run-lengths of matching strings. This matcher is -very- slow... easily 40 to 100x slower than any of the other classifiers. It _will_ work against binary files, though, which none of the other classifiers will. 5) Hyperspatial classification - this experimental classifier tokenizes, but does not use Bayes law at all, nor statistical "clumping". During learning, each example document generates a single point in a 4 billion dimensional hyperspace. The classification algorithm places a light source at each of these points, and measures the sum of the radiant power from all of the light sources of each class. The class yielding the highest radiant power for the unknown document is considered to be the correct class. By default, this uses the OSB feature set, but you can use < unigram > to switch to single-word features (decreases disk usage but costs accuracy). 5) Format of the .css and .cow files and microgrooming - some design notes and how microgrooming works. Here's the details for each classifier: Classifier 1: Markovian The general concept is this: break the incoming text into short phrases of from one to five words each. A phrase can have words in the middle of the phrase skipped (e.g. "BUY ONLINE NOW!!!" is always a bad sign.), and more than one phrase can use the same word. You can't change the order of the words, but you _can_ bridge across newlines, punctuation, etc. Make all the phrases you can make. 
For each phrase you can make, keep track of how many times you see that phrase in both the spam and nonspam categories. When you need to classify some text, make the phrases, and count up how many times all of the phrases appear in the two different categories. The category with the most phrase matches wins. Note that you never have to cross-reference between the two category phrase sets. If a phrase appears in both categories an equal number of times, then both categories get an equal score boost. Since an equal score boost doesn't change which category will win, there's no need to cross-reference category phrase counts. NB: This process is called "sparse binary polynomial hashing" because it uses a set of polynomials to generate a hash-of-hashes; sparse because not all words are represented by nonzero terms, binary because the changing coefficient terms are always either 0 or 1, and a hash because, well, it's a hash. :) Instead of simply comparing raw count scores, we do a Bayesian chain-rule to calculate the probability of "good" versus "evil". (CRM114 actually has no knowledge of spam and nonspam, just two sets of user-defined classes that can be whatever you want to be. This explanation will use 'spam' and 'nonspam', but internally, it's just "these statistics files here" and "those statistics files there") The Bayesian chainrule formula is P(A|S) P(S) P (S|A) = ------------------------- P(A|S) P(S) + P(A|NS) P(NS) which (in words) says: "The NEW chance of spam, given some feature A, is equal to the chance of A given spam times the OLD chance that it's spam, divided by the sum of the chance of A given spam times the old chance it's spam plus the chance of A given nonspam, times the old chance it's nonspam".) We start assuming that the chance of spam is 50/50. We count up the total number of features in the "good" versus "evil" feature .css files. 
We use these counts to normalize the chances of good versus evil features, so if your training sets are mostly "good", it doesn't predispose the filter to think that everything is good. We repeatedy form a feature with the polynomials, check the .css files to see what the counts of that feature are for spam and nonspam, and use the counts to calculate P(A|S) and P(A|NS) [remember, we correct for the fact that we may have different total counts in the spam and nonspam categories]. We also bound P(A|S) and P(A|NS) to prevent any 0.0 or 1.0 probabilities from saturating the system. If you allow even _one_ 0.0 or 1.0 into the chain rule, there's no way for the system to recover even in the face of overwhelming evidence to the contrary. The actual bound in use depends on the total number of counts of the feature A ever encountered, irrespective of their good/evil nature. [additional note: versions from 20030630 to 20031200 used a fairly gentle method to generate the local probabilities from the relative hit counts. From 20031200 onward, this local probability was modified by the number and sequence of the terms of the polynomial. The best model found so far is a set of coefficients that model a Markov chain; polynomials that have a longer chain length (and therefore a closer match) get a significantly higher boost.] Once we have P(A|S) and P(A|NS), we can calculate the new P(S) and P(NS). Then we get the next feature out of the polynomial hash pipeline (each extra word makes 15 features) and repeat until we hit the end of the text. Whichever set has the greater probability wins. We also take multiple files AS A GROUP, so it's as though we added the corresponding hash buckets together for everything on the left of the | and everything on the right. ----- Now, on to the brutish details for the Markovian classifier: In terms of the actual implementation, LEARN and CLASSIFY are pipelined operations. 
The pipeline has these stages (as of the 2002-10-21 version) : 1) Tokenization. The input text is tokenized with the supplied regex (usually [[:graph:]]+ ) into a series of disjointed word tokens. 2) Each word token is hashed separately. The hash used is a "fast hash", not particularly secure, but with reasonably good statistics. 3) Each hash is pushed into the end of a five-stage pipeline. Each value previously pushed moves down one level in the pipeline. 4) The pipeline stages are tapped to supply values H0 through H4 that will be multiplied by the particular polynomial's coefficients. (H4 being the newest value). 5) After each value is pushed into the hash pipeline, the full set of polynomials are evaluated. These polynomials have changed over various releases, but as of 2002-10-23 the coefficients are: poly# \ for: H4 H3 H2 H1 H0 1 0 0 0 0 1 2 0 0 0 3 1 3 0 0 5 0 1 4 0 0 5 3 1 5 0 9 0 0 1 6 0 9 0 3 1 7 0 9 5 0 1 8 0 9 5 3 1 9 17 0 0 0 1 10 17 0 0 3 1 11 17 0 5 0 1 12 17 0 5 3 1 13 17 9 0 0 1 14 17 9 0 3 1 15 17 9 5 0 1 16 17 9 5 3 1 (yes, it's like counting in binary, but the low-order bit is always turned on so that the low order bits in the polynomial result is always affected by all nonzero elements of the hash pipeline. "skipped" words have a coefficient of zero, that zeroes their effect on the output of that polynomial, "skipping" the word) 6) These 16 results (call them "superhashes") reflect all phrases up to length 5 found in the input text. Each is 32 bits long. 7) Each of the .css files is mmapped into virtual memory. The default size of a .css file is one megabyte plus one byte, and each byte of a .css file is used as a single 8-bit unsigned integer. Using the length of the .css file as a modulus, each superhash value maps into a particular byte of the .css file. Each .css file also has a "score", initialized to zero. 
8) if we're LEARNing, we increment the byte at that superhash index in the .css file (being careful to not overflow the bucket limit, so the maximum value is actually something quite smaller than 32 bits) 9) (pre-Nov-2002 versions): if we're CLASSIFYing, we increment the per-.css-file score of that .css file by the number found in that superhash-indexed byte. (post-Oct-2002 versions): if we're CLASSIFYing, instead of just incrementing the per-.CSS file scores, we (a) normalize the relative proportions of the .css files with respect to the total number of features in each .css file, (b) convert the bin values indexed by the superhash to a probability, (c) "clip" the probability values to reasonable values (there is no such thing as "certainty" with a finite sample of an infinite and nonstationary source such as human language), and (d) update the running probability using the Bayesian chain rule formula above. 10) repeat the above pipeline steps for each "word" in the text. 11) The .css file with the larger score (or probability) at the end "wins". There you have it. Previous plynomial sets (using only H0 thorugh H3 of the hash pipeline, with prime-number coefficients) have reached over 99.87% accuracy. The best that the 5-stage pipeline has reached for me is 99.984%, and it averages around 99.95% accuracy over months and months of use. n.b. slight error in edge effects - right now, we don't execute the pipeline polynomial set until the pipeline is full; conversely we stop executing the polynomial set when we run out of tokens. This means that we don't give the first and last few tokens of the email the full treatment; that's a bug that should be rectified. The other side of the problem is that filling and flushing the pipe gives worse results by putting too much emphasis on "zero hash" and too much emphasis on the first and last few words. 
n.b.: Arne's Optimization: If the singleton word (H0 alone) doesn't appear or has a count of 0, then it's useless to check for any further combinations, as you know they can't appear unless H0 also appeared. This speedup gives you about 2x speed improvement. ---More details on the post-Nov-2002 release:--- In releaes after Nov 1 2002, instead of just comparing counts, we do the true Bayesian chain rule to calculate the probabilities of pass versus fail. The bounding limits are first to bound within [ 1/featurecount+2 , 1 - 1/featurecount+2]. and then to add further uncertainty to that bound additionally by a factor of 1/(featurecount+1). We do the chain rule calculation and then we clip the minimum probability to MINDOUBLE, which is host specific but is a VERY small number (on the order of 10^-300 for Intel boxes). This further prevents getting the chain rule stuck in a 0.0 / 1.0 state, from which there is no recovery. Lastly, because of underflow issues, we quickly lose significance in the greater of the two probabilities. For example, 1.0 - (10^-30) is exactly equal to 1.00000; yet 10^-30 is easily achieveable in the first ten lines of text. Therefore, we calculate the chainrule probabilities twice, using P(S) and P(NS) separately, and then use the smaller one to recompute the larger one. Thus, even if there's arithmetic underflow in computing the larger probability, we still retain the full information in the smaller probability. --- Yet More Details - for Post-200310xx Versions ---- During the summer and fall of 2003, I continued experimenting with improvements to SBPH/BCR as described above. It became clear that SBPH/BCR was _very_ good, but that it was still operating within the limits of a linear classifier without hidden levels- e.g. it was a perceptron (with all of the limitations that perceptron-based classifiers have). Luckily, the databases in CRM114 are more than adequate to support a higher-level model than a simple linear perceptron classifier. 
I tested a 5th order Markovian classifier, and found that it was superior to any other classifier I had tried. A Markovian classifier operates on the concept that _patterns_ of words are far more important than individual words. For example, a Bayesian encountering the phrase "the quick brown fox jumped" would have five features: "the", "quick", "brown", "fox", and "jumped". A Sparse Binary Polynomial Hasher would have sixteen features: the the quick the brown the quick brown the fox the quick fox the brown fox the quick brown fox ... and so on. But each of these features would recieve the same weighting in the Bayesian chain rule above. The change to become a Markovian is simple- instead of giving each Sparse Binary Polynomial Hash (SBPH) feature a weight of 1, give each feature a weight corresponding to how long a Markov Chain it matches in either of the archetype texts. A simple way to do this would be to make the weight equal to the number of words matched - in this case the weights would be: the 1 the quick 2 the brown 2 the quick brown 3 the fox 2 the quick fox 3 the brown fox 3 the quick brown fox 4 and indeed, this gives some improvement over standard SBPH. But there is room for further improvement. The filter as stated above is still a linear filter; it cannot learn (or even express!) anything of the form: "A" or "B" but not both This is a basic limit discovered by Minsky and Papert in 1969 and published in _Perceptrons_. In this particular case there is a convenient way to work around this problem. The solution is to make the weights of the terms "superincreasing", such that long Markov chain features have so high a weight that shorter chains are completely overruled. For example, if we wanted to do "A or B but not both" in such a superincreasing filter, the weights: "A" at 1 "B" at 1 "A B" at -4 will give the desired results. 
For convenience in calculation, CRM114 uses the superincreasing weights
defined by the series 2^(2n) - that is,

     the                    1
     the quick              4
     the brown              4
     the quick brown       16
     the fox                4
     the quick fox         16
     the brown fox         16
     the quick brown fox   64

Note that with these weights, a chain of length N can override all
chains of length N-1, N-2, N-3... and so on.

This is particularly satisfying, because the standard .css files
already contain all of the information needed to do this more advanced
calculation.  The file format is not only compatible, it is _identical_
and so users don't have to re-start their training.

This Markovian matching gives a considerable increase in accuracy over
SBPH matching, and almost a factor of 2 improvement over Bayesian
matching.  It is now the default matching system in CRM114 as of
version 200310xx.

--------------------------------------------------------

The OSB Markovian classifier

OSB (Orthogonal Sparse Bigram) is a simplification of SBPH inspired by
Fidelis Assis.  The change is to _omit_ all of the word combinations
that don't have exactly two word tokens in them.  This method has fewer
features, but is often as good as or even better than Markovian in
accuracy.  Because it has fewer features, it needs less space in the
.css files for equal accuracy; because it generates fewer features, it
also runs considerably faster than Markovian.  Other than that, it's
pretty similar.  It's sufficiently similar that OSB and Markovian can
even use each other's .css files (with some decrease in accuracy).
It's not recommended, but it works.

---------------------------------------------------------------

The Winnow classifier

Winnow is a different way of classifying; it doesn't generate
probabilities but rather weights.  The version in CRM114 at this
particular time uses the OSB feature set.  Christian Siefkes, Shalendra
Chhabra, and Laird Breyer did the first hacking on this, then with
Fidelis Assis' OSB feature generator it really took off.
Here's a quick synopsys of the algorithm: 1) Every possible feature, from AAAA to ZZZZZZZ, starts with a weight of 1.000000 (note, we only record weights that _aren't_ 1.000000; so if we don't find a feature in our feature list, we can assume it has a value of 1.0000). 2) To learn, we do these steps in order: - generate all of the OSB features in the example text - delete all duplicate features - if the example is an example "in the class", multiply every found feature's weight by the "Promotion Constant", which is empirically set at 1.23 - if the example is a text that is NOT supposed to be "in the class", we multiply each found feature's weight by the "Demotion Constant", which is empirically set at .83 (note that no matter how many times a feature appears in a particular example, it only gets promoted or demoted ONCE). 3) To classify, we do these steps in order: - generate all of the OSB features in the unknown text - delete all duplicate features - add up all of the weights of these features in each of the statistics files. (don't forget that any feature that doesn't exist in the stats file gets a default value of 1.00000 !) - The score of each file is the total weight accumulated by the per-features, divided by the total number of features. (note that since not-seen-before features score 1.0000, a totally inconclusive score is Nfeatures/Nfeatures = 1.0000) - The file with the highest score wins. Winnow works best when you add a "Thickness factor" correction, where you train not just on error, but rather in this less subtle way: If the _correct_ class didn't score at least "Thickness" above the decision threshold (in pR, the decision threshold is 0.0) then train the _correct_ class with the example text in correct (promotion) mode. If the _incorrect_ class didn't score at least "Thickness" below the decision threshold ( again, in pR units), then train the incorrect class in error (demotion) mode. This is done with the < refute > flag. 
Winnow is a well-known classification algorithm in pattern recognition;
the current implementation will probably be upgraded and debugged in
newer releases.

----------------------------------------------------------------

The Correlator classifier

The correlator classifier is different!  The correlator classifier
slides the window of the unknown text across the known texts, and
counts places where the same character appears in both... well,
actually, it counts the sum of the squares of the runlengths of the
matching strings, reiterated at each point in the string.  If the
letters don't match, nothing is counted.

So, "The quick brown fox jumped over the lazy dog's back 0123456789",
matched against "cat", will get just three points- one for the c in
back matching the c in cat, and two for the a in cat matching the a's
in lazy and back.  (note that the T in The doesn't match the t in cat,
because they're different cases).  However, "lawn fog" will match the
five-character sequence "wn fo" giving 1 + 4 + 9 + 16 + 25 = 55 points.

Note that EVERY POSSIBLE substring in the unknown text is compared
against the known texts.  This is Markovian with a major vengeance
streak (or death wish, if you don't have lots of CPU and CPU cooling to
spare.  > 100x slowdown is entirely possible with this correlation
classifier; consider yourself warned.).

The databases of correlator classifiers are NOT compatible with the
.css files of SBPH, OSB, Markovian, and Winnow classification.  Don't
even think of intermixing them.

--------------------------------------------------------------

Update to versions Post June 27, 2005 - the Hyperspace Classifier

The hyperspatial classifier is a new classifier built into CRM114; at
this writing it's more experimental than not.  However, it shows
_extremely_ good statistics and uses very little CPU and disk space, so
we're putting it out there for people to play with.
Like the other CRM114 classifiers, the hyperspace classifier is "activated" by an option flag (obviously, it's ); this makes it easy to compare hyperspatial results with more conventional methods without changing your testing framework (you can freely swap around , , etc. Just don't intermix the storage file types, most are not interchangeable). Most statistical classifiers combine statistics of a large number of example documents into a single class. The hyperspatial classifier doesn't - it considers each document to be a single data point in a very-high-dimensional hyperspace. Thus, each document retains it's individual identity; there is no mixing of features between documents. This is both a strength (hyperspatial classification is basically immune to word-salad spam) and a weakness (hyperspatial classification generalizes between document types more weakly than other classifiers). The current implementation of hyperspatial classification uses an intercepted-power decision algorithm- a light source is placed at each learned data point in the N-dimensional hyperspace. These light sources illuminate the hyperspace location of the unknown document, with some total power, called the total radiance. The class with the greatest total radiance at the unknown document's location in hyperspace is considered to be the "correct" class. By analogy - consider each known document to be a galaxy of stars; when viewed from the hyperspace position of an unknown document, the class with the brighter galaxies wins. Note that because radiance drops off with the inverse square of distance, proximity of two documents in hyperspace (that is, similarity of the two documents) is a very strong indicator of class membership. This is quite different from the linear separators such as Bayesian or Markovian classification. 
The hyperspace classifier does not create a linear border like a Bayesian classifier based on weights, nor like an SVM where the data is projected into a higer-dimensional space and then a linear separation plane is calculated to maximize the error margins. Instead, hyperspace classification uses 1/r^2 as the weighting function and the dividing surface between classes can become highly convoluted. The result is similar to a Voronoi diagram except that points that are distant from the surface still exert a small amount of control. The hyperspatial classification method can "learn" complex classification spaces that simply cannot be learned by linear classifiers or statistical classifiers, and perhaps not by SVMs. For example, no linear classifier can "learn" a checkerboard situation (an XOR is a 2x2 checkerboard). An SVM classifier can only learn such a situation if the original feature space can be mapped to some higher dimensional feature space where all of the squares of one color map to one side of a hyperplane and squares of the other color map to the other side of the hyperplane. In contrast, the hyperspatial classifier algorithm will learn such a highly nonlinear problem like a checkerboard in O(n) examples, where n is the number of squares on the checkerboard. Convergence is believed to be assured by the 1/r^2 falloff. --- Implementation Details --- The actual location in hyperspace of each document is determined by the features it contains. More precisely, each of the 2^32 dimensions of a document is determined by the following rule: "The coordinate of a document in a particular dimension of the hyperspace is equal to the number of times a feature appears in that document such that the feature hashes to the index of that hyperspace dimension." 
Because most documents contain far fewer than 2^32 features (the 'features' being obtained via the OSB algorithm), the most compact representation for a document's hyperspace coordinate is just the list of nonzero dimensions. Because most nonzero dimensions are of value 1, we dont even include the distance along that dimension. In the rare circumstance that more than one feature hashes to the same 32-bit hyperspace dimension index, the distance of 2 along that dimension is represented by having two copies of that dimension index in the dimension list; a distance of three being represented by three copies, and so on. The hashed feature values of the unknown document are sorted after generation; the known document feature hash values are stored in pre-sorted form. This means a single two-index pass, similar to a merge-sort, can quickly find all features that existed only in one document, that existed only in the other document, or existed in both documents. The distances between known and unknown documents are then calculated based on the counts of mutually present and mutually disjoint features found. The actual formula for distance used is: found_in_both ^ 2 distance = SQRT ( -------------------------------------------) found_only_in_known * found_only_in_unknown The radiance recieved by the unknown document is then calculated by the standard inverse-square law for radiant energy versus distance on a per-document basis (in the current version, each document has the same source energy emitted, but this is probably suboptimal and is a topic under active research): total radiance of class = SUM ( source_energy[i] / distance_to_source ^ 2 [i]) [ i = over all sources] This gives a total radiance for each of the N classes being considered for classification. The "winning" class is the class with the highest total radiance. Because every document that has ever been learned must be queried, this sounds like an expensive computation. 
The actual reality is that it's very fast to compute. The current implementation ( nasty, brutish, and straight C code) does no optimizations beyond sorting the features in each document mentioned above. With this minimal optimization, training with a thick threshold of pR of 0.5 the hyperspatial classifier runs more than four times as fast as OSBF or OSB, and more than ten times faster than Markovian. (the SpamAssassin test set of 4147 messages is fully processed in as little as 2 minutes 15 seconds- that's 32 milliseconds per message, versus Markovian at 25+ minutes per test set, both on a Transmeta 990 MHz laptop) The disk space required for hyperspace's data storage is on the order of 300 Kbytes per class (compared to 12 MEGAbytes for Markovian and Winnow, and 4 megabytes for OSB or OSBF), and the accuracy is about twice as good as Markovian or OSB. It seems to be superior in accuracy to everything except possibly OSBF (with which it's comparable) but unlike OSBF, the hyperspatial classifier seems to always converge. As a further improvement, we can greatly increase the speed of the current system by changing the storage of known documents from the simple array currently used (with NULL-bucket markers to indicate the end of each document) to a hash-based lookup. Each hashbucket contains the 32-bit hash of the feature and the 32-bit identifier of the known-class document that contained it; multiple documents containing the same feature consume multiple buckets. Embedded in the table are also hashed buckets containing the total feature count of eack known document (we can steal hash codes x00000000 through x0000ffff for these document-generic data buckets). In this way, we will make only references into the hash table corresponding to the actual features found; features not in the unknown document require zero memory cycles and zero computation. 
This improvement is not yet in the current code, and it may in fact not optimize for speed because the current sequential pass is highly coherent in the CPU cache and hash-based fetching is highly non-coherent. Another alternative to be tested is a tree-based lookup to try to keep greater cache locality. As a third improvement (which is portable to other classifiers doing feature lookup of any type) is to generate the document's features in one phase, then sort those features in the next phase in such a way as to maximize cache coherence (ideally, this means knowing the actual layout of the backing storage system) and then performing the actual feature database lookups in a third phase. This may have a significant impact on the overall system speed. ---- Update - August 2007 - the Bit-Entropy classifier ----- Last year at TREC Andrej Bratko showed off a new classifier technology based on optimal compression algorithms. The basic concept is that you train text compressors on each class of the known texts, and then use those compressors to compress the unknown text. A closer match between the unknown text and the compressor's pretraining and yields a higher compression rate (and a shorter output); the unknown text is judged to belong to the class with the shortest output stream. Bratko's original system is closed source and based on character-at-a-time compression, and used large amounts of memory - typically multiple 0.5 gigabyte chunks. The CRM114 implementation is totally new code and instead works on the "bit at a time" principle; each bit in the incoming message is treated as a separate symbol for the purpose of classification. Thus, there is no need for a tokenization or feature generation step; the features are the incoming stream of zeroes and ones. The result is a classifier that is comparable in speed to OSB but needs no tokenization; the downside is that it has a fairly large memory footprint. 
(36 megabytes for a million-node statistics file, versus 6 megabytes for an OSB system.). Additionally, the creation of an optimal incrementally-taught compression algorithm is nontrivial. Typical optimal compression is done with a Markov model, however the reader has to be aware that this is not the same as the "Markov" classifier described above (which is better described as a Markov Random Field representing a hidden Markov model of the text, rather than a bit-serial Markov chain as used for bit-entropy classification). There are two methods used to construct the bit-entropy Markov chain: one where an initial assumption of the chain is made, and one where it isn't made. The default configuration is to assume that the bit-entropy Markov chain is representable by a toroid, of some reasonable shape, with the interconnections done in a reasonable way. The default in CRM114 as of 20070101 is to form an array of 256 rows and 256 columns of nodes (total 64,000 nodes), and connect them into a toroid by a "perfect shuffle"- that is, the next state for a zero bit is the next column, row index -------------------------------------------------------------------- Data File Formats The format of a SBPH or OSB Markovian .css file (and, for winnow a .cow file) is a 64-bit hash of a feature (whether the feature is a single word, a bigram, or a full SBPH does not matter) and a 32-bit representation of the value. In .css files, the 32 bits is an unsigned integer showing the number of occurrences of this particular feature in the training set; in .cow files it's a 32-bit floating point weight; greater than 1.000 means "preponderance of evidence in favor", less than 1.000 means "preponderance of evidence against", thus a value of 1.000 exactly means "no information" (and in the case of a .cow file, like a .css file, 0.000 exactly, with all 64 words of hash == 0, means "unused slot"). 
For fast access, the first 32 bits of the hash ( called h1 in the code) is used as an index (modulo the length of the .css/.cow file) and that's the preferred slot location to put this data. If that slot is already in use, the next slot is used. If that is already in use, the _next next_ slot is used... and so on. This "next slot not yet used" is an overflow chain. For best performance, you want to keep the chains short. But that wastes a lot of space. 90-95 per cent utilization is a good compromise. Note that the time-to-find a slot (or find it's not there) goes with the length of the overflow chains- so long chains are _very_ bad for performance. I usually set a limit of 256 or even 128 on chains. Once you go past that limit, you need to start expiring old data out of the chain. You can do that by zapping out low-valued (not very significant) slots, but that means old, stale, but originally high-valued slots never expire. Another method would be to use an LRS (Least Recently Seen) tracking system, but that would use up a lot more disk space for the .css/.cow files- almost doubling it is the best estimate I have. "Microgrooming" adds a random factor. A feature is microgroomed if it's hash is equal to a pseudorandom number - and the microgrooming is merely a _lessening_ of the significance. If the significance of a slot drops below "saw it once", the slot is reclaimed for reuse. Note also we don't groom the whole .css/.cow file. We groom _only_ the chain that we noticed was too long. This minimizes how much data we lose in a microgroom (face it, database grooming/expiring is brain surgery with a butter knife... microgrooming is just using the serrations on the edge to minimize how much we scrape away). Note that this works pretty well- most of the slots ever used in a .css/.cow file contain only a single occurrence, so reclaiming a small fraction of them (currently 1 in 16, scattered randomly) is a good compromise. 
It also will eventually expire out even the largest feature if that feature is not ever retrained. (and the killer bug? Well, consider how we know we've reached end-of-chain. We see a zeroed slot. Microgrooming puts in a number of zeroed slots - each of which is seen as a chain terminator. BUT- when we microgroom, we need to re-check the locations of each slot's worth of data, to make sure it's findable - that is, it isn't separated from it's optimal location by a freshly zeroed slot (which would indicate end-of-chain). This is "repacking" the chain. And the code that did it had a bug that repacked only the first part of the chain and then stopped. This meant that the tail of the chain (avg 50% or so) could NOT be found- the data there was lost! This bug has now been (hopefully) stomped. -Bill Yerazunis crm114-20100106-BlameMichelson.src/crm_svm_quad_prog.h0000644000000000017500000000510111321154266020652 0ustar rootwsy// crm_svm_quad_prog.h - Support Vector Machine //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. #ifndef __CRM_SVM_QUAD_PROG__H #define __CRM_SVM_QUAD_PROG__H #include "crm_svm_matrix_util.h" #include "crm_svm_matrix.h" extern int MATR_DEBUG_MODE; //debugging mode. see crm_svm_matrix_util.h for //possible values. 
int QP_DEBUG_MODE; #define QP_DEBUG 2 //basic information about the qp solver #define QP_DEBUG_LOOP 3 //prints some information during each qp loop //useful if the svm is getting stuck during a QP //problem #define QP_LINEAR_SOLVER 4 //prints some information during each cg loop //useful to discover if the run goes on forever //because the cg isn't converging //(usually indicates a bug in the add or remove //constraint functions!) #define QP_CONSTRAINTS 5 //prints out information about adding and //removing constraints during the qp solver //the accuracy to which we run conjugate_gradient //this should be a pretty small number!!! #define QP_LINEAR_ACCURACY 1e-10 //we should never exceed this but here just in case #define QP_MAX_ITERATIONS 1000 void run_qp(Matrix *G, Matrix *A, Vector *c, Vector *b, Vector *x); void add_constraint(int toadd, Matrix *A, Matrix *Q, Matrix *R, Matrix **Z_ptr); void delete_constraint(int toad, Matrix *A, Matrix *Q, Matrix *R, Matrix **Z_ptr); int compute_lambda(Matrix *R, Matrix *Q, Vector *g); void back_sub(Matrix *U, Vector *b, Vector *ret); double find_direction(Matrix *Z, Matrix *G, Vector *g, Vector *p); void conjugate_gradient(Matrix **A, int nmatrices, int maxrows, Vector *b, Vector *x); void gradient_descent(Matrix **A, int nmatrices, int maxrows, Vector *v, Vector *x); void run_linear(Matrix *A, Vector *b, Vector *x); //int main(int argc, char **argv); #endif //crm_svm_quad_prog.h crm114-20100106-BlameMichelson.src/beeptest.crm0000755000000000017500000000041011321154266017311 0ustar rootwsy#! /usr/bin/crm # # beelptest.crm - test syscall printf beep # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. { window output /:*:_nl: CRM114: test syscall 'printf beep' :*:_nl:/ syscall /printf '\a' > \/dev\/tty / } crm114-20100106-BlameMichelson.src/matchtest.crm0000755000000000017500000002055311321154266017504 0ustar rootwsy#! 
/usr/bin/crm # # matchtest.crm - test matching functionality of CRM114 # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # this program exercises the matching functionality of crm. It expects # an input that may contain foo*, bar*, and possibly START baz END window output /:*:_nl: CRM114 testing match functionality :*:_nl:/ { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing exact match on foo.../ match <> (:x:) /foo/ output / found exact match on ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing exact match on foo.../ match <> (:x:) /foo/ output / found exact match on ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing absent match on foo.../ match () /foo/ output / match says "no foo found"./ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing absent match on foo.../ match () /foo/ output / match says "no foo found"./ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing nocase match on foo.../ match (:x:) /foo/ output / found a nocase match on ':*:x:' / } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing nocase match on foo.../ match (:x:) /foo/ output / found a nocase match on ':*:x:' / } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing nocase absent match on foo.../ match /foo/ output / match says "no foo found"./ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing nocase absent match on foo.../ match /foo/ output / match says "no foo found"./ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing multiline match on foo.../ match <> (:x:) /.*foo.*/ output / found an allowed multiline match on ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing multiline match on foo.../ match <> (:x:) /.*foo.*/ output / found an allowed multiline match on ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing nomultiline match on foo.../ match (:x:) /.*foo.*/ output / found a nomultiline match on ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing nomultiline match on foo.../ match (:x:) /.*foo.*/ output / found a nomultiline match on ':*:x:'/ } { window /ZZZ/ /ZZZ/ 
output /:*:_nl:Testing fromendchar match for foo then bar.../ match () /foo/ output /... found the foo.../ match () /bar/ output /then bar / } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing fromendchar match for foo then bar.../ match () /foo/ output /... found the foo.../ match () /bar/ output /found a bar after the foo. (text: :*:_dw: ) / } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing fromnext match for f+oo+../ match <> (:x:) /f+oo+/ output / first ':*:x:'../ match (:x:) /f+oo+/ output / found one on ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing fromnext match for f+oo+../ match <> (:x:) /f+oo+/ output / first ':*:x:'../ match (:x:) /f+oo+/ output / found one on ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing newend match for f+oo+../ match <> (:x:) /f+oo+/ output / first ':*:x:'../ match (:x:) /f+oo+/ output / found one on ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing newend match for f+oo+../ match <> (:x:) /f+oo+/ output / first ':*:x:'../ match (:x:) /f+oo+/ output / found one on ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing indirect goto ":twist:" and ":shout:"..../ match <> (:x: :whereto:) /go to (:twist:|:shout:)/ output / got :*:whereto:.../ goto /:*:whereto:/ output / FAILED - shouldn't get here. / FAIL :twist: output / got to TWIST./ FAIL :shout: output / got to SHOUT./ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing indirect goto ":twist:" and ":shout:"..../ match <> (:x: :whereto:) /go to (:twist2:|:shout2:)/ output / got :*:whereto:.../ goto /:*:whereto:/ output / FAILED - shouldn't get here. 
/ FAIL :twist2: output / got to TWIST./ FAIL :shout2: output / got to SHOUT./ } { window /ZZZ/ /ZZZ/ output / :*:_nl:Testing self-supplied-match .../ match (:x: :pat:) /foo(.*)bar/ output / found ':*:pat:' .../ match (:x:) /.*:*:pat:.*/ output / found ':*:x:'/ } { window /ZZZ/ /ZZZ/ output / :*:_nl:Testing self-supplied-match .../ match (:x: :pat:) /foo(.*)bar/ output / found ':*:pat:' .../ match (:x:) /.*:*:pat:.*/ output / found ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing indep start\/end .../ match (:f:) /foo.*foo/ output / found ':*:f:' .../ match (:x:) [:f:] /1/ output / found ':*:x:' .../ match (:b:) /bar.*bar/ output / found ':*:b:'.../ match (:x:) [:b:] /2/ output / found ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing indep start\/end .../ match (:f:) /foo.*foo/ output / found ':*:f:' .../ match (:x:) [:f:] /1/ output / found ':*:x:' .../ match (:b:) /bar.*bar/ output / found ':*:b:'.../ match (:x:) [:b:] /2/ output / found ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing indep start\/end .../ match (:f:) /foo.*foo/ output / found ':*:f:' .../ match (:x:) [:f:] /1/ output / found ':*:x:' .../ match (:b:) /bar.*bar/ output / found ':*:b:'.../ match (:x:) [:b:] /2/ output / found ':*:x:'/ } { window /ZZZ/ /ZZZ/ output /:*:_nl:Testing indep start\/end .../ match (:f:) /foo.*foo/ output / found ':*:f:' .../ match (:x:) [:f:] /1/ output / found ':*:x:' .../ match (:b:) /bar.*bar/ output / found ':*:b:'.../ match (:x:) [:b:] /2/ output / found ':*:x:'/ } { alter (:_dw:) /Test text >correct< results hopefully/ output /\nTesting box region control\n/ match [:_dw: 11 7] /.*/ (:x:) output /got 10-17 as this: :*:x: (should be 'correct', no angles)\n/ } { alter (:_dw:) /Test text >correct< results hopefully/ output /\nTesting box region control, part 2\n/ match [:_dw: 11 7] /r.*c/ (:x:) output /got 10-17 as this: :*:x: (should be 'rrec')\n/ } { isolate (:z:) /Test text >correct< results hopefully without problems/ output /\nTesting box region 
control, isolated variable\n/ match [:z: 11 7] /r.*c/ (:x:) output /got 10-17 as this: :*:x: (should be 'rrec')\n/ match [:z:] /e..../ (:y:) output /Fromend match: :*:y: (should be 'esult')\n/ } { alter (:_dw:) /Test text >correct< results hopefully/ output /\nTesting box region indexed control\n/ match [:_dw: 11 7] /.*/ (:x:) output /got this: ':*:x:' (should be 'correct', no angles)\n/ } { alter (:_dw:) /Test text >correct< results hopefully/ output /\nTesting box region regex control\n/ match [:_dw: />.*correct<', with angles)\n/ } { alter (:_dw:) /Test text >correct< results hopefully/ output /\nTesting box region failing regex control\n/ match [:_dw: />abcdefgcorrect< results hopefully/ output /\nTesting box region regex\/index control\n/ match [:_dw: />.*correct< results hopefully/ output /\nTesting box region index\/regex control\n/ match [:_dw: 5 20 />.*correct<', with angles)\n/ } { alter (:_dw:) /Test text >correct< results hopefully/ output /\nTesting box region index\/index control\n/ match [:_dw: 6 20 5 7 ] /.*/ (:x:) output /got this: ':*:x:' (should be 'correct', no angles)\n/ } { alter (:_dw:) /Test text > correct < results hopefully/ output /\nTesting box region regex with spaces control\n/ match [:_dw: /> (?:[a-z])+ correct <', with angles)\n/ } { alter (:_dw:) /Test text > correct < results hopefully/ output /\nTesting box region regex with spaces control\n/ match [:_dw: /> ((?:[a-z])+) versus binding (the '67' bug)\n / output / The next stmt --- SHOULD--- get an error message! \n / match (:incognita:) /nothinjkljiejfksn;s94jf94msks/ output / Bug! You should never see this line! \n/ } trap (:errtext:) /.*/ { output /Caught the error, fault text was: \n:*:errtext:\n/ } } # output /:*:_nl:------------------END OF TESTING----------------------:*:_nl:/ crm114-20100106-BlameMichelson.src/skudtest.crm0000755000000000017500000000145611321154266017357 0ustar rootwsy#! 
/usr/bin/crm # # skudtest.crm - start/length works regression test # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # # this little regression test by Jskud verifies match start/length works window output /:*:_nl: CRM114 testing that start \/ length works in matches :*:_nl: :*:_nl:/ isolate (:whitelist:) input (:whitelist:) [whitelist.mfp.example] output /DEBUG: whitelist==<<:*:whitelist:>>:*:_nl:/ # match (:waste:) [:whitelist:] /^/ ; # force match so will work { # Grab the next regex -- turn the one-per-line patterns into a regex match (:waste: :whregex:) [:whitelist:] /(.+)/ { output /DEBUG: matched==<<:*:whregex:>>:*:_nl:/ } liaf } output /DEBUG: [TheEnd]:*:_nl:/ crm114-20100106-BlameMichelson.src/KNOWNBUGS.txt0000644000000000017500000000632311321154266017117 0ustar rootwsy# # knownbugs.txt - CRM114 Known Bugs # # Copyright 2001-2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. CRM114 Known Bugs: 1) (fixed) 2) Matcher bug... This bug is in the GNU library, not my code. If you match on a window longer than 20479 characters, AND you have multiline matching enabled, AND the pattern is of the form ".*literalvale", then the match will FAIL even if it should have succeeded. The TRE library does not have this bug. (see "make experimental") (FIXED - default library is now TRE 0.7.4) 3) Matcher bug - this is another bug in the GNU regex library. If you have a pattern of the form ^Q$ where Q is a single character, and you don't specify , then the match will mysteriously fail even if it should have succeeded. The TRE regex library does not have this problem (see "make experimental"). (FIXED - default library is now TRE 0.7.4) 3+) More on bug 3 - it seems that _many_ patterns of the form ^blah$ do not work correctly, including the simple case for a null string of ^$ even if you _do_ specify nomultiline. The TRE regex library does not have this problem either. 
(FIXED - default library is now TRE 0.7.4) 3++) The GNU regex engine considers /./ to match not only the string "a", but also the empty string "". This is in contrast to /../ which does match "aa" but not "a". (FIXED - default library is now TRE 0.7.4) 4) Thinking bug... If you ALTER one variable that contains another variable, the second variable moves as though you inserted/deleted charactes at the START of the first variable, and then overtyped all of the evenly matched characters. This can lead to counterintuitive results; the fix is to ISOLATE any variable that has to hold value across an ALTER operation. (FIXED - default library is now TRE 0.7.4) 5) FIXED AT LONG LAST (version 20040815) Memory leak - if you MATCH to bind a var, then ISOLATE that var, then MATCH it again, the old ISOLATEd usage is _not_ recovered. If you do this enough, you will run out of buffer space and get a FATAL ERROR. The quick workaround is to use two variables- for example, :a: and :isolated_a: as shown here: match :a: isolate (:isolated_a:) /:*:a:/ match (:a:) isolate (:isolated_a:) /:*:a:/ ad infinitum, which will _not_ leak memory. (REAL FIX in by 20060629 - off by one error in line 920 of crm_var_hash_table.c) 6) If you do math, and the output is bigger than a trillion or less than 1 trillionth, the output comes out in E-notation. This is fine -except that you can't _re_use that, because the numerical parser doesn't understand E-notation. Be warned. Or send me a patch! (FIXED - the switchover between fixed and E notation is now a soft set, and the reader DOES understand E-notation) 7) If you malform a math expression (like leave off the closing colon) the last digit of your result gets eaten. e.g.: /:@: 1 + 123 :/ --> 124 (correct) /:@: 1 + 123 / --> 12 (incorrect) Patches appreciated on this one too. ( SEMI-FIXED - you now get an error that says "you didn't end with an ':', which is often an error. However, it still gets the wrong result. ) Let me know if you find any others! 
-Bill Y. crm114-20100106-BlameMichelson.src/bracktest.crm0000755000000000017500000000074311321154266017471 0ustar rootwsy#! /usr/bin/crm # # bracktest.crm - test brackets, escapes, and \# # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window output /:*:_nl: CRM114 testing brackets, escapes and \#'s :*:_nl:/ output /:*:_nl: If you don't see any compiler error messages, that means the test passed. :*:_nl:/ { } # are you annoyed? I am. { { { { } } } } { # these brackets should never be seen.... { and } # nor these { ; } } output /:*:_nl:/ crm114-20100106-BlameMichelson.src/crm_osb_bayes.c0000644000000000017500000013532211321154266017756 0ustar rootwsy// crm_osb_bayes.c - OSB Bayes utilities // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" #define STRIDE 2 // ugly hack for doing stride 2 struct double_feature {unsigned int word[STRIDE];}; // qsort compare function: compare 64-bit feature hashes static int compare_double_features(const void *p0, const void *p1) { const struct double_feature *d0 = (struct double_feature *)p0; const struct double_feature *d1 = (struct double_feature *)p1; int ret; if (d0->word[0] > d1->word[0]) ret = 1; else if (d0->word[0] < d1->word[0]) ret = -1; #if (STRIDE >= 2) else if (d0->word[1] > d1->word[1]) ret = 1; else if (d0->word[1] < d1->word[1]) ret = -1; #endif // (STRIDE >= 2) else ret = 0; return ret; } // // How to learn OSB_Bayes style - in this case, we'll include the single // word terms that may not strictly be necessary. 
// int crm_expr_osb_bayes_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) { // learn the osb_bayes transform spectrum of this input window as // belonging to a particular type. // learn (classname) /word/ // long i, j, k; char ptext[MAX_PATTERN]; // the regex pattern long plen; long osb_bayes_file_length; char htext[MAX_PATTERN]; // the hash file name long hlen; struct stat statbuf; // for statting the hash file long hfsize; // size of the hash file FEATUREBUCKET_STRUCT *hashes; // the text of the hash file // // malloc'ed large array of feature hashes unsigned int *features; long features_out; long textoffset; long sense; long microgroom; long fev; // unsigned long learns_index = 0; unsigned long features_index = 0; char *learnfilename; if (internal_trace) fprintf (stderr, "executing a LEARN\n"); features = (unsigned int *) malloc(OSB_BAYES_MAX_FEATURE_COUNT * STRIDE * sizeof(*features)); if (features == NULL) untrappableerror5("Couldn't allocate features array", "", CRM_ENGINE_HERE); // extract the hash file name crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len); hlen = apb->p1len; hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN); // get the "this is a word" regex crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len); plen = apb->s1len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); // set our cflags, if needed. The defaults are // "case" and "affirm", (both zero valued). // and "microgroom" disabled. sense = +1; if (apb->sflags & CRM_REFUTE) { sense = -sense; if (user_trace) fprintf (stderr, " refuting learning\n"); }; microgroom = 0; if (apb->sflags & CRM_MICROGROOM) { microgroom = 1; if (user_trace) fprintf (stderr, " enabling microgrooming.\n"); }; // // grab the filename, and stat the file // note that neither "stat", "fopen", nor "open" are // fully 8-bit or wchar clean... i = 0; while (htext[i] < 0x021) i++; j = i; while (htext[j] >= 0x021) j++; // filename starts at i, ends at j. 
null terminate it. htext[j] = '\000'; learnfilename = strdup ( &(htext[i] )); // and stat it to get it's length k = stat (&htext[i], &statbuf); // quick check- does the file even exist? if (k != 0) { // file didn't exist... create it FILE *f; if (user_trace) fprintf (stderr, "\nHad to create new CSS file %s\n", &htext[i]); f = fopen (&htext[i], "wb"); if (!f) { fprintf (stderr, "\n Couldn't open your new CSS file %s for writing; errno=%d .\n", &htext[i], errno); if (engine_exit_base != 0) { exit (engine_exit_base + 20); } else exit (EXIT_FAILURE); }; // did we get a value for sparse_spectrum_file_length? osb_bayes_file_length = sparse_spectrum_file_length; if (osb_bayes_file_length == 0 ) { osb_bayes_file_length = DEFAULT_OSB_BAYES_SPARSE_SPECTRUM_FILE_LENGTH; } // put in osb_bayes_file_length entries of NULL for (j = 0; j < osb_bayes_file_length * sizeof ( FEATUREBUCKET_STRUCT); j++) fputc ('\000', f); // fclose (f); // and reset the statbuf to be correct k = stat (&htext[i], &statbuf); }; // hfsize = statbuf.st_size; if (user_trace) fprintf (stderr, "Sparse spectra file %s has length %ld bins\n", &htext[i], hfsize / sizeof (FEATUREBUCKET_STRUCT)); // // map the .css file into memory // hashes = (FEATUREBUCKET_STRUCT *) crm_mmap_file (&(htext[i]), 0, hfsize, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if (hashes == MAP_FAILED) { fev = fatalerror5 ("Couldn't get to the statistic file named: ", &htext[i], CRM_ENGINE_HERE); return (fev); }; // // now set the hfsize to the number of entries, not the number // of bytes total hfsize = hfsize / sizeof ( FEATUREBUCKET_STRUCT ); #ifdef OSB_LEARNCOUNTS // If LEARNCOUNTS is enabled, we normalize with documents-learned. // // We use the reserved h2 == 0 setup for the learncount. // { char* litf = "Learnings in this file"; char* fitf = "Features in this file"; unsigned int h1, hindex; // h1 = strnhash (litf, strlen ( litf )); hindex = h1 % hfsize; if (hashes[hindex].hash != h1) { // initialize the file? 
if (hashes[hindex].hash == 0 && hashes[hindex].key == 0) { hashes[hindex].hash = h1; hashes[hindex].key = 0; hashes[hindex].value = 1; learns_index = hindex; } else { fatalerror5 (" This file should have learncounts, but doesn't!", " The slot is busy, too. It's hosed. Time to die.", CRM_ENGINE_HERE); goto done; }; } else { if (hashes[hindex].key == 0) // the learncount matched. { learns_index = hindex; if (sense > 0) hashes[hindex].value = hashes[hindex].value + sense; else { if (hashes[hindex].value + sense > 0) hashes[hindex].value += sense; else hashes[hindex].value = 0; } if (user_trace) fprintf (stderr, "This file has had %u documents learned!\n", hashes[hindex].value); }; }; h1 = strnhash (fitf, strlen ( fitf )); hindex = h1 % hfsize; if (hashes[hindex].hash != h1) { // initialize the file? if (hashes[hindex].hash == 0 && hashes[hindex].key == 0) { hashes[hindex].hash = h1; hashes[hindex].key = 0; hashes[hindex].value = 1; features_index = hindex; } else { fatalerror5 (" This file should have learncounts, but doesn't!", " The slot is busy, too. It's hosed. Time to die.", CRM_ENGINE_HERE); goto done ; }; } else { if (hashes[hindex].key == 0) // the learncount matched. { features_index = hindex; if (user_trace) fprintf (stderr, "This file has had %u features learned!\n", hashes[hindex].value); }; }; }; #endif // OSB_LEARNCOUNTS textoffset = txtstart; (void)crm_vector_tokenize_selector(apb, txtptr, txtstart, txtlen, ptext, plen, NULL, 0, 0, features, (long)(OSB_BAYES_MAX_FEATURE_COUNT * STRIDE), &features_out, &textoffset); // #if (0) // // Can't count on this. When learning QUICKREF.txt, tokenizer's // // match.eo only goes a few characters into the whitespace at the // // end, doesn't go to end of text. 
// if (textoffset < txtlen) // (void)fatalerror5("Too many input features", // " (text being learned is too big).", // CRM_ENGINE_HERE); // #ifdef if (apb->sflags & CRM_UNIQUE) { // hack: assume stride STRIDE struct double_feature *d = (struct double_feature *)&features[0]; long n_double_features = features_out / STRIDE; qsort(d, n_double_features, sizeof(struct double_feature), compare_double_features); i = 0; // remove successive duplicates for (j = 1; j < n_double_features; j++) if (d[j].word[0] != d[i].word[0] || d[j].word[1] != d[i].word[1]) d[++i] = d[j]; // set new length, possibly shorter if (n_double_features > 0) n_double_features = i + 1; // convert new length to the other form features_out = n_double_features * STRIDE; }; // and the big loop... go through all of the text. // hack: assume crm_vector_tokenize_selector() picked stride STRIDE for (i = 0; i + (STRIDE - 1) < features_out; i += STRIDE) { unsigned int hindex; unsigned int h1, h2; unsigned long incrs; h1 = features[i]; h2 = features[i + 1]; if (h2 == 0) h2 = 0xdeadbeef; hindex = h1 % hfsize; // // we now look at both the primary (h1) and // crosscut (h2) indexes to see if we've got // the right bucket or if we need to look further // incrs = 0; while ( // part 1 - when to stop if sense is positive: ! ( sense > 0 // in positive mode, stop when we hit // the correct slot, OR when we hit an // zero-value (reusable) slot && ( hashes[hindex].value == 0 || ( hashes[hindex].hash == h1 && hashes[hindex].key == h2 ))) && ! ( sense <= 0 // in negative/refute mode, stop when // we hit the correct slot, or a truly // unused (not just zero-valued reusable) // slot. && ( ( hashes[hindex].hash == h1 && hashes[hindex].key == h2) || ( hashes[hindex].value == 0 && hashes[hindex].hash == 0 && hashes[hindex].key == 0 )))) { // incrs++; // // If microgrooming is enabled, and we've found a // chain that's too long, we groom it down. 
// if (microgroom && (incrs > MICROGROOM_CHAIN_LENGTH)) { long zeroedfeatures; // set the random number generator up... // note that this is repeatable for a // particular test set, yet dynamic. That // way, we don't always autogroom away the // same feature; we depend on the previous // feature's key. srand (h2); // // and do the groom. zeroedfeatures = crm_microgroom (hashes, NULL, hfsize, hindex); hashes[features_index].value -= zeroedfeatures; // since things may have moved after a // microgroom, restart our search hindex = h1 % hfsize; incrs = 0; }; // check to see if we've incremented ourself all the // way around the .css file. If so, we're full, and // can hold no more features (this is unrecoverable) if (incrs > hfsize - 3) { nonfatalerror5 ("Your program is stuffing too many " "features into this size .css file. " "Adding any more features is " "impossible in this file.", "You are advised to build a larger " ".css file and merge your data into " "it.", CRM_ENGINE_HERE); goto done; }; hindex++; if (hindex >= hfsize) hindex = 0; }; if (internal_trace) { if (hashes[hindex].value == 0) { fprintf (stderr,"New feature at %u\n", hindex); } else { fprintf (stderr, "Old feature at %u\n", hindex); }; }; // always rewrite hash and key, as they may be incorrect // (on a reused bucket) or zero (on a fresh one) // // watch out - sense may be both + or -, so check before // adding it... // // let the embedded feature counter sorta keep up... 
hashes[features_index].value += sense; if (sense > 0 ) { // Right slot, set it up // hashes[hindex].hash = h1; hashes[hindex].key = h2; if ( hashes[hindex].value + sense >= FEATUREBUCKET_VALUE_MAX-1) { hashes[hindex].value = FEATUREBUCKET_VALUE_MAX - 1; } else { hashes[hindex].value += sense; }; }; if ( sense < 0 ) { if (hashes[hindex].value <= -sense ) { hashes[hindex].value = 0; } else { hashes[hindex].value += sense; }; }; }; done: free(features); // and remember to let go of the mmap // (we force the munmap, because otherwise we still have a link // to the file which stays around until program exit) crm_force_munmap_addr ((void *) hashes); #ifndef CRM_WINDOWS // Because mmap/munmap doesn't set atime, nor set the "modified" // flag, some network filesystems will fail to mark the file as // modified and so their cacheing will make a mistake. // // The fix is to do a trivial read/write on the .css ile, to force // the filesystem to repropagate it's caches. // { int hfd; // hashfile fd FEATURE_HEADER_STRUCT foo; hfd = open (learnfilename, O_RDWR); dontcare = read (hfd, &foo, sizeof(foo)); lseek (hfd, 0, SEEK_SET); dontcare = write (hfd, &foo, sizeof(foo)); close (hfd); } #endif // !CRM_WINDOWS return (0); } // How to do a OSB_Bayes CLASSIFY some text. // int crm_expr_osb_bayes_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) { // classify the sparse spectrum of this input window // as belonging to a particular type. 
// // This code should look very familiar- it's cribbed from // the code for LEARN // long i, j, k; char ptext[MAX_PATTERN]; // the regex pattern long plen; // the hash file names char htext[MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN]; long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN; long hlen; // the match statistics variable char stext[MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)]; long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100); long slen; char svrbl[MAX_PATTERN]; // the match statistics text buffer long svlen; long fnameoffset; char fname[MAX_FILE_NAME_LEN]; // long vhtindex; long nrows; long use_chisquared; // // use embedded feature index counters, rather than full scans unsigned long learns_index [MAX_CLASSIFIERS]; unsigned long total_learns; unsigned long features_index [MAX_CLASSIFIERS]; unsigned long total_features; // malloc'ed large array of feature hashes unsigned int *features; long features_out; // number returned by vector tokenizer long next_offset; // where in text to look for next token // map of features already seen (used for uniqueness tests) int use_unique; unsigned char *seen_features; struct stat statbuf; // for statting the hash file // unsigned long fcounts[MAX_CLASSIFIERS]; // total counts for feature normalize // unsigned long totalcount = 0; double cpcorr[MAX_CLASSIFIERS]; // corpus correction factors double hits[MAX_CLASSIFIERS]; // actual hits per feature per classifier long totalhits[MAX_CLASSIFIERS]; // actual total hits per classifier double chi2[MAX_CLASSIFIERS]; // chi-squared values (such as they are) long expected; // expected hits for chi2. long unk_features; // total unknown features in the document double htf; // hits this feature got. double tprob = 0.0; // total probability in the "success" domain. 
// set to 0.0 for compiler warnings double ptc[MAX_CLASSIFIERS]; // current running probability of this class double renorm = 0.0; double pltc[MAX_CLASSIFIERS]; // current local probability of this class // int hfds[MAX_CLASSIFIERS]; FEATUREBUCKET_STRUCT *hashes[MAX_CLASSIFIERS]; long hashlens[MAX_CLASSIFIERS]; char *hashname[MAX_CLASSIFIERS]; long succhash; long vbar_seen; // did we see '|' in classify's args? long maxhash; long fnstart, fnlen; long fn_start_here; long textoffset; long textmaxoffset; long bestseen; long thistotal; int ifile; double top10scores[10]; long top10polys[10]; char top10texts[10][MAX_PATTERN]; if (internal_trace) fprintf (stderr, "executing a CLASSIFY\n"); // extract the hash file names crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len); hlen = apb->p1len; hlen = crm_nexpandvar (htext, hlen, htext_maxlen); // extract the "this is a word" regex // crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len); plen = apb->s1len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); // extract the optional "match statistics" variable // crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len); svlen = apb->p2len; svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN); { long vstart, vlen; crm_nextword (svrbl, svlen, 0, &vstart, &vlen); memmove (svrbl, &svrbl[vstart], vlen); svlen = vlen; svrbl[vlen] = '\000'; }; // status variable's text (used for output stats) // stext[0] = '\000'; slen = 0; // set our flags, if needed. use_unique = 0; if (apb->sflags & CRM_UNIQUE) { use_unique = 1; if (user_trace) fprintf (stderr, " unique engaged -repeated features are ignored \n"); }; // crm_vector_tokenize_selector() picks these numbers. We just know. 
nrows = 4; if (apb->sflags & CRM_UNIGRAM) { nrows = 1; if (user_trace) fprintf (stderr, " using unigram features only \n"); }; use_chisquared = 0; if (apb->sflags & CRM_CHI2) { use_chisquared = 1; if (user_trace) fprintf (stderr, " using chi^2 chaining rule \n"); }; if ( internal_trace) fprintf (stderr, "\nWordmatch pattern is %s", ptext); features = (unsigned int *) malloc(OSB_BAYES_MAX_FEATURE_COUNT * STRIDE * sizeof(*features)); if (features == NULL) untrappableerror5("Couldn't allocate features array", "", CRM_ENGINE_HERE); if (use_unique) { if ((seen_features = calloc(OSB_BAYES_MAX_FEATURE_COUNT, 1)) == NULL) untrappableerror5 (" Couldn't allocate enough memory to keep track", "of nonunique features. This is deadly", CRM_ENGINE_HERE); } else seen_features = NULL; bestseen = 0; thistotal = 0; // goodcount = evilcount = 1; // prevents a divide-by-zero error. //cpgood = cpevil = 0.0; //ghits = ehits = 0.0 ; //psucc = 0.5; //pfail = (1.0 - psucc); //pic = 0.5; //pnic = 0.5; // initialize our arrays for N .css files for (i = 0; i < MAX_CLASSIFIERS; i++) { // fcounts[i] = 0; // check later to prevent a divide-by-zero // error on empty .css file cpcorr[i] = 0.0; // corpus correction factors hits[i] = 0.0; // absolute hit counts totalhits[i] = 0; // absolute hit counts ptc[i] = 0.5; // priori probability pltc[i] = 0.5; // local probability }; for (i = 0; i < 10; i++) { top10scores[i] = 0; top10polys[i] = 0; strcpy (top10texts[i], ""); }; // -- probabilistic evaluator --- // S = success; A = a testable attribute of success // ns = not success, na = not attribute // the chain rule we use is: // // P(A|S) P(S) // P (S|A) = ------------------------- // P(A|S) P(S) + P(A|NS) P(NS) // // and we apply it repeatedly to evaluate the final prob. For // the initial a-priori probability, we use 0.5. The output // value (here, P(S|A) ) becomes the new a-priori for the next // iteration. 
// // Extension - we generalize the above to I classes as and feature // F as follows: // // P(F|Ci) P(Ci) // P(Ci|F) = ---------------------------------------- // Sum over all classes Ci of P(F|Ci) P(Ci) // // We also correct for the unequal corpus sizes by multiplying // the probabilities by a renormalization factor. if Tg is the // total number of good features, and Te is the total number of // evil features, and Rg and Re are the raw relative scores, // then the corrected relative scores Cg aqnd Ce are // // Cg = (Rg / Tg) // Ce = (Re / Te) // // or Ci = (Ri / Ti) // // Cg and Ce can now be used as "corrected" relative counts // to calculate the naive Bayesian probabilities. // // Lastly, the issue of "over-certainty" rears it's ugly head. // This is what happens when there's a zero raw count of either // good or evil features at a particular place in the file; the // strict but naive mathematical interpretation of this is that // "feature A never/always occurs when in good/evil, hence this // is conclusive evidence of good/evil and the probabilities go // to 1.0 or 0.0, and are stuck there forevermore. We use the // somewhat ad-hoc interpretation that it is unreasonable to // assume that any finite number of samples can appropriately // represent an infinite continuum of spewage, so we can bound // the certainty of any meausre to be in the range: // // limit: [ 1/featurecount+2 , 1 - 1/featurecount+2]. // // The prior bound is strictly made-up-on-the-spot and has NO // strong theoretical basis. It does have the nice behavior // that for feature counts of 0 the probability is clipped to // [0.5, 0.5], for feature counts of 1 to [0.333, 0.666] // for feature counts of 2 to [0.25, 0.75], for 3 to // [0.2, 0.8], for 4 to [0.166, 0.833] and so on. 
// vbar_seen = 0; maxhash = 0; succhash = 0; fnameoffset = 0; // now, get the file names and mmap each file // get the file name (grody and non-8-bit-safe, but doesn't matter // because the result is used for open() and nothing else. // GROT GROT GROT this isn't NULL-clean on filenames. But then // again, stdio.h itself isn't NULL-clean on filenames. if (user_trace) fprintf (stderr, "Classify list: -%s- \n", htext); fn_start_here = 0; fnlen = 1; while ( fnlen > 0 && maxhash < MAX_CLASSIFIERS) { crm_nextword (htext, hlen, fn_start_here, &fnstart, &fnlen); if (fnlen > 0) { strncpy (fname, &htext[fnstart], fnlen); fn_start_here = fnstart + fnlen + 1; fname[fnlen] = '\000'; if (user_trace) fprintf (stderr, "Classifying with file -%s- "\ "succhash=%ld, maxhash=%ld\n", fname, succhash, maxhash); if ( fname[0] == '|' && fname[1] == '\000') { if (vbar_seen) { nonfatalerror5 ("Only one ' | ' allowed in a CLASSIFY. \n" , "We'll ignore it for now.", CRM_ENGINE_HERE); } else { succhash = maxhash; }; vbar_seen ++; } else { // be sure the file exists // stat the file to get it's length k = stat (fname, &statbuf); // quick check- does the file even exist? if (k != 0) { nonfatalerror5 ("Nonexistent Classify table named: ", fname, CRM_ENGINE_HERE); } else { // file exists - do the open/process/close // hashlens[maxhash] = statbuf.st_size; // mmap the hash file into memory so we can bitwhack it hashes[maxhash] = (FEATUREBUCKET_STRUCT *) crm_mmap_file ( fname, 0, hashlens[maxhash], PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if (hashes[maxhash] == MAP_FAILED ) { nonfatalerror5 ("Couldn't memory-map the table file", fname, CRM_ENGINE_HERE); } else { // set this hashlens to the length in features instead // of the length in bytes. hashlens[maxhash] = hashlens[maxhash] / sizeof (FEATUREBUCKET_STRUCT); hashname[maxhash] = (char *) malloc (fnlen+10); if (!hashname[maxhash]) untrappableerror5 ("Couldn't malloc hashname[maxhash]\n","We need that part later, so we're stuck. 
Sorry.", CRM_ENGINE_HERE); strncpy(hashname[maxhash],fname,fnlen); hashname[maxhash][fnlen]='\000'; maxhash++; }; }; }; if (maxhash > MAX_CLASSIFIERS-1) nonfatalerror5 ("Too many classifier files.", "Some may have been disregarded", CRM_ENGINE_HERE); }; }; // // If there is no '|', then all files are "success" files. if (succhash == 0) succhash = maxhash; if (user_trace) fprintf (stderr, "Running with %ld files for success out of %ld files\n", succhash, maxhash ); // sanity checks... Uncomment for super-strict CLASSIFY. // // do we have at least 1 valid .css files? if (maxhash == 0) { fatalerror5 ("Couldn't open at least 1 .css file for classify().", "", CRM_ENGINE_HERE); }; // do we have at least 1 valid .css file at both sides of '|'? //if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2)) // { // nonfatalerror ( // "Couldn't open at least 1 .css file per SUCC | FAIL classes " // " for classify().\n","Hope you know what are you doing."); // }; // CLASSIFY with no arguments is a "success", if not found insane above if (maxhash == 0) return (0); for (ifile = 0; ifile < maxhash; ifile++) { // now, set up the normalization factor fcount[] // count up the total first // fcounts[ifile] = 0; // { // long k; // // for (k = 1; k < hashlens[ifile]; k++) // fcounts [ifile] = fcounts[ifile] + hashes[ifile][k].value; // } // if (fcounts[ifile] == 0) fcounts[ifile] = 1; // totalcount = totalcount + fcounts[ifile]; #ifdef OSB_LEARNCOUNTS // If LEARNCOUNTS is enabled, we normalize with // documents-learned. // // We use the reserved h2 == 0 setup for the learncount. // { char* litf = "Learnings in this file"; char* fitf = "Features in this file"; unsigned int h1; unsigned int hindex; // h1 = strnhash (litf, strlen ( litf )); hindex = h1 % hashlens[ifile]; if (hashes[ifile][hindex].hash != h1 || hashes[ifile][hindex].key != 0) { if (hashes[ifile][hindex].hash == 0 && hashes[ifile][hindex].key == 0) { // the slot is vacant - we use it. 
hashes[ifile][hindex].hash = h1; hashes[ifile][hindex].key = 0; hashes[ifile][hindex].value = 1; learns_index [ifile] = hindex; } else { fatalerror5 (" This file should have learncounts, but doesn't," " and the learncount slot is busy. It's hosed. ", " Time to die.", CRM_ENGINE_HERE); goto done; } } else { // the learncount slot was found matched. learns_index [ifile] = hindex; if (user_trace) fprintf (stderr, "File # %d has had %u documents learned.\n", ifile, hashes[ifile][hindex].value); }; h1 = strnhash (fitf, strlen ( fitf )); hindex = h1 % hashlens[ifile]; if (hindex == learns_index[ifile]) hindex++; if (hashes[ifile][hindex].hash != h1 || hashes[ifile][hindex].key != 0) { if (hashes[ifile][hindex].hash == 0 && hashes[ifile][hindex].key == 0) { // the slot is vacant - we use it. hashes[ifile][hindex].hash = h1; hashes[ifile][hindex].key = 0; hashes[ifile][hindex].value = 1; features_index[ifile] = hindex; } else { fatalerror5 ("This file should have featurecounts, but doesn't," "and the featurecount slot is busy. It's hosed. ", " Time to die.", CRM_ENGINE_HERE); goto done; } } else { // the learncount matched. features_index[ifile] = hindex; if (user_trace) fprintf (stderr, "File %d has had %u features learned\n", ifile, hashes[ifile][hindex].value); }; }; #endif // OSB_LEARNCOUNTS }; // // calculate cpcorr (count compensation correction) // total_learns = 0; total_features = 0; for (ifile = 0; ifile < maxhash; ifile++) { total_learns += hashes[ifile][learns_index[ifile]].value; total_features += hashes[ifile][features_index[ifile]].value; }; for (ifile = 0; ifile < maxhash; ifile++) { // disable cpcorr for now... unclear that it's useful. // cpcorr[ifile] = 1.0; // // new cpcorr - from Fidelis' work on evaluators. Note that // we renormalize _all_ terms, not just the min term. 
cpcorr [ifile] = (total_learns / (float) maxhash) / ((float) hashes[ifile][learns_index[ifile]].value); if (use_chisquared) cpcorr[ifile] = 1.00; }; if (internal_trace) fprintf (stderr, " files %ld learns #0 %u #1 %u total %lu cp0 %f cp1 %f \n", maxhash, hashes[0][learns_index[0]].value, hashes[1][learns_index[1]].value, total_learns, cpcorr [0], cpcorr [1] ); // // now all of the files are mmapped into memory, // and we can do the polynomials and add up points. thistotal = 0; textoffset = txtstart; textmaxoffset = txtstart + txtlen; (void)crm_vector_tokenize_selector(apb, txtptr, txtstart, txtlen, ptext, plen, NULL, 0, 0, features, (long)(OSB_BAYES_MAX_FEATURE_COUNT * STRIDE), &features_out, &next_offset); // #if (0) // // can't count on this // if (next_offset < txtlen) // (void)fatalerror5("Too many input features", // " (text being classified is too big).", // CRM_ENGINE_HERE); // #endif // !0 unk_features = features_out / STRIDE; // GROT GROT GROT // For each token found in the text, the vector tokenizer returns // nrows feature hashes, where nrows is the number of rows in the // coefficients matrix. Feature weights and chi-squared feature // weights, below, are chosen according to which row the feature // hash came from. crm_vector_tokenize_selector() doesn't tell us // how many rows are in the matrix it selected, so we just have to // know -- see setting variable nrows, above -- and step through the // returned array of features in lockstep with how we think the // tokenizer generated it. And we're doing stride STRIDE, which we // also just have to know. Assuming that all works, the matrix row // subscript for a feature hash is (j / STRIDE) % nrows, where j is // the subscript in the array features. // // That lockstep requirement is why we uniquify with seen_features, // instead of just sort-uniquing what came back from vector // tokenize. Sort-uniquing would throw away the implicit row // numbers in the array of features. 
for (j = 0; j + (STRIDE - 1) < features_out; j += STRIDE) { long irow = (j / STRIDE) % nrows; unsigned int h1, h2; int do_this_feature; // Zero out "Hits This Feature" htf = 0.0; h1 = features[j]; h2 = features[j + 1]; if (h2 == 0) h2 = 0xdeadbeef; if (internal_trace) fprintf (stderr, "Polynomial %ld has h1:%u h2: %u\n", irow, h1, h2); if (use_unique) { if (seen_features[h1 % OSB_BAYES_MAX_FEATURE_COUNT]) do_this_feature = 0; else { do_this_feature = 1; seen_features[h1 % OSB_BAYES_MAX_FEATURE_COUNT] = 1; } } else do_this_feature = 1; if (do_this_feature) for (ifile = 0; ifile < maxhash; ifile++) { unsigned int hindex; unsigned int lh; hindex = h1 % hashlens[ifile]; lh = hindex; hits[ifile] = 0; while ( hashes[ifile][lh].key != 0 && ( hashes[ifile][lh].hash != h1 || hashes[ifile][lh].key != h2 )) { lh++; if (lh >= hashlens[ifile]) lh = 0; // wraparound if (lh == hindex) break; // tried them all }; if (hashes[ifile][lh].hash == h1 && hashes[ifile][lh].key == h2) { // Note - a strict interpretation of Bayesian // chain probabilities should use 0 as the initial // state. However, because we rapidly run out of // significant digits, we use a much less strong // initial state. Note also that any nonzero // positive value prevents divide-by-zero. static int fw[] = {24, 14, 7, 4, 2, 1, 0}; // cubic weights seems to work well for chi^2...- Fidelis static int chi_feature_weight[] = {125, 64, 27, 8, 1, 0}; int feature_weight; long wh; // occurrences this feature this file, weighted // ..."weighted hits" // // calculate the precursors to the local probabilities; // these are the hits[ifile] array, and the htf total. feature_weight = fw[irow]; if ( use_chisquared ) { feature_weight = chi_feature_weight[irow]; // turn off weighting? 
feature_weight = 1; }; wh = hashes[ifile][lh].value * feature_weight; wh = wh * cpcorr [ifile]; // Correct with cpcorr // remember totalhits if (use_chisquared) { totalhits[ifile]++; } else { totalhits[ifile] = totalhits[ifile] + wh; } hits[ifile] = wh; htf = htf + hits[ifile]; // and hits-this-feature }; }; // now update the probabilities. // // NOTA BENE: there are a bunch of different ways to // calculate local probabilities. The text below // refers to an experiment that may or may not make it // as the "best way". // // The hard part is this - what is the local in-class // versus local out-of-class probability given the finding // of a particular feature? // // I'm guessing this- the validity is the differntial // seen on the feature (that is, fgood - fevil ) // times the entropy of that feature as seen in the // corpus (that is, // // Pfeature*log2(Pfeature) // // = // totalcount_this_feature // --------------- * log2 (totalcount_this_feature) // totalcount_all_features // // (note, yes, the last term seems like it should be // relative to totalcount_all_features, but a bit of algebra // will show that if you view fgood and fevil as two different // signals, then you end up with + and - totalcount inside // the logarithm parenthesis, and they cancel out. // (the 0.30102 converts "log10" to "log2" - it's not // a magic number, it's just that log2 isn't in glibc) // // HACK ALERT- this code here is still under development // and should be viewed with the greatest possible // suspicion. :=) // Now, some funky magic. Our formula above is // mathematically correct (if features were // independent- something we conveniently ignore.), // but because of the limited word length in a real // computer, we can quickly run out of dynamic range // early in a CLASSIFY (P(S) indistinguishable from // 1.00) and then there is no way to recover. 
To // remedy this, we use two alternate forms of the // formula (in Psucc and Pfail) and use whichever // form that yields the smaller probability to // recalculate the value of the larger. // // The net result of this subterfuge is a nonuniform // representation of the probability, with a huge dynamic // range in two places - near 0.0, and near 1.0 , which // are the two places where we actually care. // // Note upon note - we don't do this any more - instead we // do a full renormalize and unstick at each local prob. // // calculate renormalizer (the Bayesian formula's denomenator) if (do_this_feature) { if (use_chisquared) { // Actually, for chisquared with ONE feature // category (that being the number of weighted // hits) we end up with not having to do // anything here at all. Instead, we look at // total hits expected in a document of this // length. // // This actually makes sense, since the reality // is that most texts have an expected value of // far less than 1.0 for almost all featuess. // and thus common chi-squared assumptions // break down (like "need at least 5 in each // category"!) // float renorm; //double expected; //for ( ifile = 0; ifile < maxhash; ifile++) // { // This is the first half of a BROKEN // chi-squared formula - // // MeritProd = // Product (expected-observed)^2 / expected // // Second half- when done with features, take the // featurecounth root of MeritProd. // // Note that here the _lowest_ Merit is best fit. //if (htf > 0 ) // ptc[ifile] = ptc[ifile] * // (1.0 + ((htf/maxhash) - hits[ifile]) // * (1.0 +(htf/maxhash) - hits[ifile])) // / (2.0 + htf/maxhash); // // Renormalize to avoid really small // underflow... 
this is unnecessary with // above better additive form // //renorm = 1.0; //for (ifile = 0; ifile < maxhash; ifile++) //renorm = renorm * ptc[ifile]; //for (ifile = 0; ifile < maxhash; ifile++) //{ // ptc[ifile] = ptc[ifile] / renorm; // fprintf (stderr, "IFILE= %d, rn=%f, ptc[ifile] = %f\n", // // ifile, renorm, ptc[ifile]); //}; // Nota BENE: the above is not standard chi2 // here's a better one. // Again, lowest Merit is best fit. //if (htf > 0 ) // { // expected = (htf + 0.000001) / (maxhash + 1.0); // ptc[ifile] = ptc[ifile] + //((expected - hits[ifile]) // * (expected - hits[ifile])) // / expected; //}; //}; } else // if not chi-squared, use Bayesian { // calculate local probabilities from hits // for (ifile = 0; ifile < maxhash; ifile++) { pltc[ifile] = 0.5 + (( hits[ifile] - (htf - hits[ifile])) / (LOCAL_PROB_DENOM * (htf + 1.0))); }; // Calculate the per-ptc renormalization numerators renorm = 0.0; for (ifile = 0; ifile < maxhash; ifile++) renorm = renorm + (ptc[ifile]*pltc[ifile]); for (ifile = 0; ifile < maxhash; ifile++) ptc[ifile] = (ptc[ifile] * pltc[ifile]) / renorm; // if we have underflow (any probability == 0.0 ) then // bump the probability back up to 10^-308, or // whatever a small multiple of the minimum double // precision value is on the current platform. 
// for (ifile = 0; ifile < maxhash; ifile++) if (ptc[ifile] < 1000*DBL_MIN) ptc[ifile] = 1000 * DBL_MIN; // // part 2) renormalize to sum probabilities to 1.0 // renorm = 0.0; for (ifile = 0; ifile < maxhash; ifile++) renorm = renorm + ptc[ifile]; for (ifile = 0; ifile < maxhash; ifile++) ptc[ifile] = ptc[ifile] / renorm; for (ifile = 0; ifile < maxhash; ifile++) if (ptc[ifile] < 10*DBL_MIN) ptc[ifile] = 1000 * DBL_MIN; }; }; if (internal_trace) { for (ifile = 0; ifile < maxhash; ifile++) { fprintf (stderr, " poly: %ld filenum: %d, HTF: %7.0f, hits: %7.0f, Pl: %6.4e, Pc: %6.4e\n", irow, ifile, htf, hits[ifile], pltc[ifile], ptc[ifile]); }; }; }; expected = 1; // Do the chi-squared computation. This is just // (expected-observed)^2 / expected. // Less means a closer match. // if (use_chisquared) { double features_here, learns_here; double avg_features_per_doc, this_doc_relative_len; double actual; // The next statement appears stupid, but we don't have a // good way to estimate the fraction of features that // will be "out of corpus". A very *rough* guess is that // about 2/3 of the learned document features will be // hapaxes - that is, features not seen before, so we'll // start with the 1/3 that we expect to see in the corpus // as not-hapaxes. expected = unk_features / 1.5 ; for (k = 0; k < maxhash; k++) { if (totalhits[k] > expected) expected = totalhits[k] + 1; } for (k = 0; k < maxhash; k++) { features_here = hashes[k][features_index[k]].value; learns_here = hashes[k][learns_index[k]].value ; avg_features_per_doc = 1.0 + features_here / ( learns_here + 1.0); this_doc_relative_len = unk_features / avg_features_per_doc; // expected = 1 + this_doc_relative_len * avg_features_per_doc / 3.0; // expected = 1 + this_doc_relative_len * avg_features_per_doc; actual = totalhits[k]; chi2[k] = (expected - actual) * (expected - actual) / expected; // There's a real (not closed form) expression to // convert from chi2 values to probability, but it's // lame. 
We'll approximate it as 2^-chi2. Close enough // for government work. ptc[k] = 1 / (pow (chi2[k], 2)); if (user_trace) fprintf (stderr, "CHI2: k: %ld, feats: %lf, learns: %lf, avg fea/doc: %lf, rel_len: %lf, exp: %ld, act: %lf, chi2: %lf, p: %lf\n", k, features_here, learns_here, avg_features_per_doc, this_doc_relative_len, expected, actual, chi2[k], ptc[k] ); }; } // One last chance to force probabilities into the non-stuck zone for (k = 0; k < maxhash; k++) if (ptc[k] < 1000 * DBL_MIN) ptc[k] = 1000 * DBL_MIN; // and one last renormalize for both bayes and chisquared renorm = 0.0; for (k = 0; k < maxhash; k++) renorm = renorm + ptc[k]; for (k = 0; k < maxhash; k++) ptc[k] = ptc[k] / renorm; if (user_trace) { for (k = 0; k < maxhash; k++) fprintf (stderr, "Probability of match for file %ld: %f\n", k, ptc[k]); }; // tprob = 0.0; for (k = 0; k < succhash; k++) tprob = tprob + ptc[k]; if (svlen > 0) { char buf[1024]; double accumulator; double remainder; double overall_pR; long m; buf [0] = '\000'; accumulator = 1000 * DBL_MIN; for (m = 0; m < succhash; m++) { accumulator = accumulator + ptc[m]; }; remainder = 1000 * DBL_MIN; for (m = succhash; m < maxhash; m++) { remainder = remainder + ptc[m]; }; overall_pR = log10 (accumulator) - log10 (remainder); // note also that strcat _accumulates_ in stext. // There would be a possible buffer overflow except that _we_ control // what gets written here. So it's no biggie. 
if (tprob > 0.5000) { sprintf (buf, "CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); } else { sprintf (buf, "CLASSIFY fails; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); }; if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); bestseen = 0; for (k = 0; k < maxhash; k++) if (ptc[k] > ptc[bestseen] ) bestseen = k; remainder = 1000 * DBL_MIN; for (m = 0; m < maxhash; m++) if (bestseen != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "Best match to file #%ld (%s) "\ "prob: %6.4f pR: %6.4f \n", bestseen, hashname[bestseen], ptc[bestseen], (log10(ptc[bestseen]) - log10(remainder))); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); sprintf (buf, "Total features in input file: %ld\n", unk_features); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); if (use_chisquared) { for (k = 0; k < maxhash; k++) { long m; remainder = 1000 * DBL_MIN; for (m = 0; m < maxhash; m++) if (k != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "#%ld (%s):" \ " features: %u, hits: %ld," // exp: %ld," " chi2: %3.2e, pR: %6.2f \n", k, hashname[k], hashes[k][features_index[k]].value, totalhits[k], // expected, chi2[k], (log10 (ptc[k]) - log10 (remainder) ) ); // strcat (stext, buf); if (strlen(stext)+strlen(buf) <= stext_maxlen) strcat (stext, buf); }; } else { for (k = 0; k < maxhash; k++) { long m; remainder = 1000 * DBL_MIN; for (m = 0; m < maxhash; m++) if (k != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "#%ld (%s):" \ " features: %u, hits: %ld, prob: %3.2e, pR: %6.2f \n", k, hashname[k], hashes[k][features_index[k]].value, totalhits[k], ptc[k], (log10 (ptc[k]) - log10 (remainder) ) ); // strcat (stext, buf); if (strlen(stext)+strlen(buf) <= stext_maxlen) strcat (stext, buf); }; }; // check here if we got enough room in stext to stuff everything // perhaps we'd better rise a nonfatalerror, instead of just // whining on stderr if 
(strcmp(&(stext[strlen(stext)-strlen(buf)]), buf) != 0) { nonfatalerror5 ( "WARNING: not enough room in the buffer to create " "the statistics text. Perhaps you could try bigger " "values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?", " ", CRM_ENGINE_HERE); }; crm_destructive_alter_nvariable (svrbl, svlen, stext, strlen (stext)); }; done: // cleanup time! free(features); if (use_unique) free(seen_features); // remember to let go of the fd's and mmaps for (k = 0; k < maxhash; k++) { // close (hfds [k]); crm_munmap_file ((void *) hashes[k]); }; // // Free the hashnames, to avoid a memory leak. // for (i = 0; i < maxhash; i++) { /////////////////////////////////////// // ! XXX SPAMNIX HACK! //! -- by Barry Jaspan // //! Without the statement "k = i" (which should have no effect), //! the for statement crashes on MacOS X when compiled with gcc //! -O3. I've examined the pointers being freed, and they appear //! valid. I've run this under Purify on Windows, valgrind on //! Linux, and efence on MacOS X; none report a problem here //! (though valgrind reports umrs in the VHT code; see my post to //! crm114-developers). I've also examined the assembler produced //! with various changes here and, though I don't speak PPC, w/o //! the k = i it is qualitatively different. //! //! For now, I'm concluding it is an optimizer bug, and fixing it //! with the "k = i" statement. This occurs on MacOS X 10.2 with //! Apple Computer, Inc. GCC version 1175, based on gcc version //! 3.1 20020420 (prerelease). // k = i; free (hashname[i]); } if (tprob <= 0.5000) { if (user_trace) fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n"); // and do what we do for a FAIL here csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1; return (0); }; // // all done... 
if we got here, we should just continue execution if (user_trace) fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n"); return (0); }; crm114-20100106-BlameMichelson.src/crm_svm_matrix_util.c0000644000000000017500000013543611321154266021244 0ustar rootwsy#include "crm_svm_matrix_util.h" // crm_svm_matrix_util.c - Support Vector Machine //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. /************************************************************************ *Expanding array and linked list functions. Mostly for use with the *matrix library, but possibly more general. 
************************************************************************/ /***********************Expanding Array Functions***************************/ //Static expanding array function declarations static void expand(ExpandingArray *A, int newsize); /*************************************************************************** *Make a new expanding array with nothing in it * *INPUT: init_size: the initial size of the array * compact: COMPACT or PRECISE, deciding the size of the data in the array * *OUTPUT: an expanding array ***************************************************************************/ ExpandingArray *make_expanding_array(int init_size, int compact) { ExpandingArray *A = (ExpandingArray *)malloc(sizeof(ExpandingArray)); if (!A) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Could not create expanding array.\n"); } return NULL; } if (init_size < 0) { init_size = 0; } A->length = init_size; if (!compact) { A->data.precise = (PreciseExpandingType *)malloc(sizeof(PreciseExpandingType)*init_size); A->compact = 0; if (!A->data.precise) { A->length = 0; } } else { A->data.compact = (CompactExpandingType *)malloc(sizeof(CompactExpandingType)*init_size); A->compact = 1; if (!A->data.compact) { A->length = 0; } } A->last_elt = -1; A->first_elt = 0; //the first time we do an insert_before we center the array A->n_elts = 0; A->was_mapped = 0; return A; } /*************************************************************************** *Puts an element into the next open spot in the array, doubling the size *of the array if needed. * *INPUT: d: element to insert * A: array in which to insert the element * *WARNINGS: *1) This just puts an element into the next open spot in A. If d is some * sort of sparse element, this never checks to make sure A is still in * ascending column order. Use the matrix functions to insert things * instead if you want that order preserved. 
***/
void expanding_array_insert(ExpandingType d, ExpandingArray *A) {
  //Append a copy of d after the current last element, growing the
  //backing store by doubling when it is full.
  if (!A) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "expanding_array_insert: null array.\n");
    }
    return;
  }

  //claim the next slot up front (note: this is NOT rolled back if the
  //expansion below fails -- this matches the established behavior)
  A->last_elt++;

  //grow if the newly claimed slot falls past the end of the storage
  if (A->last_elt >= A->length) {
    if (A->length == 0)
      A->length = 1;
    expand(A, 2 * A->length);
    if (!(A->length)) {
      //expand() signals failure by zeroing the length
      if (MATR_DEBUG_MODE)
        fprintf
          (stderr,
           "expanding_array_insert: unable to expand array enough to do insert.\n");
      return;
    }
  }

  //store the element in whichever representation this array carries
  if (A->compact)
    A->data.compact[A->last_elt] = *(d.compact);
  else
    A->data.precise[A->last_elt] = *(d.precise);
  A->n_elts++;
}

/***************************************************************************
 *Inserts an element into position c of A where c is relative to first_elt.
 *If something is already at position c, this over-writes it.  If c is
 *negative, the element is inserted before the first element.
 *
 *INPUT: d: element to insert
 * c: column in A (relative to first_elt!) in which to insert d.  c CAN be
 *    negative, in which case this will insert the element an appropriate
 *    number of places before zero
 * A: array in which to insert the element
 *
 *WARNINGS:
 *1) c is relative to first_elt.  So if the first element is in place 3
 *   of A and c is 1, d will be inserted in place 4 of A.  To insert d in
 *   place 1 of A, c needs to be -3.
 *2) This just puts an element into spot c in A.  If d is some
 *   sort of sparse element, this never checks to make sure A is still in
 *   ascending column order.  Use the matrix functions to insert things
 *   instead if that is important.
 *3) If there is already an entry at c, d will overwrite it.
***************************************************************************/ void expanding_array_set(ExpandingType d, int c, ExpandingArray *A) { int newsize, offset, mid, i; if (!A) { if (MATR_DEBUG_MODE) { fprintf(stderr, "expanding_array_set: null array.\n"); } return; } if (A->first_elt == 0 && c == A->n_elts) { //if we only ever insert at the end of the array //then we don't want to do this funny middle thing //the first time we do an insert_before, first_elt gets set to non-zero //and all is good expanding_array_insert(d, A); return; } if (c+A->first_elt >= A->length || c+A->first_elt < 0) { if (fabs(c+A->first_elt) < 2*A->length) { if (A->length == 0) { A->length = 1; } newsize = 2*A->length; } else { newsize = (fabs(c+A->first_elt)+1); } expand(A, newsize); if (!(A->length)) { if (MATR_DEBUG_MODE) { fprintf (stderr, "expanding_array_insert: unable to expand array enough to do insert.\n"); } return; } //with insert we try to keep things centered //so we move everything //and recenter the array mid = A->n_elts/2; offset = A->length/2 - mid; for (i = A->last_elt; i >= A->first_elt; i--) { if (A->compact) { A->data.compact[i-A->first_elt+offset] = A->data.compact[i]; } else { A->data.precise[i-A->first_elt+offset] = A->data.precise[i]; } } A->last_elt += offset-A->first_elt; A->first_elt = offset; } if (A->compact) { A->data.compact[A->first_elt+c] = *(d.compact); } else { A->data.precise[A->first_elt+c] = *(d.precise); } if (c+A->first_elt > A->last_elt) { A->last_elt = c + A->first_elt; A->n_elts++; } if (c+A->first_elt < A->first_elt) { A->first_elt += c; A->n_elts++; } } //"private" function to change the array size //on failure this sets A->length = 0 static void expand(ExpandingArray *A, int newsize) { ExpandingArray tmp; int i; if (!A) { if (MATR_DEBUG_MODE) { fprintf(stderr, "expand: null array.\n"); } return; } if (MATR_DEBUG_MODE >= MATR_OPS) { fprintf(stderr, "Expanding array to size %d\n", newsize); } A->length = newsize; if (!A->was_mapped) { if 
(A->compact) { A->data.compact = (CompactExpandingType *) realloc(A->data.compact, sizeof(CompactExpandingType)*newsize); if (!A->data.compact) { A->length = 0; } } else { A->data.precise = (PreciseExpandingType *) realloc(A->data.precise, sizeof(PreciseExpandingType)*newsize); if (!A->data.precise) { A->length = 0; } } } else { A->was_mapped = 0; //the data for A needs to be freed now! if (A->compact) { tmp.data.compact = A->data.compact; A->data.compact = (CompactExpandingType *) malloc(sizeof(CompactExpandingType)*newsize); if (!A->data.compact) { A->length = 0; return; } for (i = A->first_elt; i < A->last_elt; i++) { if (i >= newsize) { //we might be making A smaller break; } A->data.compact[i] = tmp.data.compact[i]; } } else { tmp.data.precise = A->data.precise; A->data.precise = (PreciseExpandingType *) malloc(sizeof(PreciseExpandingType)*newsize); if (!A->data.precise) { A->length = 0; return; } for (i = A->first_elt; i < A->last_elt; i++) { if (i >= newsize) { break; } A->data.precise[i] = tmp.data.precise[i]; } } } } /*************************************************************************** *Trims the expanding array to size first_elt + n_elts. I would have liked *to have this function trim the array to size n_elts, but I can't figure *out how to just free the first first_elt elements of an array. * *INPUT: A: array to trim * *WARNINGS: *1) This frees only memory above last_elt (ie trims A to size first_elt * + n_elts). If first_elt != 0 this DOES NOT free all of the unused * memory associated with A. ***************************************************************************/ void expanding_array_trim(ExpandingArray *A) { if (!A) { if (MATR_DEBUG_MODE) { fprintf(stderr, "expanding_array_trim: null array.\n"); } return; } if (A->length == A->last_elt+1) { return; } expand(A, A->last_elt+1); } /*************************************************************************** *Returns the element at position c of A where c is relative to first_elt. 
*
*INPUT: c: element to get. c is relative to first_elt!
* A: array from which to get the element
*
*OUTPUT: the element at position c or NULL if c is less than 0 or
* greater than the number of elements in A
*
*WARNINGS:
*1) c is relative to first_elt. So if the first element is in place 3
*   of A and c is 1, this returns the element in place 4 of A.
*2) Check for a NULL return.
***************************************************************************/
ExpandingType expanding_array_get(int c, ExpandingArray *A) {
  ExpandingType et;
  //null or empty array: nothing to return
  if (!A || !(A->length)) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "expanding_array_get: null array.\n");
    }
    et.precise = NULL;
    return et;
  }
  //out of range (note: elements live at indices first_elt..last_elt)
  if (c + A->first_elt > A->last_elt || c < 0) {
    et.precise = NULL;
    return et;
  }
  //return a pointer INTO the array's storage, via the matching union arm
  if (A->compact) {
    et.compact = &A->data.compact[c+A->first_elt];
    return et;
  } else {
    et.precise = &A->data.precise[c + A->first_elt];
    return et;
  }
}

/***************************************************************************
*Search for an element with column c of A, assuming A is ordered by ascending
*columns and that its elements are SparseElement's.
*
*INPUT: c: column to search for. this has nothing to do with the column
* of A - rather it is the column of a sparse element in a matrix!
* init: initial guess of the index at which c appears (relative to first_elt)
* A: array to search
*
*OUTPUT: the index of an element with column c or, if no such element exists,
* the index of the last element with a column number less than c or the
* first element with a column number greater than c. If the array is empty
* this returns -1.
*
*WARNINGS:
*1) init is relative to first_elt.
*2) if c does not appear in the array, the return may be the last element
*   before c would appear OR the first element after.
*3) the search assumes A is arranged in ascending column order. if it is
*   not, the result will probably be wrong.
***************************************************************************/
int expanding_array_search(unsigned int c, int init, ExpandingArray *A) {
  int i, front, back, num_it = 0;
  if (!A) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "expanding_array_search: null array.\n");
    }
    return -1;
  }
  //clamp the caller's guess into the live range [first_elt, last_elt]
  i = init+A->first_elt;
  front = A->first_elt;
  back = A->last_elt;
  if (back < front) {
    //empty array
    return -1;
  }
  if (i < front) {
    i = front;
  }
  if (i > back) {
    i = back;
  }
  //the array claims to have elements but its data pointer is null
  if ((A->compact && !(A->data.compact)) ||
      (!(A->compact) && !(A->data.precise))) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "expanding_array_search: null array.\n");
    }
    return -1;
  }
  //check i itself (the common case: the guess is already right)
  if (i >= A->first_elt && i <= A->last_elt &&
      ((A->compact && c == A->data.compact[i].s.col) ||
       (!A->compact && c == A->data.precise[i].s.col))) {
    return i - A->first_elt;
  }
  //check the beginning and the end
  if ((A->compact && c >= A->data.compact[A->last_elt].s.col) ||
      (!A->compact && c >= A->data.precise[A->last_elt].s.col)) {
    return A->last_elt-A->first_elt;
  }
  if ((A->compact && c <= A->data.compact[A->first_elt].s.col) ||
      (!A->compact && c <= A->data.precise[A->first_elt].s.col)) {
    return 0;
  }
  //check before and after the current element (guess off by one)
  if (i > A->first_elt && i <= A->last_elt &&
      ((A->compact && c < A->data.compact[i].s.col) ||
       (!A->compact && c < A->data.precise[i].s.col)) &&
      ((A->compact && c >= A->data.compact[i-1].s.col) ||
       (!A->compact && c >= A->data.precise[i-1].s.col))) {
    return i-1-A->first_elt;
  }
  if (i >= A->first_elt && i < A->last_elt &&
      ((A->compact && c > A->data.compact[i].s.col) ||
       (!A->compact && c > A->data.precise[i].s.col)) &&
      ((A->compact && c <= A->data.compact[i+1].s.col) ||
       (!A->compact && c <= A->data.precise[i+1].s.col))) {
    return i+1-A->first_elt;
  }
  //fall back to a full binary search over [front, back]
  while (((A->compact && A->data.compact[i].s.col != c) ||
          (!(A->compact) && A->data.precise[i].s.col != c)) &&
         front <= back) {
    i = (front + back)/2;
    if ((A->compact && A->data.compact[i].s.col < c) ||
        (!A->compact && A->data.precise[i].s.col < c)) {
      front = i+1;
    } else if ((A->compact && A->data.compact[i].s.col > c) ||
               (!A->compact && A->data.precise[i].s.col > c)) {
      back = i-1;
    }
    num_it++;
  }
  if (MATR_DEBUG_MODE >= MATR_OPS_MORE) {
    fprintf(stderr, "After full search (%d iterations) returned %d, init = %d, last_elt = %d, first_elt = %d\n", num_it, i, init+A->first_elt, A->last_elt, A->first_elt);
  }
  return i - A->first_elt;
}

/***************************************************************************
*Insert an element into the array. This function does the least amount
*of shifting so that if before < n_elts/2, all the elements will move back
*one place and otherwise they will move forward one place.
*
*INPUT: ne: element to insert
* before: the index of the element ne should be inserted before (relative to
* first_elt)
* A: the array in which to insert
*
*OUTPUT: the ABSOLUTE index in A of the new element. this index is NOT
* relative to first_elt! we do this this way, because the location of
* first_elt may change during this function.
*
*WARNINGS:
*1) the return is an ABSOLUTE index NOT RELATIVE to first_elt.
*2) the function does the least amount of shifting so it may change
*   last_elt OR it may change first_elt.
***************************************************************************/
int expanding_array_insert_before(ExpandingType ne, int before,
                                  ExpandingArray *A) {
  int i;
  ExpandingType tmp;
  //local copies: expanding_array_get returns pointers INTO the array, which
  //a subsequent expanding_array_set may move; snapshot the value first
  CompactExpandingType cet;
  PreciseExpandingType pet;
  if (!A) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "expanding_array_insert_before: null array.\n");
    }
    return -1;
  }
  if (before < 0) {
    before = 0;
  }
  if (before < A->n_elts/2) {
    //insertion point is in the front half: shift elements 0..before-1 back
    //one slot (cheaper than moving the back half forward)
    //this changes indexing
    tmp = expanding_array_get(0, A);
    if (tmp.precise) {
      if (A->compact) {
        cet = *(tmp.compact);
        tmp.compact = &cet;
      } else {
        pet = *(tmp.precise);
        tmp.precise = &pet;
      }
      //writing to slot -1 makes first_elt move down by one
      expanding_array_set(tmp, -1, A);
    }
    for (i = 1; i < before; i++) {
      //note: index i+1 because the set(-1) above shifted indexing by one
      tmp = expanding_array_get(i+1, A);
      if (tmp.precise) {
        if (A->compact) {
          cet = *(tmp.compact);
          tmp.compact = &cet;
        } else {
          pet = *(tmp.precise);
          tmp.precise = &pet;
        }
        expanding_array_set(tmp, i, A);
      }
    }
    expanding_array_set(ne, before, A);
  } else {
    //insertion point is in the back half: shift elements before..n_elts-1
    //forward one slot, walking from the end to avoid clobbering
    for (i = A->n_elts-1; i >= before; i--) {
      tmp = expanding_array_get(i, A);
      if (tmp.precise) {
        if (A->compact) {
          cet = *(tmp.compact);
          tmp.compact = &cet;
        } else {
          pet = *(tmp.precise);
          tmp.precise = &pet;
        }
        expanding_array_set(tmp, i+1, A);
      }
    }
    expanding_array_set(ne, before, A);
  }
  //ABSOLUTE index of the new element (see header comment)
  return before+A->first_elt;
}

/***************************************************************************
*Insert an element into the array. This function does the least amount
*of shifting so that if after < n_elts/2, all the elements will move back
*one place and otherwise they will move forward one place.
*
*INPUT: ne: element to insert
* after: the index of the element ne should be inserted after (relative to
* first_elt)
* A: the array in which to insert
*
*OUTPUT: the ABSOLUTE index in A of the new element. this index is NOT
* relative to first_elt! we do this this way, because the location of
* first_elt may change during this function.
*
*WARNINGS:
*1) the return is an ABSOLUTE index NOT RELATIVE to first_elt.
*2) the function does the least amount of shifting so it may change
*   last_elt OR it may change first_elt.
***************************************************************************/
int expanding_array_insert_after(ExpandingType ne, int after,
                                 ExpandingArray *A) {
  //inserting after slot `after` is inserting before slot `after+1`
  return expanding_array_insert_before(ne, after+1, A);
}

/***************************************************************************
*Remove an element from the array. This function does the least amount
*of shifting so that if elt < n_elts/2, all the elements will move back
*one place and otherwise they will move forward one place.
*
*INPUT: elt: the index (relative to first_elt) of the element to be removed
* A: the array from which to remove elt
*
*WARNINGS:
*1) the function does the least amount of shifting so it may change
*   last_elt OR it may change first_elt.
***************************************************************************/
void expanding_array_remove_elt(int elt, ExpandingArray *A) {
  int i;
  if (!A) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "expanding_remove_elt: null array.\n");
    }
    return;
  }
  if (elt < A->n_elts/2) {
    //move everything behind it closer
    for (i = elt-1; i >= 0; i--) {
      expanding_array_set(expanding_array_get(i, A), i+1, A);
    }
    A->first_elt++;
  } else {
    //move everything after it one slot toward the front
    for (i = elt+1; i < A->n_elts; i++) {
      expanding_array_set(expanding_array_get(i, A), i-1, A);
    }
    A->last_elt--;
  }
  A->n_elts--;
}

/***************************************************************************
*Clears all elements of A.
*
*INPUT: A: the array to clear
*
*WARNINGS:
*1) this does not free any of the memory associated with A. to do that
*   call expanding_array_free.
***************************************************************************/
void expanding_array_clear(ExpandingArray *A) {
  if (!A) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "expanding_array_clear: null array.\n");
    }
    return;
  }
  A->last_elt = -1;
  //recenter so future inserts can grow in both directions
  if (A->first_elt > 0) {
    A->first_elt = A->length/2;
  }
  A->n_elts = 0;
}

/***************************************************************************
*Writes A to a file in binary format.
*
*INPUT: A: the array to write
* fp: pointer to file to write A in
*
*WARNINGS:
*1) A is written in a BINARY format. Use expanding_array_read to recover
*   A.
***************************************************************************/
size_t expanding_array_write(ExpandingArray *A, FILE *fp) {
  size_t size;
  if (!A || !fp) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "expanding_array_write: null arguments.\n");
    }
    return 0;
  }
  //header first, then only the live elements first_elt..last_elt
  size = sizeof(ExpandingArray)*fwrite(A, sizeof(ExpandingArray), 1, fp);
  if (A->length && A->length >= A->first_elt) {
    if (A->compact && A->data.compact) {
      return size + sizeof(CompactExpandingType)*
        fwrite(&(A->data.compact[A->first_elt]),
               sizeof(CompactExpandingType), A->n_elts, fp);
    }
    if (!(A->compact) && A->data.precise) {
      return size + sizeof(PreciseExpandingType)*
        fwrite(&(A->data.precise[A->first_elt]),
               sizeof(PreciseExpandingType), A->n_elts, fp);
    }
  }
  return size;
}

/***************************************************************************
*Reads A from a file in binary format.
*
*INPUT: A: an expanding array. if A contains any data it will be freed
* and overwritten.
* fp: pointer to file to read A from
*
*WARNINGS:
*1) If fp does not contain a properly formatted expanding array as written
*   by the function expanding_array_write, this function will do its best,
*   but the results may be very bizarre. Check for an empty return.
***************************************************************************/ void expanding_array_read(ExpandingArray *A, FILE *fp) { size_t amount_read; if (!A || !fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "expanding_array_read: null arguments.\n"); } return; } if (A->compact && A->data.compact && !(A->was_mapped)) { free(A->data.compact); } else if (!(A->compact) && A->data.precise && !(A->was_mapped)) { free(A->data.precise); } amount_read = fread(A, sizeof(ExpandingArray), 1, fp); A->was_mapped = 0; if (!amount_read) { if (MATR_DEBUG_MODE) { fprintf(stderr, "expanding_array_read: bad file.\n"); } return; } if (A->length >= A->n_elts && A->first_elt < A->length && A->first_elt >= 0) { if (A->compact) { A->data.compact = (CompactExpandingType *) malloc(sizeof(CompactExpandingType)*A->length); amount_read = fread(&(A->data.compact[A->first_elt]), sizeof(CompactExpandingType), A->n_elts, fp); } else { A->data.precise = (PreciseExpandingType *) malloc(sizeof(PreciseExpandingType)*A->length); amount_read = fread(&(A->data.precise[A->first_elt]), sizeof(PreciseExpandingType), A->n_elts, fp); } if (amount_read < A->n_elts && MATR_DEBUG_MODE) { fprintf(stderr, "expanding_array_read: fewer elts read in than expected.\n"); } } else { if (MATR_DEBUG_MODE && A->n_elts) { fprintf(stderr, "expanding_array_read: A cannot contain all of its elements. This is likely a corrupted file.\n"); } A->length = 0; A->n_elts = 0; A->first_elt = 0; A->last_elt = -1; A->data.precise = NULL; } } /*************************************************************************** *Maps an expanding array from a block of memory in binary format (the same *format as would be written to a file using . * *INPUT: addr: a pointer to the address where the expanding array begins * last_addr: the last possible address that is valid. 
NOT necessarily where * the expanding array ends - just the last address that has been allocated * in the chunk pointed to by *addr (ie, if *addr was taken from an mmap'd file * last_addr would be addr + the file size). * *OUTPUT: An expanding array STILL referencing the chunk of memory at *addr, * but formated as an expanding array or NULL if a properly formatted * expanding array didnt' start at addr. * *addr: (pass-by-reference) now points to the next address AFTER the full * expanding array. * *WARNINGS: * 1) *addr needs to be writable. This will CHANGE VALUES stored at *addr and * will seg fault if *addr is not writable. * 2) last_addr does not need to be the last address of the expanding array * but if it is before that, either NULL will be returned or an * expanding array with a NULL data value will be returned. * 3) if *addr does not contain a properly formatted array, this function * will not seg fault, but that is the only guarantee. * 4) call expanding_array_free_data on this output UNLESS you are SURE * you have made no changes! if the array expands its data, you need * to free that. DO NOT call expanding_array_free. * 5) *addr CHANGES! 
***************************************************************************/ ExpandingArray *expanding_array_map(void **addr, void *last_addr) { ExpandingArray *A; if (!addr || !*addr || !last_addr || *addr >= last_addr) { if (MATR_DEBUG_MODE) { fprintf(stderr, "expanding_array_map: null arguments.\n"); } return NULL; } if (*addr + sizeof(ExpandingArray) > last_addr) { //bad if (MATR_DEBUG_MODE) { fprintf(stderr, "expanding_array_map: not enough memory for array.\n"); } return NULL; } A = (ExpandingArray *)(*addr); *addr += sizeof(ExpandingArray); A->length = A->n_elts; //we only have this much space in the file A->last_elt = A->n_elts-1; A->first_elt = 0; A->was_mapped = 1; if (A->length >= A->n_elts && A->first_elt < A->length && A->first_elt >= 0 && ((A->compact && *addr + A->n_elts*sizeof(CompactExpandingType) <= last_addr) || (!(A->compact) && *addr + A->n_elts*sizeof(PreciseExpandingType) <= last_addr))) { if (A->compact) { A->data.compact = (CompactExpandingType *)(*addr); *addr += A->n_elts*sizeof(CompactExpandingType); } else { A->data.precise = (PreciseExpandingType *)(*addr); *addr += A->n_elts*sizeof(PreciseExpandingType); } } else { if (MATR_DEBUG_MODE && A->n_elts) { fprintf(stderr, "expanding_array_map: array cannot contain all of its elements. This is likely a corrupted file.\n"); } A->length = 0; A->n_elts = 0; A->first_elt = 0; A->last_elt = -1; A->data.precise = NULL; } return A; } /*************************************************************************** *Frees the data associated A. * *INPUT: A: the array with the data to free * *WARNINGS: *1) this does not free A, only the data associated with it. 
***************************************************************************/ void expanding_array_free_data(ExpandingArray *A) { if (!A || A->was_mapped) { return; } if (A->compact && A->data.compact) { free(A->data.compact); } else if (A->data.precise) { free(A->data.precise); } } /*************************************************************************** *Frees all memory associated with A. * *INPUT: A: the array to free ***************************************************************************/ void expanding_array_free(ExpandingArray *A) { if (A && A->was_mapped) { //the data stored in A was mapped in //we shouldn't free it free(A); return; } expanding_array_free_data(A); if (A) { free(A); } } /***********************Linked List Functions***************************/ //Linked list static function declarations static inline size_t node_write(SparseNode n, FILE *fp); static inline SparseNode node_read(int is_compact, FILE *fp); static inline SparseNode node_map(int is_compact, void **addr, void *last_addr); /*************************************************************************** *Make a new list with nothing in it * *INPUT: compact: COMPACT or PRECISE, deciding the size of the data in the list * *OUTPUT: a linked list ***************************************************************************/ SparseElementList *make_list(int compact) { SparseElementList *l = (SparseElementList *)malloc(sizeof(SparseElementList)); if (!l) { if (MATR_DEBUG_MODE) { fprintf(stderr, "Could not create a sparse element list.\n"); } return NULL; } l->compact = compact; l->head = make_null_node(compact); l->tail = make_null_node(compact); l->last_addr = NULL; return l; } /*************************************************************************** *Search for an element with column c of l, assuming l is ordered by ascending *columns and that its elements are SparseElement's. * *INPUT: c: column to search for. 
* init: initial guess * l: list to search * *OUTPUT: a pointer to the element with column c or, if no such element exists, * a pointer to the last element with a column number less than c or the * first element with a column number greater than c. If the array is empty * this returns a null node. * *WARNINGS: *1) if c does not appear in the array, the return may be the last element * before c would appear OR the first element after. *2) the search assumes l is arranged in ascending column order. if it is * not, the result will probably be wrong. ***************************************************************************/ SparseNode list_search(unsigned int c, SparseNode init, SparseElementList *l) { SparseNode curr = init; if (!l) { if (MATR_DEBUG_MODE) { fprintf(stderr, "list_search: null list.\n"); } return make_null_node(l->compact); } if (list_is_empty(l)) { return make_null_node(l->compact); } if (c <= node_col(l->head)) { return l->head; } if (c >= node_col(l->tail)) { return l->tail; } while (!null_node(curr) && node_col(curr) < c) { curr = next_node(curr); } while (!null_node(curr) && node_col(curr) > c) { curr = prev_node(curr); } return curr; } /*************************************************************************** *Insert an element into a list. 
* *INPUT: newelt: element to be inserted * before: pointer to element before which newelt should be inserted * l: list in which to insert * *OUTPUT: a pointer to the element that has been inserted ***************************************************************************/ SparseNode list_insert_before(SparseElement newelt, SparseNode before, SparseElementList *l) { CompactSparseNode *cn; PreciseSparseNode *pn; SparseNode n; if (!l) { if (MATR_DEBUG_MODE) { fprintf(stderr, "list_insert_before: null list.\n"); } return make_null_node(l->compact); } n.is_compact = l->compact; n.compact = NULL; n.precise = NULL; if (l->compact) { cn = (CompactSparseNode *)malloc(sizeof(CompactSparseNode)); n.compact = cn; cn->data = *(newelt.compact); if (list_is_empty(l)) { //empty list cn->prev = NULL; l->head.compact = cn; l->tail.compact = cn; } else { cn->prev = before.compact->prev; if (cn->prev) { cn->prev->next = cn; } else { l->head.compact = cn; } before.compact->prev = cn; } cn->next = before.compact; } else { pn = (PreciseSparseNode *)malloc(sizeof(PreciseSparseNode)); n.precise = pn; pn->data = *(newelt.precise); if (list_is_empty(l)) { //empty list pn->prev = NULL; l->head.precise = pn; l->tail.precise = pn; } else { pn->prev = before.precise->prev; if (pn->prev) { pn->prev->next = pn; } else { l->head.precise = pn; } before.precise->prev = pn; } pn->next = before.precise; } return n; } /*************************************************************************** *Insert an element into a list. 
* *INPUT: newelt: element to be inserted * after: pointer to element after which newelt should be inserted * l: list in which to insert * *OUTPUT: a pointer to the element that has been inserted ***************************************************************************/ SparseNode list_insert_after(SparseElement ne, SparseNode after, SparseElementList *l) { CompactSparseNode *cn; PreciseSparseNode *pn; SparseNode n; if (!l) { if (MATR_DEBUG_MODE) { fprintf(stderr, "list_insert_after: null list.\n"); } return make_null_node(l->compact); } n.is_compact = l->compact; n.compact = NULL; n.precise = NULL; if (l->compact) { cn = (CompactSparseNode *)malloc(sizeof(CompactSparseNode)); n.compact = cn; cn->data = *(ne.compact); if (list_is_empty(l)) { //empty list cn->next = NULL; l->head.compact = cn; l->tail.compact = cn; } else { cn->next = after.compact->next; if (cn->next) { cn->next->prev = cn; } else { l->tail.compact = cn; } after.compact->next = cn; } cn->prev = after.compact; } else { pn = (PreciseSparseNode *)malloc(sizeof(PreciseSparseNode)); n.precise = pn; pn->data = *(ne.precise); if (list_is_empty(l)) { //empty list pn->next = NULL; l->head.precise = pn; l->tail.precise = pn; } else { pn->next = after.precise->next; if (pn->next) { pn->next->prev = pn; } else { l->tail.precise = pn; } after.precise->next = pn; } pn->prev = after.precise; } return n; } /*************************************************************************** *Clear a list, freeing each element. 
*
*INPUT: l: list to clear
***************************************************************************/
void list_clear(SparseElementList *l) {
  SparseNode curr, next;
  int i;
  if (!l) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "list_clear: null list.\n");
    }
    return;
  }
  curr = l->head;
  i = 0;
  while (!null_node(curr)) {
    next = next_node(curr);
    if (!(l->last_addr)) {
      //no mapped region: every node is heap-allocated
      node_free(curr);
    } else {
      //only free nodes that live OUTSIDE the mapped chunk [l, last_addr)
      if (l->compact &&
          ((void *)curr.compact < (void *)l ||
           (void *)curr.compact >= l->last_addr)) {
        node_free(curr);
      }
      if (!(l->compact) &&
          ((void *)curr.precise < (void *)l ||
           (void *)curr.precise >= l->last_addr)) {
        node_free(curr);
      }
    }
    curr = next;
    i++;
  }
  l->head = make_null_node(l->compact);
  l->tail = make_null_node(l->compact);
}

/***************************************************************************
*Remove an element from the list.
*
*INPUT: l: list from which to remove an element
* toremove: pointer to the element to be removed
***************************************************************************/
void list_remove_elt(SparseElementList *l, SparseNode toremove) {
  if (!l) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "list_remove_elt: null list.\n");
    }
    return;
  }
  if (null_node(toremove)) {
    return;
  }
  //unlink from the predecessor (or move the head)
  if (!null_node(prev_node(toremove))) {
    if (l->compact) {
      toremove.compact->prev->next = toremove.compact->next;
    } else {
      toremove.precise->prev->next = toremove.precise->next;
    }
  } else {
    if (l->compact) {
      l->head.compact = toremove.compact->next;
    } else {
      l->head.precise = toremove.precise->next;
    }
  }
  //unlink from the successor (or move the tail)
  if (!null_node(next_node(toremove))) {
    if (l->compact) {
      toremove.compact->next->prev = toremove.compact->prev;
    } else {
      toremove.precise->next->prev = toremove.precise->prev;
    }
  } else {
    if (l->compact) {
      l->tail.compact = toremove.compact->prev;
    } else {
      l->tail.precise = toremove.precise->prev;
    }
  }
  //free only if the node lives outside any mapped chunk [l, last_addr)
  if (l->compact) {
    if (!(l->last_addr) || (void *)toremove.compact < (void *)l ||
        (void *)toremove.compact >= l->last_addr) {
      node_free(toremove);
    }
  } else {
    if (!(l->last_addr) || (void *)toremove.precise < (void *)l ||
        (void *)toremove.precise >= l->last_addr) {
      node_free(toremove);
    }
  }
}

/***************************************************************************
*Check if a list is empty.
*
*INPUT: l: list to check.
*
*OUTPUT: 1 if l is empty, 0 else
***************************************************************************/
int list_is_empty(SparseElementList *l) {
  if (!l) {
    return 1;
  }
  return null_node((l->head));
}

/***************************************************************************
*Writes l to a file in binary format.
*
*INPUT: l: the array to write
* fp: pointer to file to write l in
*
*WARNINGS:
*1) l is written in a BINARY format. Use list_read to recover
*   l.
***************************************************************************/
size_t list_write(SparseElementList *l, FILE *fp) {
  size_t size;
  if (!l || !fp) {
    if (MATR_DEBUG_MODE) {
      fprintf(stderr, "list_write: null arguments.\n");
    }
    return 0;
  }
  //header struct first, then each node in order
  size = sizeof(SparseElementList)*fwrite(l, sizeof(SparseElementList), 1, fp);
  SparseNode curr = l->head;
  while (!null_node(curr)) {
    size += node_write(curr, fp);
    curr = next_node(curr);
  }
  return size;
}

/***************************************************************************
*Reads l from a file in binary format.
*
*INPUT: l: a sparse element list. if l contains any data it will be freed
* and overwritten.
* fp: pointer to file to read l from
* n_elts: the number of elements (nodes) to read into the list
*
*OUTPUT: the number of elements actually read.
*
*WARNINGS:
*1) If fp does not contain a properly formatted list as written
*   by the function list_write, this function will do its best,
*   but the results may be very bizarre. Check for an empty return.
***************************************************************************/ int list_read(SparseElementList *l, FILE *fp, int n_elts) { SparseNode n, pn; int i; size_t unused; if (!l || !fp || n_elts < 0) { if (MATR_DEBUG_MODE) { fprintf(stderr, "list_write: null arguments.\n"); } return 0; } if (!list_is_empty(l)) { list_clear(l); } l->last_addr = NULL; unused = fread(l, sizeof(SparseElementList), 1, fp); if (n_elts <= 0) { return 0; } l->head = node_read(l->compact, fp); pn = l->head; for (i = 1; i < n_elts; i++) { if (null_node(pn)) { break; } n = node_read(l->compact, fp); if (null_node(n)) { break; } if (l->compact) { pn.compact->next = n.compact; n.compact->prev = pn.compact; } else { pn.precise->next = n.precise; n.precise->prev = pn.precise; } pn = n; } if (i != n_elts) { if (!null_node(pn)) { if (l->compact) { pn.compact->next = NULL; } else { pn.precise->next = NULL; } } if (MATR_DEBUG_MODE) { fprintf(stderr, "list_read: Couldn't read in enough elements.\n"); } } l->tail = pn; return i; } /*************************************************************************** *Maps an list from a block of memory in binary format (the same *format as would be written to a file using list_write. * *INPUT: addr: pointer to the address where the list begins * last_addr: the last possible address that is valid. NOT necessarily where * the list ends - just the last address that has been allocated in the * chunk pointed to by *addr (ie, if *addr was taken from an mmap'd file * last_addr would be *addr + the file size). * n_elts_ptr: a pointer to a value containing the number of elements * in l that should be read. on return, this value is the number of * elements that actually were read. * *OUTPUT: A list STILL referencing the chunk of memory at *addr, * but formated as a list or NULL if a properly formatted * list didn't start at *addr. 
* *addr: (pass-by-reference) points to the first memory location AFTER the * full list * *n_elts_ptr: (pass-by-reference) the number of elements actually read * *WARNINGS: * 1) *addr needs to be writable. This will CHANGE VALUES stored at *addr and * will seg fault if addr is not writable. * 2) last_addr does not need to be the last address of the list * but if it is before that, either NULL will be returned or an * matrix with a NULL data value will be returned. * 3) if *addr does not contain a properly formatted list, this function * will not seg fault, but that is the only guarantee. * 4) you should call list_clear on this list unless you are CERTAIN you * have not changed the list. calling list_clear on an unchanged list * will not do anything. * 5) *addr and *n_elts_ptr CHANGE! ***************************************************************************/ SparseElementList *list_map(void **addr, void *last_addr, int *n_elts_ptr) { SparseElementList *l; SparseNode n, pn; int n_elts = *n_elts_ptr, i; if (!addr || !*addr || !last_addr || n_elts < 0 || *addr >= last_addr) { if (MATR_DEBUG_MODE) { fprintf(stderr, "list_map: null arguments.\n"); } *n_elts_ptr = 0; return NULL; } if (*addr + sizeof(SparseElementList) > last_addr) { if (MATR_DEBUG_MODE) { fprintf(stderr, "list_map: not enough memory for list.\n"); } *n_elts_ptr = 0; return NULL; } l = (SparseElementList *)(*addr); *addr += sizeof(SparseElementList); l->head = node_map(l->compact, addr, last_addr); pn = l->head; for (i = 1; i < n_elts; i++) { if (null_node(pn)) { break; } n = node_map(l->compact, addr, last_addr); if (null_node(n)) { break; } if (l->compact) { pn.compact->next = n.compact; n.compact->prev = pn.compact; } else { pn.precise->next = n.precise; n.precise->prev = pn.precise; } pn = n; } if (i != n_elts) { if (!null_node(pn)) { if (l->compact) { pn.compact->next = NULL; } else { pn.precise->next = NULL; } } *n_elts_ptr = i; if (MATR_DEBUG_MODE) { fprintf(stderr, "list_map: Couldn't read in 
enough elements.\n"); } } l->last_addr = *addr; l->tail = pn; return l; } /*************************************************************************** *Copies a list into a chunk of memory. to will not be completely identical * to from since pointer values will change and the value of last_addr will * change. to is, and can be treated as, a contiguous-memory form of from. * it is a SparseElementList and list_clear should be called before free'ing * the chunk of memory to belongs in. As with memmove this function does not * actually "move" anything out of from. * *INPUT: to: a block of memory with enough memory to hold the entire list * stored in from. * from: the list to be copied from. * *OUTPUT: A pointer to the first address AFTER the data was copied. In * other words this returns to + size(from) where size(from) is the size * in bytes of the full list from. * *WARNINGS: * 1) to needs to be writable. This will CHANGE VALUES stored at to and * will seg fault if to is not writable. * 2) this does NOT CHECK FOR OVERFLOW. to must have enough memory * already to contain from or this can cause a seg fault. * 3) unlike with memmove, this is not a completely byte-by-byte copy. * instead, to is a copy of the list from stored contiguously at to * with the same functionality as from. in other words, to can be * treated as a list. * 4) you should call list_clear on to unless you are CERTAIN you * have not changed it. calling list_clear on an unchanged list * will not do anything. * 5) like memmove, this actually copies, not moves. it DOES NOT FREE from. 
***************************************************************************/ void *list_memmove(void *to, SparseElementList *from) { void *curr; SparseNode n, tn, tpn; int i; if (!from || !to) { if (MATR_DEBUG_MODE) { fprintf(stderr, "list_memmove: null arguments.\n"); } return to; } *((SparseElementList *)to) = *from; curr = to + sizeof(SparseElementList); n = from->head; if (null_node(((SparseElementList *)to)->head)) { return curr; } if (from->compact) { ((SparseElementList *)to)->head.compact = (CompactSparseNode *)curr; curr += sizeof(CompactSparseNode); *(((SparseElementList *)to)->head.compact) = *(n.compact); ((SparseElementList *)to)->head.precise = NULL; } else { ((SparseElementList *)to)->head.precise = (PreciseSparseNode *)curr; curr += sizeof(PreciseSparseNode); *(((SparseElementList *)to)->head.precise) = *(n.precise); ((SparseElementList *)to)->head.compact = NULL; } tpn = ((SparseElementList *)to)->head; n = next_node(n); i = 1; tn.is_compact = from->compact; tpn.is_compact = from->compact; while (!null_node(n)) { if (from->compact) { tn.compact = (CompactSparseNode *)curr; curr += sizeof(CompactSparseNode); tn.compact->data = n.compact->data; tn.compact->prev = tpn.compact; tn.compact->next = NULL; tn.precise = NULL; tpn.compact->next = tn.compact; } else { tn.precise = (PreciseSparseNode *)curr; curr += sizeof(PreciseSparseNode); tn.precise->data = n.precise->data; tn.precise->prev = tpn.precise; tn.precise->next = NULL; tn.compact = NULL; tpn.precise->next = tn.precise; } n = next_node(n); tpn = tn; i++; } ((SparseElementList *)to)->tail = tpn; ((SparseElementList *)to)->last_addr = curr; return curr; } //writes a node to a file static inline size_t node_write(SparseNode n, FILE *fp) { if (null_node(n) || !fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "node_write: null arguments.\n"); } } if (n.is_compact) { return sizeof(CompactSparseNode)*fwrite(n.compact, sizeof(CompactSparseNode), 1, fp); } return sizeof(PreciseSparseNode)*fwrite(n.precise, 
sizeof(PreciseSparseNode), 1, fp); } //reads a node from a file static inline SparseNode node_read(int is_compact, FILE *fp) { SparseNode n = make_null_node(is_compact); size_t nr; if (!fp) { if (MATR_DEBUG_MODE) { fprintf(stderr, "node_read: bad file pointer.\n"); } return n; } if (n.is_compact) { n.compact = (CompactSparseNode *)malloc(sizeof(CompactSparseNode)); nr = fread(n.compact, sizeof(CompactSparseNode), 1, fp); if (!nr) { //end of file free(n.compact); return make_null_node(is_compact); } n.compact->next = NULL; n.compact->prev = NULL; return n; } n.precise = (PreciseSparseNode *)malloc(sizeof(PreciseSparseNode)); nr = fread(n.precise, sizeof(PreciseSparseNode), 1, fp); if (!nr) { //end of file free(n.precise); return make_null_node(is_compact); } n.precise->next = NULL; n.precise->prev = NULL; return n; } //maps a node in from memory //on finish, addr points to the address AFTER the node that was mapped in //if there was not enough memory between *addr and last_addr, *addr will //point AFTER LAST_ADDR and a null_node will be returned. 
static inline SparseNode node_map(int is_compact, void **addr, void *last_addr){ SparseNode n = make_null_node(is_compact); if (*addr >= last_addr) { if (MATR_DEBUG_MODE) { fprintf(stderr, "node_map: no memory.\n"); } return n; } if (n.is_compact) { n.compact = (CompactSparseNode *)(*addr); *addr += sizeof(CompactSparseNode); if (*addr > last_addr) { return make_null_node(is_compact); } n.compact->next = NULL; n.compact->prev = NULL; return n; } n.precise = (PreciseSparseNode *)(*addr); *addr += sizeof(PreciseSparseNode); if (*addr > last_addr) { return make_null_node(is_compact); } n.precise->next = NULL; n.precise->prev = NULL; return n; } //qsort comparison functions //a function for use with qsort that compares CompactExpandingTypes //by their integer values int compact_expanding_type_int_compare(const void *a, const void *b) { CompactExpandingType *ceta = (CompactExpandingType *)a, *cetb = (CompactExpandingType *)b; if (ceta->i < cetb->i) { return -1; } if (ceta->i > cetb->i) { return 1; } return 0; } //function to be passed to qsort that compares two PreciseExpandingType's by //the value. the sort will be in INCREASING value order. int precise_sparse_element_val_compare(const void *a, const void *b) { PreciseSparseElement *ra = (PreciseSparseElement *)a, *rb = (PreciseSparseElement *)b; if (ra->data < rb->data) { return -1; } if (ra->data > rb->data) { return 1; } return 0; } //function to be passed to qsort that compares two PreciseExpandingType's by //col number. 
the sort will be in DECREASING row order int precise_sparse_element_col_compare(const void *a, const void *b) { PreciseSparseElement *ra = (PreciseSparseElement *)a, *rb = (PreciseSparseElement *)b; if (ra->col > rb->col) { return -1; } if (ra->col < rb->col) { return 1; } return 0; } crm114-20100106-BlameMichelson.src/blacklist.mfp0000644000000000017500000000000011321154266017437 0ustar rootwsycrm114-20100106-BlameMichelson.src/crm114_config.h0000644000000000017500000005241111321154266017505 0ustar rootwsy// crm114_config.h -- Configuration for CRM114. // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. /////////////////////////////////////////////////////////////////// // Some things here you can change with relative impunity. // Other things, not so much. Where there are limiting factors // noted, please obey them or you may break something important. // And, of course, realize that this is GPLed software with // NO WARRANTY - make any changes and that goes double. /////////////////////////////////////////////////////////////////// #ifndef __CRM114_CONFIG_H__ #define __CRM114_CONFIG_H__ // Do you want all the classifiers? Or just the "production // ready ones"? Comment the next line out if you want everything. //#define PRODUCTION_CLASSIFIERS_ONLY // // // default size of the variables hashtable (a.k.a. the VHT) #define DEFAULT_VHT_SIZE 4095 // default limit on the control stack (for catching infinite loops, // not a preallocated variable) #define DEFAULT_CSTK_LIMIT 1024 // how many levels (pending operations) will we allow in // math evaluations. We _could_ have it be unlimited, but // this serves as an error catcher in runaway programs. #define DEFAULT_MATHSTK_LIMIT 1024 // default maximum number of lines in any program file #define DEFAULT_MAX_PGMLINES 10000 // define maximum number of INSERTs before we think we're in an // infinite loop... 
#define DEFAULT_MAX_INSERTS 1024 // default size of the data window: 8 megabytes. #define DEFAULT_DATA_WINDOW 8388608 //#define DEFAULT_DATA_WINDOW 16777216 //#define DEFAULT_DATA_WINDOW 1048576 // mmap cacheing length - only actually write out this often. // set to 0 to disable mmap cacheing and release files faster. // However, this has a negative speed impact. // I unset this from 0 -JB //#define UNMAP_COUNT_MAX 0 //#define UNMAP_COUNT_MAX 2 #define UNMAP_COUNT_MAX 1000 // What's the smallest chunk we actually want to bother reclaiming // on the fly out of the isolated data area "tdw". Set this to 1 // for agressive compression; values like 100 to 10K can speed up // execution of things that thrash the tdw badly; set to larger // than the data window size to completely disable the on-the-fly // reclaimer. Watch out though- values less than 1 can cause the // end of one variable to overlap the start of another; this causes // horrible problems. FOR LATER IMPROVEMENT: Start with a // relatively large reclaimer value, then decrease slowly as memory // becomes more scarce. #define MAX_RECLAIMER_GAP 5 // How many regex compilations do we cache? (this saves the time // to recompile regexes in a loop, but uses memory) Set to zero to // disable cacheing. Note that we cache the actual regex, not the // source code line, so this happens *after* the regex text is var // expanded; two different expressions that evaluate to the same // actual regex will share the same cache slot, which is pretty // cool. // // For programs that don't loop, or reuse the same regex a lot, // performance is slightly better with cacheing disabled. But if you // do reuse the same regexes tens or hundreds of times (say, lots of // LIAF-loops) then cacheing can accelerate your program significantly. // //#define CRM_REGEX_CACHESIZE 0 //#define CRM_REGEX_CACHESIZE 10 #define CRM_REGEX_CACHESIZE 1024 // // and how do we want the regex cache to work? 
RANDOM_ACCESS can // keep more things around, but is only 1 LRU deep for each slot so // use plenty of slots, like 256 or more. LINEAR_SEARCH is a // strict LRU cache but that's slower; don't use too many slots // with LINEAR_SEARCH or you'll spend more time searching the cache // than you would have spent just recompiling the regex. // // Be sure to turn on ONLY ONE of these !!!! // #define REGEX_CACHE_RANDOM_ACCESS //#define REGEX_CACHE_LINEAR_SEARCH // How big a space in a "standard header" (which is relatively new // and most classifiers don't support yet) do we want to use? Note // that changing this will break all previously generated statistics // files that use this standard header. #define STATISTICS_FILE_NCHUNKS 1024 #define STATISTICS_FILE_IDENT_STRING_MAX 1024 #define CLASSNAME_TAG_LENGTH 32 // do we use Sparse Binary Polynomial Hashing (sensitive to both // sequence and spacing of individual words), Token Grab Bag, or // Token Sequence Sensitive? Testing against the SpamAssassin // "hard" database shows that SBPH, TGB, and TGB2, are somewhat // more accurate than TSS, and about 50% more accurate than First // Order Only. However, that's for English, and other natural // languages may show a different statistical distribution. // // Choose ONE of the following: // SBPH, TGB2, TGB, TSS, or ARBITRARY_WINDOW_LEN: // // *** DANGER, WILL ROBINSON *** You MUST rebuild your .css files from // samples of text if you change this. // // // Sparse Binary Polynomial Hashing #define SBPH // // Token Grab Bag, noaliasing //#define TGB2 // // Token Grab Bag, aliasing //#define TGB // // Token Sequence Sensitive //#define TSS // // First Order Only (i.e. single words, like SpamBayes) // Note- if you use FOO, you must turn off weights!! //#define FOO // // Generalized format for the window length. // // DO NOT SET THIS TO MORE THAN 10 WITHOUT LENGTHENING hctable // the classifier modules !!!!!! 
"hctable" contains the pipeline // hashing coefficients and needs to be extended to 2 * WINDOW_LEN // // Generic window length code //#define ARBITRARY_WINDOW_LENGTH // #define MARKOVIAN_WINDOW_LEN 5 // #define OSB_BAYES_WINDOW_LEN 5 // // DO NOT set this to more than 5 without lengthening the // htup1 and htup2 tables in crm_unified_bayes.c // #define UNIFIED_BAYES_WINDOW_LEN 5 // // Unified tokenization pipeline length. // maximum window length _ever_. #define UNIFIED_WINDOW_LEN 32 // // maximum number of weight vectors to be applied to the pipeline #define UNIFIED_VECTOR_LIMIT 256 //// // Winnow algorithm parameters here... // #define OSB_WINNOW_WINDOW_LEN 5 #define OSB_WINNOW_PROMOTION 1.23 #define OSB_WINNOW_DEMOTION 0.83 // // Now, choose whether we want to use the "old" or the "new" local // probability calculation. The "old" one works slightly better // for SBPH and much better for TSS, the "new" one works slightly // better for TGB and TGB2, and _much_ better for FOO // // The current default (not necessarily optimal) // is Markovian SBPH, STATIC_LOCAL_PROBABILITIES, // LOCAL_PROB_DENOM = 16, and SUPER_MARKOV // //#define LOCAL_PROB_DENOM 2.0 #define LOCAL_PROB_DENOM 16.0 //#define LOCAL_PROB_DENOM 256.0 #define STATIC_LOCAL_PROBABILITIES //#define LENGTHBASED_LOCAL_PROBABILITIES // //#define ENTROPIC_WEIGHTS //#define MARKOV_WEIGHTS #define SUPER_MARKOV_WEIGHTS //#define BREYER_CHHABRA_SIEFKES_WEIGHTS //#define BREYER_CHHABRA_SIEFKES_BASE7_WEIGHTS //#define BCS_MWS_WEIGHTS //#define BCS_EXP_WEIGHTS // // // Do we use learncount-based normalization in calculating probabilities? #define OSB_LEARNCOUNTS // // Do we take only the maximum probability feature? // //#define USE_PEAK // // // Should we use stochastic microgrooming, or weight-distance microgrooming- // Make sure ONE of these is turned on. //#define STOCHASTIC_AMNESIA #define WEIGHT_DISTANCE_AMNESIA #if (! defined (STOCHASTIC_AMNESIA) && ! 
defined (WEIGHT_DISTANCE_AMNESIA)) #error Neither STOCHASTIC_AMNESIA nor WEIGHT_DISTANCE_AMNESIA defined #elif (defined (STOCHASTIC_AMNESIA) && defined (WEIGHT_DISTANCE_AMNESIA)) #error Both STOCHASTIC_AMNESIA and WEIGHT_DISTANCE_AMNESIA defined #endif // // define the default max chain length in a .css file that triggers // autogrooming, the rescale factor when we rescale, and how often // we rescale, and what chance (mask and key) for any particular // slot to get rescaled when a rescale is triggered for that slot chain. //#define MICROGROOM_CHAIN_LENGTH 1024 #define MICROGROOM_CHAIN_LENGTH 256 //#define MICROGROOM_CHAIN_LENGTH 64 #define MICROGROOM_RESCALE_FACTOR .75 #define MICROGROOM_STOCHASTIC_MASK 0x0000000F #define MICROGROOM_STOCHASTIC_KEY 0x00000001 #define MICROGROOM_STOP_AFTER 32 // maximum number of buckets groom-zeroed #define FEATURE_HIT_INCREMENT_SIZE 7 // define the "block ratio" of how of a memory data window we're // willing to suck in from a minion process before we block on // sucking; the un-sucked part just waits in the minion's stdout // buffer (and causes the minion to block on output). Normally a // factor of 2 (1/4th of the size of a full memory window, or 2 // megabytes in the default configuraton) is sufficient. #define SYSCALL_WINDOW_RATIO 2 // define default internal debug level #define DEFAULT_INTERNAL_TRACE_LEVEL 0 // define default user debug level #define DEFAULT_USER_TRACE_LEVEL 0 // define maximum number of parenthesized sub regexes we'll accept #define MAX_SUBREGEX 256 // define maximum bracket depth nesting we'll allow.... 
#define MAX_BRACKETDEPTH 256 // define maximum number of iterations allowed for EVAL expansion //#define MAX_EVAL_ITERATIONS 16384 //#define MAX_EVAL_ITERATIONS 1024 #define MAX_EVAL_ITERATIONS 4096 // define maximum size of a pattern in bytes #define MAX_PATTERN 16384 // and how long can a variable name be #define MAX_VARNAME 2048 // define the default number of buckets in a learning file hash table // (note that this should be a prime number, or at least one with a // lot of big factors) // // this value (2097153) is one more than 2 megs, for a .css of 24 megs //#define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 2097153 // // this value (1048577) is one more than a meg, for a .css of 12 megs // for the Markovian, and half that for OSB classifiers #define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 1048577 #define DEFAULT_MARKOVIAN_SPARSE_SPECTRUM_FILE_LENGTH 1048577 #define DEFAULT_OSB_BAYES_SPARSE_SPECTRUM_FILE_LENGTH 524287 // Mersenne prime #define DEFAULT_WINNOW_SPARSE_SPECTRUM_FILE_LENGTH 1048577 //#define DEFAULT_BIT_ENTROPY_FILE_LENGTH 2000000 #define DEFAULT_BIT_ENTROPY_FILE_LENGTH 1000000 // ??? #define OSB_BAYES_MAX_FEATURE_COUNT DEFAULT_OSB_BAYES_SPARSE_SPECTRUM_FILE_LENGTH #define WINNOW_MAX_FEATURE_COUNT DEFAULT_WINNOW_SPARSE_SPECTRUM_FILE_LENGTH // For the hyperspace matcher, we need to define a few things. #define HYPERSPACE_MAX_FEATURE_COUNT 500000 // Stuff for bit-entropic configuration // Define the size of our alphabet, and how many bits per alph. #define ENTROPY_ALPHABET_SIZE 2 #define ENTROPY_CHAR_SIZE 1 #define ENTROPY_CHAR_BITMASK 0x1 // What fraction of the nodes in a bit-entropic file should be // referenceable from the FIR prior arithmetical encoding // lookaside table? 0.01 is 1% == average of 100 steps to find // the best node. 0.2 is 20% or 5 steps to find the best node. 
#define BIT_ENTROPIC_FIR_LOOKASIDE_FRACTION 0.1 #define BIT_ENTROPIC_FIR_LOOKASIDE_STEP_LIMIT 128 #define BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT 0.5 #define BIT_ENTROPIC_SHUFFLE_HEIGHT 1024 // was 256 #define BIT_ENTROPIC_SHUFFLE_WIDTH 1024 // was 256 #define BIT_ENTROPIC_PROBABILITY_NERF 0.0000000000000000001 // Defines for the svm classifier // All defines you should want to use without getting into // the nitty details of the SVM are here. For nitty detail // defines, see crm_svm_matrix_util.h, crm_svm_quad_prog.h, // crm_svm_matrix.h, and crm_svm_lib_fncts.h #define MAX_SVM_FEATURES 100000 //per example #define SVM_INTERNAL_TRACE_LEVEL 3 //the debug level when internal_trace is //on #define SVM_ACCURACY 1e-3 //The accuracy to which to run the solver //This is the average margin violation NOT //accounted for by the slack variable. #define SV_TOLERANCE 0.01 //An example is a support vector if //theta*y*x <= 1 + SV_TOLERANCE. //The smaller SV_TOLERANCE, the fewer //examples will be tagged as support //vectors. This will make it faster to //learn new examples, but possibly less //accurate. #define SVM_ADD_CONSTANT 1 //Define this to be 1 if you want a //constant offset in the classification //ie h(x) = theta*x + b where b is //the offset. If you don't want //a constant offset (just h(x) = theta*x), //define this to be 0. #define SVM_HOLE_FRAC 0.25 //Size of the "hole" left at the end of //the file to allow for quick appends //without having to forcibly unmap the //file. This is as a fraction of the //size of the file without the hole. So //setting it to 1 doubles the file size. //If you don't want a hole left, set //this to 0. #define SVM_MAX_SOLVER_ITERATIONS 200 //absolute maximum number of loops the //solver is allowed #define SVM_CHECK 100 //every SVM_CHECK we look to see if //the accuracy is better than //SVM_CHECK_FACTOR*SVM_ACCURACY. //If it is, we exit the solver loop. 
#define SVM_CHECK_FACTOR 2 //every SVM_CHECK we look to see if //the accuracy is better than //SVM_CHECK_FACTOR*SVM_ACCURACY. //If it is, we exit the solver loop. //defines for SVM microgrooming #define SVM_GROOM_OLD 10000 //we groom only if there are this many //examples (or more) not being used in //solving #define SVM_GROOM_FRAC 0.9 //we keep this fraction of examples after //grooming //defines for svm_smart_mode #define SVM_BASE_EXAMPLES 1000 //the number of examples we need to see //before we train #define SVM_INCR_FRAC 0.1 //if more than this fraction of examples //are appended, we do a fromstart rather //than use the incremental method. // Defines for the PCA classifier // All defines you should want to use without getting into // the nitty details of the PCA are here. For nitty detail // defines, see crm_svm_matrix_util.h and crm_pca_lib_fncts.h #define MAX_PCA_FEATURES 100000 //per example #define PCA_INTERNAL_TRACE_LEVEL 3 //the debug level when internal_trace is on #define PCA_ACCURACY 1e-8 //accuracy to which to run the solver #define MAX_PCA_ITERATIONS 1000 //maximum number of solver iterations #define PCA_CLASS_MAG 50 //the starting class magnitudes. if this //is too small, the solver will double it //and resolve. if it is too large, the //solver will be less accurate. #define PCA_REDO_FRAC 0.001 //if we get this fraction of training //examples wrong with class mag enabled, we //retrain with class mag doubled. #define PCA_MAX_REDOS 20 //The maximum number of redos allowed when //trying to find the correct class mag. #define PCA_HOLE_FRAC 0.25 //Size of the "hole" left at the end of //the file to allow for quick appends //without having to forcibly unmap the file. //This is as a fraction of the size of the //file without the hole. So setting it to //1 doubles the file size. If you don't //want a hole left, set this to 0. 
//defines for PCA microgrooming #define PCA_GROOM_OLD 10000 //we groom only if there are this many //examples (or more) present #define PCA_GROOM_FRAC 0.9 //we keep this fraction of examples after //grooming // define the maximum length of a filename // #define MAX_FILE_NAME_LEN 255 // defaults to system's, if any #ifdef NAME_MAX #define MAX_FILE_NAME_LEN NAME_MAX+1 #else #ifdef FILENAME_MAX #define MAX_FILE_NAME_LEN FILENAME_MAX+1 #else #define MAX_FILE_NAME_LEN 256 #endif #endif // define how many microseconds to sleep waiting for a minion process // to complete: //#define MINION_SLEEP_USEC 1000000 //#define MINION_SLEEP_USEC 10000 //#define MINION_SLEEP_USEC 1000 //#define MINION_SLEEP_USEC 100 #define MINION_SLEEP_USEC 10 // How many microseconds to sleep if we're looping on input WINDOW stmt. // try 1 millisecond for now #define INPUT_WINDOW_SLEEP_USEC 1000 // DANGER DANGER DANGER DANGER DANGER // CHANGE THESE AT YOUR PERIL- YOUR .CSS FILES WILL NOT BE // FORWARD COMPATIBLE WITH ANYONE ELSES IF YOU CHANGE THESE. // // Maximum number of different .CSS files in a CLASSIFY #define MAX_CLASSIFIERS 128 // how many classes can the library support? #define LIBCRM_MAX_CLASSES MAX_CLASSIFIERS // Maximum length of a stored regex (ugly! But we need a max length // in the mapping. GROT GROT GROT ) #define MAX_REGEX 4096 // Maximum number of coeffs for a particular pipeline. (ugly! But we // need a max length for easy mapping. GROT GROT GROT ) #define MAX_PIPECOEFFS 512 #define MAX_CLASSIFIER_PARAMS 1024 // Define the type of a token. This should be either 32-bit or // 64-bit. Note that some (for now, all!) classifiers will ignore this. typedef int CRM114_TOKEN; // typedef double CRM114_TOKEN; // /// END OF DANGER DANGER DANGER DANGER ///////////////////////////////////////////////////////////////////// // Maximum number of nonfatal errors we'll allow before tossing our // cookies on a fatal error #define MAX_NONFATAL_ERRORS 100 // How big is a feature bucket? 
Is it a byte, a short, a long, // a float, whatever. :) //#define FEATUREBUCKET_VALUE_MAX 32767 #define FEATUREBUCKET_VALUE_MAX 1000000000 #define FEATUREBUCKET_HISTOGRAM_MAX 4096 //////////////////////////////////////////// // // Improved FSCM-specific parameters // ///////////////////////////////////////////// // this is 2^18 + 1 // This determines the tradeoff in memory vs. speed/accuracy. //define FSCM_DEFAULT_HASH_TABLE_SIZE 262145 // // This is 1 meg + 1 #define FSCM_DEFAULT_HASH_TABLE_SIZE 1048577 // How long are our prefixes? Original prefix was 3 but that's // rather suboptimal for best speed. 6 looks pretty good for speed and // accuracy. // prefix length 6 and thickness 10 (200 multiplier) yields 29 / 4147 // //#define FSCM_DEFAULT_CODE_PREFIX_LEN 3 #define FSCM_DEFAULT_CODE_PREFIX_LEN 6 // The chain cache is a speedup for the FSCM match // It's indexed modulo the chainstart, with associativity 1.0 #define FSCM_CHAIN_CACHE_SIZE 1048577 //////////////////////////////////////////// // // Neural Net parameters // //////////////////////////////////////////// #define NN_RETINA_SIZE 8192 #define NN_FIRST_LAYER_SIZE 8 #define NN_HIDDEN_LAYER_SIZE 8 #define NN_MAX_FEATURES 65536 // Neural Net training setups // // Note- convergence seems to work well at // alpha 0.2 init_noise 0.5 stoch_noise 0.1 gain_noise 0.00000001 // alpha 0.2 init_noise 0.2 stoch_noise 0.1 gain_noise 0.00000001 // alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 0.00000001 // alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 0.00000001 // alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 2.0 // alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 2.0 zerotr 0.9999 #define NN_DEFAULT_ALPHA 0.2 // Initialization noise magnitude #define NN_INITIALIZATION_NOISE_MAGNITUDE 0.2 // Stochastic noise magnitude #define NN_DEFAULT_STOCH_NOISE 0.05 // Gain noise magnitude #define NN_DEFAULT_GAIN_NOISE 2.0 // Zero-tracking factor - factor the weights move toward zero every epoch #define 
NN_ZERO_TRACKING 0.9999 // Threshold for back propagation #define NN_INTERNAL_TRAINING_THRESHOLD 0.1 // Just use 1 neuron excitation per token coming in. #define NN_N_PUMPS 1 // How many training cycles before we punt out #define NN_MAX_TRAINING_CYCLES 500 // When doing a "nuke and retry", allow this many training cycles. #define NN_MAX_TRAINING_CYCLES_FROMSTART 5000 // How often do we cause a punt (we punt every 0th epoch modulo this number) #define NN_FROMSTART_PUNTING 10000000 // After how many "not needed" cycles do we microgroom this doc away? #define NN_MICROGROOM_THRESHOLD 1000000 // use the sparse retina design? No, it's not good. #define NN_SPARSE_RETINA 0 // End of configurable parameters. #endif // !_CRM114_CONFIG_H_ crm114-20100106-BlameMichelson.src/crm_svm_quad_prog.c0000644000000000017500000012153611321154266020660 0ustar rootwsy#include "crm_svm_quad_prog.h" // crm_svm_quad_prog.c - Support Vector Machine //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. /****************************************************************** *We use the active set method outlined in Gill and Murray 1977 *"Numerically Stable Methods for Quadratic Programming" *Philip E. Gill and Walter Murray *Mathematical Programming 14 (1978) 349-372 *North-Holland Publishing Company *We want to solve a constrained quadratic problem of the form *min_x f(x) = 0.5xGx + c\dot x *subject to Ax >= b *for POSITIVE DEFINITE G. *First consider the problem without the constraints. 
We could just *solve for f'(x) = Gx + c = 0, which is a linear problem. * *Now add in the constraints. We would like to get near f'(x) = 0 *while still fulfilling all contraints on x. Therefore we use *an iterative method where we walk down the gradient, adding constraints *as they become active (ie constraint i is active if (Ax)_i = b_i) and *removing them as they become inactive ((Ax)_i > b_i). * *When a constraint is active, we only allow descent along directions *perpendicular to the constraint so that we will never violate it. *Specifically, at each iteration we solve for the direction p in * G'(x+p) + c' = 0 *where G' and c' are projections of G and c along directions orthogonal *to the current active constraints. This requires us to keep a projection *matrix Z such that G' = Z^T*G*Z and c' = Z*c. In order to calculate this *projection matrix quickly, we also keep the QR factorization of the current *ACTIVE constraint matrix A'^T (where A' contains only the rows of A that are *that are currently active. If we have k constraints then A' is k x n and *A'^T is n x k). The QR factorization of A'^T is given by two matrices Q *and R such that *QA'^T = |R| * |0_{n-k x k}| *where Q is nxn and the rows of Q are orthonormal (so Q*Q^T = I_nxn) and R *is kxk upper triangular. *Note that the product of the last n-k rows of Q with any row of A' is zero. *Therefore, the last n-k rows of Q are a projection matrix onto a space *orthogonal to any active constraints and the columns of Z are the last n-k *rows of Q. It turns out we can update Q and R quickly when we add or remove *a row from A' (corresponding to a constraint becoming active or inactive). *For exacly how these updates work, see the add_constraint and *delete_constraint functions. * *The steps of the algorithm are then * 1) Find an initial starting point that fulfills the constraints. * This is actually a linear minimization problem and one we do * not actually solve. 
If a starting point is passed in by the user * (indicated by x != 0), we use that. Note we do not check to make * sure it fulfills all constraints. If no starting point is passed in * (x = 0), we assume that Ax >= b is always * of the form passed in by the linear solver specifying that x >= 0 * and sum_x >= C and set x = C/(n+1) where G is nxn. This is NOT * a general method for finding a starting point and would need to * be improved upon to make this a general solver. * 2) Solve for the direction p such that G'(x+p) + c' = 0. * See find_direction for an overview of how we solve for p. * 3) If ||p|| ~= 0, solve for the Lagrange mulipliers on the active * constraints. If these are all positive (ie, all of the active * constraints are still active), return x. Otherwise, pick an active * constraint with a negative mulitplier and remove it and return to step 2. * For an overview of how to find the Lagrange multipliers see the * compute_lambda function. * For an overview of how to remove a constraint see the delete_constraint * function. * 3) Find how far we can go in direction p without violating a constraint. * 4) If we can move ||p|| in direction p without violating a constraint, * update x -> x + p and return to step 2. * 5) Otherwise, move as far as possible in direction p and add the first * constraint we hit to the set of active constraints. Repeat from step 2. * For an overview of how to add a constraint see the add_constraint * function. * *INPUT: Matrices G, A, vectors c, b where the solution will be * min_x 0.5*x*G*x + c*b * s.t. Ax >= b * if x is non-zero it is assumed to be a starting point such that * Ax >= b * *OUTPUT: x fulfilling the equation above as pass by reference. * *TYPES: Really depends on how sparse G and A are. Nothing in this function * takes its type from the arguments passed in so pick what you like. 
* However, the sparse arrays in this function will initialize with
* the size passed in with A so even if A is not a sparse array, you need
* to set its initial size.
*
*WARNING: I think I (finally) have all of the bugs out of this algorithm
* but it does tend to be picky (especially if QP_LINEAR_ACCURACY is set to a
* high number so the conjugate gradient solver doesn't give a great answer)
* so I've put a maximum number of loops on it.
***********************************************************************/
void run_qp(Matrix *G, Matrix *A, Vector *c, Vector *b, Vector *x) {
  //tk      = number of currently active constraints
  //active[i] = 1 iff constraint i is active
  //constr[j] = constraint number held in column j of A'^T, -1 if unused
  //agz     = set once we have taken a step with alpha > SVM_EPSILON
  //NOTE(review): constr is declared with n entries but the init loop and
  //the removal-shift loop below both run over m entries - looks out of
  //bounds whenever m > n; confirm callers always have m <= n.
  //NOTE(review): oldtk is assigned each iteration but never read.
  int tk = 0, n = A->cols, m = A->rows, oldtk = 0, neg_lambda, toadd,
    toremove, i, active[m], constr[n], loop_it = 0, def_val, agz = 0;
  //gradient g = G*x + c
  Vector *g = vector_make_size(n, SPARSE_LIST, MATR_PRECISE, A->size),
    //direction G*(x+p) + c = 0
    *p = vector_make_size(n, SPARSE_LIST, MATR_PRECISE, A->size),
    *row;
  double alpha, r, dotp, length, bval;
  //R and Q are QR factorization of the current constraint matrix
  Matrix *R = matr_make_size(0,0, SPARSE_ARRAY, MATR_PRECISE, A->size),
    *Q = matr_make_size(n, n, NON_SPARSE, MATR_PRECISE, A->size),
    //projection matrix
    *Z = matr_make_size(n, n, SPARSE_ARRAY, MATR_PRECISE, A->size);
  //it is not actually necessary to keep the active constraint matrix
  //separately from A.  However, I have left it commented out to make
  //it clear which constraints are currently active.
  //*currA = matr_make(0,0);
  VectorIterator vit;

  MATR_DEBUG_MODE = QP_DEBUG_MODE;

  //debug mode
  if (QP_DEBUG_MODE >= QP_DEBUG) {
    //print out the arguments
    fprintf(stderr, "Arguments are\nn = %d, m = %d\nG = \n", n, m);
    matr_print(G);
    fprintf(stderr, "A = \n");
    matr_print(A);
    fprintf(stderr, "f = ");
    vector_print(c);
    fprintf(stderr, "b = ");
    vector_print(b);
    fprintf(stderr, "x0 = ");
    vector_print(x);
  }

  for (i = 0; i < m; i++) {
    //active[i] = 1 if constraint i is active
    active[i] = 0;
    //constr[i] = the column of A'^T corresponding to constraint i if
    //i is active.  -1 if i is inactive.
    constr[i] = -1;
  }

  //initialize Q and Z to be the identity matrix
  //this is fast if Q and Z are sparse
  //because they were zero matrices before
  for (i = 0; i < n; i++) {
    matr_set(Q, i, i, 1);
    matr_set(Z, i, i, 1);
  }

  //finding an initial feasible point is a chore in and of itself
  //for what i'm doing, because i know the constraints, i know that
  //this vector works, but it may NOT in the general case
  //if you need a general solver, this step needs to be a linear
  //optimization problem
  if (vector_iszero(x)) {
    if (QP_DEBUG_MODE >= QP_DEBUG) {
      fprintf(stderr, "No inital guess. Using default guess.\n");
    }
    //otherwise x is an initial starting point
    //NOTE(review): def_val is an int but vectorit_curr_val presumably
    //returns the element value - confirm truncation here is intended.
    vectorit_set_at_beg(&vit, b);
    def_val = vectorit_curr_val(vit, b);
    for (i = 0; i < n; i++) {
      vector_set(x, i, def_val/(double)(n+1));
    }
  }

  //main active-set loop: each pass either moves x down the projected
  //gradient, adds the first constraint hit, or drops a constraint whose
  //Lagrange multiplier went negative.
  while (loop_it < QP_MAX_ITERATIONS) {
    toadd = -1;
    toremove = -1;
    //calculate the gradient
    matr_vector(G, x, g);    //sparse times non-sparse = fast
    vector_add(g, c, g);     //sparse + non-sparse
    //find the direction
    //we want g = Gx + c = 0 (gradient)
    //so direction we need to go is p
    //where G(x+p) + c = 0
    //so Gp = -g
    //we will project only onto the directions
    //of inactive constraints (since we can't
    //move a constrained direction)
    //Z is the matrix we use to project onto these directions
    if (tk < n) {
      length = find_direction(Z, G, g, p);
    } else {
      //all n directions constrained: no free direction to move in
      length = 0;
    }
    if (QP_DEBUG_MODE >= QP_DEBUG_LOOP) {
      fprintf(stderr, "x = ");
      vector_print(x);
      if (tk < n) {
	fprintf(stderr, "p = ");
	vector_print(p);
      }
      fprintf(stderr, "length = %.10lf, tk = %d\n", length, tk);
    }
    if (length <= QP_LINEAR_ACCURACY) {
      //step is (numerically) zero: check the multipliers
      if (tk > 0) {
	neg_lambda = compute_lambda(R, Q, g);
      } else {
	//no constraints are active
	//we reached the actual minimum - yay!
	neg_lambda = -1;
      }
      if (neg_lambda < 0) {
	//all currently "active" constraints actually are active
	//this is the solution
	break;
      } else {
	//an active constraint became inactive somewhere along
	//the way
	//remove it and see if we can move in that direction
	toremove = neg_lambda;
      }
    } else {
      alpha = SVM_INF;   //alpha is the step size
      //find the step size: smallest fraction of p we can move before an
      //inactive constraint becomes violated
      vectorit_set_at_beg(&vit, b);
      for (i = 0; i < m; i++) {
	if (!active[i]) {
	  row = matr_get_row(A, i);
	  //part of p in direction of constraint
	  dotp = dot(row, p);   //sparse times sparse
	  if (QP_DEBUG_MODE >= QP_LINEAR_SOLVER) {
	    fprintf(stderr, "dotp = %.10f\n", dotp);
	  }
	  if (dotp < -QP_LINEAR_ACCURACY) {
	    //vit walks b in step with i; a missing column means b_i = 0
	    if (vectorit_curr_col(vit, b) != i) {
	      bval = 0;
	    } else {
	      bval = vectorit_curr_val(vit, b);
	    }
	    //step size for this constraint
	    r = (bval - dot(row, x))/dotp;   //sparse times sparse
	    if (QP_DEBUG_MODE >= QP_LINEAR_SOLVER) {
	      fprintf(stderr, "bval = %f r = %f dot(row, x) = %.10f\n",
		      bval, r, dot(row, x));
	    }
	    if (r < alpha) {
	      //this is the most constrained constraint so far
	      alpha = r;
	      toadd = i;
	    }
	  }
	}
	if (vectorit_curr_col(vit, b) == i) {
	  vectorit_next(&vit, b);
	}
      }
      if (agz && alpha < SVM_EPSILON) {
	//we actually can't move along this constraint
	//this means we must have added the constraint at some
	//point in the past, then removed it, then tried to
	//add it again WITHOUT actually being able to move in this
	//direction.  This happens when the linear solver is not quite
	//accurate enough (through the fact that we consider SVM_EPSILON
	//to be zero I believe) and something that should be just above
	//zero is instead just below.  By the time we're looking at removing
	//constraints with such small leeway though, we've got a good
	//enough answer.  So give up.
	//
	//The agz term is in case our original x0 starting point is already
	//up against some constrained directions.  The first thing we do, then
	//is add those constraints.  In this case, we want to keep going even
	//though we are adding constraints along which we can't move at all -
	//after we have added the first alpha non-zero constraint, we need to
	//start checking for alpha = 0.
	break;
      }
      if (alpha > SVM_EPSILON) {
	agz = 1;
      }
      if (alpha > 1) {
	//we can move ||p|| in direction p
	//there is no new active constraint
	alpha = 1;
	toadd = -1;
      } else {
	//we must move less than ||p|| because we hit a constraint
	//move as far as we can and add the constraint
	vector_multiply(p, alpha, p);   //sparse
	if (QP_DEBUG_MODE >= QP_LINEAR_SOLVER) {
	  fprintf(stderr, "alpha = %f\n", alpha);
	}
      }
      vector_add(x, p, x);   //sparse + sparse
    }
    oldtk = tk;
    if (toadd >= 0) {
      if (QP_DEBUG_MODE >= QP_DEBUG_LOOP) {
	fprintf(stderr, "Adding constraint %d\n", toadd);
      }
      //add constraint
      active[toadd] = 1;
      constr[tk] = toadd;   //column tk contains toadd
      add_constraint(toadd, A, Q, R, &Z);
      tk++;
    }
    if (toremove >= 0) {
      if (QP_DEBUG_MODE >= QP_DEBUG_LOOP) {
	fprintf(stderr, "Removing constraint %d to remove = %d\n",
		constr[toremove], toremove);
      }
      //remove constraint
      active[constr[toremove]] = 0;
      delete_constraint(toremove, A, Q, R, &Z);
      //we need to keep track of which constraints correspond to which rows
      for (i = toremove; i < m-1; i++) {
	constr[i] = constr[i+1];
      }
      tk--;
    }
    loop_it++;
  }
  if (QP_DEBUG_MODE && loop_it >= QP_MAX_ITERATIONS) {
    fprintf(stderr, "QP didn't converge. Returning.\n");
  }
  if (QP_DEBUG_MODE >= QP_DEBUG) {
    fprintf(stderr, "Max iterations was %d\n", loop_it);
  }
  //free the working matrices/vectors; the answer is returned in x
  matr_free(Q);
  matr_free(R);
  //matr_free(currA);
  matr_free(Z);
  vector_free(g);
  vector_free(p);
}

/**************************************************************************
*We need to add row toadd of A to the current set of active constraints
*and update Q, R, and Z accordingly.
*Specifically we are adding a column to A'^T making it n x k+1.  To make
*this clear let A^k = A'^T before we add a column and A^{k+1} = A'^T after
*we add a column.  We will use the same convention for Q, R, and Z.
Then *Q^k*A^{k+1} = |R^k Q*a_{1:k} | * | 0 Q*a_{k+1:n}| *where a is the row of A we are adding. *So R is almost upper triangular still except for its last column. To *make it completely upper triangular, we use Givens rotations. These are *used in the Gill and Murray paper, but I believe the original reference *is *Givens, Wallace. *"Computation of plane unitary rotations transforming a general matrix *to triangular form". *J. SIAM 6(1) (1958), pp. 26–50. *If you are just trying to understand them, though, I would recommend the *Wikipedia article. * *To make R upper triangular will require n-(k+1) rotations of the form * | I_{i-1, i-1} 0 0 0_{n-i-1} | * | 0 c_i s_i 0_{n-i-1} | * | 0 -s_i c_i 0_{n-i-1} | * | 0 0 0 I_{n-i-1, n-i-1} | *where * c_i = (Q*a)_(i-k-1)/r_i * s_i = r_{i-1}/r_i * r_i = sqrt(sum_{j = i:n-k+1} Q*a_{j-k-1}^2) *We multiply Q on the left by each of these matrices *each multiplication affects only 2 rows of Q, giving a total of n(n-tk) *time. *Note that Q remains orthonormal since these are rotation matrices. * *INPUT: toadd: the number of the row of A corresponding to the constriant * we are adding * A: the full constraint matrix in A*x >= b * *OUTPUT: Updated QR factorization of A'^T (pass by reference) * Rotation matrix Z. We actually delete and reallocate Z so we need a * double pointer. * *TYPES: It should be easy to add a row and column to R and therefore * R should be some sort of sparse matrix. It is upper triangular so it * is already guaranteed to be ~50% zeros. * Q never changes size so it can be sparse or non-sparse. * Z needs to be freed and reallocated - this will be fastest with a * SPARSE_ARRAY. 
**************************************************************************/
void add_constraint(int toadd, Matrix *A, Matrix *Q, Matrix *R,
		    Matrix **Z_ptr) {
  Matrix Q2, *Z = *Z_ptr;
  unsigned int tk = R->rows, n = Q->rows, i, j, col;
  int rotate = 1, zsize = Z->size;
  //Q2a will hold Q2*a (a = the new constraint row); trow is scratch for
  //the Givens row updates.  NOTE(review): brow is allocated and freed but
  //never otherwise used in this function.
  Vector *Q2a = vector_make_size(n-tk, R->type, MATR_PRECISE, R->size),
    *trow = vector_make_size(n, Q->type, MATR_PRECISE, Q->size),
    *brow = vector_make_size(n, Q->type, MATR_PRECISE, Q->size),
    *arow;
  VectorType ztype = Z->type;
  double r2, c_i, s_i, r, uval, gamma;
  VectorIterator vit;

  if (QP_DEBUG_MODE >= QP_CONSTRAINTS) {
    fprintf(stderr, "Before adding constraint %d.\n", toadd);
    fprintf(stderr, "Q = \n");
    matr_print(Q);
    fprintf(stderr, "R = \n");
    matr_print(R);
    fprintf(stderr, "Z = \n");
    matr_print(*(Z_ptr));
  }

  //currA gets another column
  //matr_add_col(currA_ptr);
  //currA = *currA_ptr;
  //for (i = 0; i < currA->rows; i++) {
  //note that currA transposes A
  //currA->data[i][currA->cols-1] = A->data[toadd][i];
  //}

  //R^(k+1) is
  //|R^k Q^k*A(toadd)_{1:k}          |
  //| 0  ||Q^k_2A(toadd)_{k+1:n-k}|| |
  matr_add_col(R); //sparse = fast
  matr_add_row(R);

  //set up Q2 (really just the bottom n-tk rows of Q)
  //Q2 aliases Q's storage; it is never freed separately
  Q2.rows = n-tk;
  Q2.cols = n;
  Q2.compact = Q->compact;
  Q2.type = Q->type;
  Q2.size = Q->size;
  Q2.data = &(Q->data[tk]);

  arow = matr_get_row(A, toadd);
  matr_vector(&Q2, arow, Q2a);
  gamma = norm(Q2a);
  //new bottom-right entry of R: ||Q2*a|| (or its signed first entry when
  //no rotation is needed)
  vectorit_set_at_end(&vit, matr_get_row(R, tk));
  if (fabs(gamma + vector_get(Q2a, 0)) < SVM_EPSILON) {
    //R is already upper triangular
    //and the first entry of Q2a is actually -Q2a
    //we won't rotate R at all, so this entry needs to have the
    //correct sign
    vectorit_insert(&vit, tk, vector_get(Q2a, 0), matr_get_row(R, tk));
    rotate = 0;
  } else {
    vectorit_insert(&vit, tk, gamma, matr_get_row(R, tk));
  }
  //fill the new last column of R above the diagonal: Q_{1:tk}*a
  for (i = 0; i < tk; i++) {
    //this sets the last element of every row
    //so it's fast
    vectorit_set_at_end(&vit, matr_get_row(R, i));
    vectorit_insert(&vit, tk, dot(matr_get_row(Q, i), arow),
		    matr_get_row(R, i));
  }

  if (rotate) {
    //now we have to update Q with the Givens rotations that zero out
    //Q2a from the bottom up (see the header comment for the math)
    vectorit_set_at_end(&vit, Q2a);
    //although we never need it, keep Q2a updated or things will get confusing!
    //note that Q2a above it's bottom non-zero element is certainly non-sparse
    //so we can work with that
    col = vectorit_curr_col(vit, Q2a);
    r = vectorit_curr_val(vit, Q2a);
    r2 = vectorit_curr_val(vit, Q2a)*vectorit_curr_val(vit, Q2a);
    while (col > 0 && col < Q2a->dim) {
      //when we hit zero we're done
      i = col + tk;
      //peek at the element just above the current one (implicit zero if
      //the sparse representation skips it)
      vectorit_prev(&vit, Q2a);
      if (vectorit_curr_col(vit, Q2a) == col - 1) {
	uval = vectorit_curr_val(vit, Q2a);
      } else {
	uval = 0;
      }
      vectorit_next(&vit, Q2a);
      c_i = uval;
      s_i = r;
      r2 += uval*uval;
      //fold the rotated magnitude into the element above and zero the
      //current one, mirroring what the rotation does to Q2a
      vectorit_insert(&vit, col-1, r2, Q2a); //r2 is nonzero if we care
      vectorit_next(&vit, Q2a);
      vectorit_zero_elt(&vit, Q2a);
      vectorit_prev(&vit, Q2a);
      if (vectorit_curr_col(vit, Q2a) == col) {
	//non-sparse representation
	vectorit_prev(&vit, Q2a);
      }
      col = vectorit_curr_col(vit, Q2a); //should also be col--
      r = sqrt(r2);
      if (r > SVM_EPSILON) {
	//we still need this in case of full matrices
	c_i /= r;
	s_i /= r;
	//this affects two rows of Q
	//the i^th row and the one above it
	//this is done this way so it works for both
	//sparse and full matrices
	//(trow holds the new row i-1; row i is updated in place using the
	//still-old row i-1, then row i-1 is overwritten last)
	vector_multiply(matr_get_row(Q, i-1), c_i, trow);
	vector_add_multiple(trow, matr_get_row(Q, i), s_i, trow);
	vector_multiply(matr_get_row(Q, i), c_i, matr_get_row(Q, i));
	vector_add_multiple(matr_get_row(Q, i), matr_get_row(Q, i-1),
			    -1.0*s_i, matr_get_row(Q, i));
	matr_set_row(Q, i-1, trow);
      } else {
	r2 = 0; //avoid floating pt errors
      }
    }
  }

  //recalculate Z
  //now has one less row
  //easier to just free
  //and reallocate
  //same amount of time
  matr_free(*Z_ptr);
  *Z_ptr = matr_make_size(n, n - tk - 1, ztype, MATR_PRECISE, zsize);
  Z = *Z_ptr;
  //columns of Z are the last n-tk-1 (since tk is still the old number of
  //constraints) rows of the new Q in reverse order
  //this is fast for sparse matrices because we fill columns of j
  //in ascending order
  for (j = 0; j < n-tk-1; j++) {
    matr_set_col(Z, j, matr_get_row(Q, n - j - 1));
  }

  if (QP_DEBUG_MODE >= QP_CONSTRAINTS) {
    fprintf(stderr, "After adding constraint %d\n", toadd);
    fprintf(stderr, "Q = \n");
    matr_print(Q);
    fprintf(stderr, "R = \n");
    matr_print(R);
    fprintf(stderr, "Z = \n");
    matr_print(Z);
  }
  vector_free(Q2a);
  vector_free(trow);
  vector_free(brow);
}

/**************************************************************************
 *We need to remove row todel of A' (note that todel is the number of the row
 *in A' NOT A!!) and update Q, R, and Z accordingly.
 *Specifically we are removing a column from A'^T making it n x k-1.  To make
 *this clear let A^k = A'^T before we remove a column and A^{k-1} = A'^T after
 *we remove a column.  We will use the same convention for Q, R, and Z.  Then
 *Q^k*A^{k-1} = |R^k_0 R^k_1 ... R^k_{todel-1} R^k_{todel+1} ... R^k_k|
 *              |  0    0  ....      0             0         ...   0  |
 *Therefore R is almost still upper triangular (in fact if todel = k it will
 *remain upper triangular) but all columns with numbers greater than todel
 *have an extra element below the diagonal.  Therefore, we need to rotate
 *R to remove this element.  We use Givens rotations.  These are
 *used in the Gill and Murray paper, but I believe the original reference
 *is
 *Givens, Wallace.
 *"Computation of plane unitary rotations transforming a general matrix
 *to triangular form".
 *J. SIAM 6(1) (1958), pp. 26-50.
 *If you are just trying to understand them, though, I would recommend the
 *Wikipedia article.
 *
 *When we remove a column c from A', the matrix K = Q^kA^{k-1}
 *has n-c subdiagonal non-zero elements at K_{j+1, j} for j >= c
 *We need to rotate K to get rid of these elements
 *The rotated K is R and we multiply the rotation matrices into Q on the left.
*The givens rotation for the j+1, j rotation is * | I_{j, j} 0 0 0_{n-j-2} | * | 0 c_j s_j 0_{n-j-2} | * | 0 -s_j c_j 0_{n-j-2} | * | 0 0 0 I_{n-j-2, n-j-2} | *where *c_j = R_{j-1, j}/r_j *s_j = R_{j,j}/r_j *r_j = sqrt(R_jj^2 + R_{j+1,j}^2) *Note that it is important to start with the c+1st rotation on the *right since that rotation changes the elements of R in the columns to the *right of c+1. *Each multiplication affects only 2 rows of Q so they take (n-c)(n-k) time. * *INPUT: todel: the number of the row of A' (!!!) corresponding to the * constriant we are deleting. NOTE THAT todel IS NOT NECESSARILY THE ROW * NUMBER OF THE CONSTRAINT IN A. Instead, it corresponds to the column * we need to remove from A and R. * A: the full constraint matrix in A*x >= b * *OUTPUT: Updated QR factorization of A'^T (pass by reference) and * the projection matrix Z. We do not delete and reallocate Z here * so passing just a single pointer would have been fine, but it is done * this way for symmetry with add_constraint. * *TYPES: It should be easy to remove a row and column from R and therefore * R should be some sort of sparse matrix. It is upper triangular so it * is already guaranteed to be ~50% zeros. * Q never changes size so it can be sparse or non-sparse. * Z will have a column added to it so it is better if it is sparse. 
**************************************************************************/
void delete_constraint(int todel, Matrix *A, Matrix *Q, Matrix *R,
		       Matrix **Z_ptr) {
  int j;
  Matrix *Z = *Z_ptr;
  //trow/trrow are scratch rows for the Givens updates of Q and R.
  //NOTE(review): A is never referenced in this body (kept for interface
  //symmetry with add_constraint), and brow/brrow are allocated and freed
  //but never otherwise used.
  Vector *trow = vector_make_size(Q->cols, Q->type, MATR_PRECISE, Q->size),
    *brow = vector_make_size(Q->cols, Q->type, MATR_PRECISE, Q->size),
    *trrow = vector_make_size(R->cols+1, R->type, MATR_PRECISE, R->size),
    *brrow = vector_make_size(R->cols+1, R->type, MATR_PRECISE, R->size);
  double r, c_j, s_j, tmp1, tmp2;

  if (QP_DEBUG_MODE >= QP_CONSTRAINTS) {
    fprintf(stderr, "Before deleting row %d\n", todel);
    fprintf(stderr, "Q = \n");
    matr_print(Q);
    fprintf(stderr, "R = \n");
    matr_print(R);
    fprintf(stderr, "Z = \n");
    matr_print(*(Z_ptr));
  }

  //update currA by removing the constraint
  //matr_remove_col(currA_ptr, todel);

  //Since we have to search each row for this column, this operation
  //is slow no matter what R is.  The removal is faster for a SPARSE_LIST
  //but the search is slower and vice versa for a SPARSE ARRAY and
  //a NON_SPARSE structure.
  matr_remove_col(R, todel);

  //zero the subdiagonal elements R_{j+1,j} (j >= todel) with Givens
  //rotations, applying each rotation to Q as well
  for (j = todel; j < R->cols; j++) {
    //both of these values can be picked from a sparse R quickly
    tmp1 = matr_get(R, j+1, j); //this is the first nonzero elt of row j+1
    tmp2 = matr_get(R, j, j);   //this is the first nonzero elt of row j
    r = sqrt(tmp1*tmp1 + tmp2*tmp2);
    if (r < SVM_EPSILON) {
      //nothing to rotate away for this column
      continue;
    }
    c_j = tmp2/r;
    s_j = tmp1/r;
    //this affects the j and j+1 rows of Q
    //(trow holds the new row j; row j+1 is updated in place using the
    //still-old row j, then row j is overwritten last)
    vector_multiply(matr_get_row(Q, j), c_j, trow);
    vector_add_multiple(trow, matr_get_row(Q, j+1), s_j, trow);
    vector_multiply(matr_get_row(Q, j+1), c_j, matr_get_row(Q, j+1));
    vector_add_multiple(matr_get_row(Q,j+1), matr_get_row(Q, j), -1.0*s_j,
			matr_get_row(Q, j+1));
    matr_set_row(Q, j, trow);
    //and the j and j+1 rows of R
    vector_multiply(matr_get_row(R, j), c_j, trrow);
    vector_add_multiple(trrow, matr_get_row(R, j+1), s_j, trrow);
    vector_multiply(matr_get_row(R, j+1), c_j, matr_get_row(R, j+1));
    vector_add_multiple(matr_get_row(R, j+1), matr_get_row(R, j), -1.0*s_j,
			matr_get_row(R, j+1));
    matr_set_row(R, j, trrow);
  }

  //remove the bottom row of all zeros
  //this will be very fast
  matr_remove_row(R, R->rows-1);

  //Z remains almost unchanged
  //note that in changing Q, we left the bottom n-k rows of Q alone.
  //however, we need to add in another column to Z since we've
  //lost a constraint
  //this column is the R->cols (k+1) row of Q
  matr_add_col(Z);
  matr_set_col(Z, Z->cols-1, matr_get_row(Q, R->cols)); //setting the last col
                                                        //is fast

  if (QP_DEBUG_MODE >= QP_CONSTRAINTS) {
    fprintf(stderr, "Q = \n");
    matr_print(Q);
    fprintf(stderr, "After R = \n");
    matr_print(R);
    fprintf(stderr, "Z = \n");
    matr_print(Z);
  }

  vector_free(trow);
  vector_free(brow);
  vector_free(trrow);
  vector_free(brrow);
}

/***********************************************************************
 *Compute the Lagrange multipliers for the active constraints.
*These are Lagrange multipliers so we must have that *f'(x) - A'^T*lambda = 0 *Giving us that *A'^T*lambda = Gx + c = g *Since A'^T = Q^T*|R| * |0| *we have that R*lambda = Q_{1:k}*g *where Q_{1:k} are the first k rows (ie R->rows) of Q. *Since R is upper triangular we can solve this problem quickly using *back substition (see the function back_sub). * *INPUT: Q,R: QR factorization of A'^T * g: Gradient G*x + c * *OUTPUT: The index of the first negative Lagrange multiplier or -1 * if all Lagrange mulipliers are positive. * *TYPES: Anything should work. ***********************************************************************/ int compute_lambda(Matrix *R, Matrix *Q, Vector *g) { int col = -1; Vector *c = vector_make_size(R->cols, Q->type, MATR_PRECISE, R->size), *lambda = vector_make_size(R->cols, R->type, MATR_PRECISE, R->size); VectorIterator vit; double minval = -1.0*QP_LINEAR_ACCURACY; matr_vector(Q, g, c); //sparse dot non-sparse = fast back_sub(R, c, lambda); //R is upper triangular so this is very fast vector_free(c); if (QP_DEBUG_MODE >= QP_DEBUG_LOOP) { fprintf(stderr, "lambda = "); vector_print(lambda); } //find any negative Lagrange multipliers vectorit_set_at_beg(&vit, lambda); while (!vectorit_past_end(vit, lambda)) { if (vectorit_curr_val(vit, lambda) < minval) { col = vectorit_curr_col(vit, lambda); minval = vectorit_curr_val(vit, lambda); } vectorit_next(&vit, lambda); } vector_free(lambda); return col; } /************************************************************************* *Uses back substitution to solve * U*x = b *where U is kxk upper triangular. *I don't know who originated this method, but it's in textbooks from the *1950s and I suspect it is much older than that. If you just want a feel *for the method, I would recommend the Wikipedia page. 
* *The basic idea is that we can solve for the bottom element easily: *U_{kk}*x_k = b_k => x_k = b_k/U_kk *Then x_{k-1} can be solved for: *U_{k-1, k-1}x_{k-1} + U_{k-1, k}*x_k = b_{k-1} *=> x_{k-1} = (b_{k-1} - U_{k-1, k}*x_k)/U_{k-1, k-1} *In general, if we start with x = 0 and update as we go *x_i = b_i/U_{ii} - U_i*x *where U_i is the ith row of U. * *INPUT: U an invertible (ie no zero entries on the diagonal), upper * triangular matrix, b a vector * *OUTPUT: ret (pass by reference) such that * U*ret = b * *TYPES: There are dot products so if the matrices might be sparse, * it's a good idea to represent them as sparse. **************************************************************************/ void back_sub(Matrix *U, Vector *b, Vector *ret) { int i; VectorIterator bit, rit; double bval; vector_zero(ret); vectorit_set_at_end(&bit, b); vectorit_set_at_end(&rit, ret); if (QP_DEBUG_MODE >= QP_CONSTRAINTS) { fprintf(stderr, "U = \n"); matr_print(U); fprintf(stderr, "b = "); vector_print(b); } for (i = U->rows-1; i >= 0; i--) { if (vectorit_curr_col(bit, b) == i) { bval = vectorit_curr_val(bit, b); vectorit_prev(&bit, b); } else { bval = 0; } //note that U_ii is the first non-zero entry of U_i //so retrieving it is fast if U is sparse vectorit_insert(&rit, i, (bval - dot(matr_get_row(U, i), ret))/matr_get(U, i, i), ret); vectorit_prev(&rit,ret); } } /*********************************************************************** *Finds the direction we should follow. *In an unconstrained problem this would simply be p such that * G(x + p) + c = 0 => Gp + g = 0 *However, because of the constraints, we only want to move in certain *directions - namely those orthogonal to the active constraints. *Therefore, we use the projection matrix Z (recall that the columns of Z *are orthogonal to the currect active constraints) to only solve the above *equation in the directions orthogonal to the current constraints. 
*Namely we solve for p' in * Z^T*G*Z*(x+p') + Z^T*c = 0 => Z^T*G*Z*p' + Z^T*g = 0 *and then project it back into the full space using *p = Z*p' *For how we solve the linear equation, see the conjugate_gradient *funtion. * *INPUT: Z: the projection matrix with columns orthogonal to current * active constraints. * G: The Hessian s.t. we want G*(x+p) + c = 0 * g: The gradient g = G*x + c * *OUTPUT: Returns the norm of the projected gradient Z^T*g * Returns the direction in p as pass by reference. * *TYPES: Use the types best fitted to each. We will need to transpose Z * and multiply Z, Z^T, and G by a vector multiple times. ***********************************************************************/ double find_direction(Matrix *Z, Matrix *G, Vector *g, Vector *p) { Matrix *Zt, *list[3]; Vector *pa, *ga; double length; if (QP_DEBUG_MODE >= QP_DEBUG_LOOP) { fprintf(stderr, "g = "); vector_print(g); } if (Z->cols < G->cols) { //Z is actually a projection matrix //Go through this whole rigamarole Zt = matr_make_size(Z->cols, Z->rows, Z->type, MATR_PRECISE, Z->size); matr_transpose(Z, Zt); list[0] = Z; list[1] = G; list[2] = Zt; ga = vector_make_size(Z->cols, Z->type, MATR_PRECISE, Z->size); matr_vector(Zt, g, ga); pa = vector_make_size(Z->cols, Z->type, MATR_PRECISE, Z->size); conjugate_gradient(list, 3, G->rows, ga, pa); matr_vector(Z, pa, p); matr_free(Zt); vector_free(pa); length = norm2(ga); vector_free(ga); return length; } //Z is the identity //solve a simpler problem conjugate_gradient(&G, 1, G->rows, g, p); return norm2(g); } /***************************************************************************** *This solves A_{n-1}A_{n-2}...A_0x + b = 0 *for symmetric, positive definite A = A_{n-1}A_{n-2}...A_0. *If A is symmetric, positive semi-definite and there is no solution *to the equation, the function detects that and returns the answer *from the iteration before it hit infinity. 
This is the behavior *we want since the application of the constraints elsewhere will keep *the answer to the QP from actually going to infinity. * *The conjugate gradient method was originally proposed in *Hestenes, Magnus R.; Stiefel, Eduard (December 1952). *"Methods of Conjugate Gradients for Solving Linear Systems". *Journal of Research of the National Bureau of Standards 49 (6). *http://nvl.nist.gov/pub/nistpubs/jres/049/6/V49.N06.A08.pdf. * *However, if you're just trying to understand this algorithm, I would *recommend either the wikipedia page or *An Introduction to the Conjugate Gradient Method Without the Agonizing Pain *Jonathan Richard Shewchuk *CMU, 1994 *http://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf * *The main idea of this algorithm is that you always move in "conjugate" *directions. Two directions d_i and d_j are conjugate if * d_i*A*d_j = 0 *This lets us converge to the correct solution in many fewer steps *than a straight gradient descent approach. In addition, we never move *in more directions than there are - ie, the maximum number of iterations *of this algorithm is n where n is the length of x. * *One way of doing this would be to start with a set of basis vectors (ie *the unit vectors) and use a version of Gram-Schmidt to make them all *conjugate and then use those vectors to solve. This requires generating *all n conjugate vectors, though. So, instead, we use an iterative method *and hope that we don't need all n directions to get a good enough answer. * *The derivation for this algorithm is kind of advanced for a comment. *Therefore, I'll just give the update rules. Use the papers above if *you want the derivation. 
* *Variables: x = solution (vector) * r = remainder = 0 - (A*x + b) (vector) * p = direction (vector) * a = stepsize (scalar) * b = constant used in calculations (Gram-Schmidt coefficient really) *Init: x_0 = 0, r_0 = -b, p_0 = r_0 = -b * a_i = r_i*r_i/(p_i*A*p_i) * x_{i+1} = x_i + a_i*p_i * r_{i+1} = r_i - a_i*A*p_i * b_{i+1} = r_{i+1}*r_{i+1}/r_i*r_i * p_{i+1} = r_{i+1} + b_{i+1}*p_i * *INPUT: A: Series of matrices such that A = A_{n-1}A_{n-2}...A_0 * (use this to avoid any actual matrix multiplication) * nmatrices: The number of matrices in A * maxrows: The maximum number of rows of any matrix in A * b: constant term * *OUTPUT: x, pass by reference, the solution to * A_{n-1}*A_{n-2}...*A_0*x + b = 0 * *TYPES: All of the types in the function are based off the type of x. *****************************************************************************/ void conjugate_gradient(Matrix **A, int nmatrices, int maxrows, Vector *b, Vector *x) { int dim = A[0]->cols; //should equal A[nmatrices-1]->rows! 
Vector *r = vector_make_size(dim, x->type, MATR_PRECISE, x->size), *p = vector_make_size(dim, x->type, MATR_PRECISE, x->size), *z = vector_make_size(dim, x->type, MATR_PRECISE, x->size), *ap = vector_make_size(dim, x->type, MATR_PRECISE, x->size), *last_x = vector_make_size(dim, x->type, MATR_PRECISE, x->size); double a = 0, beta = 0, lr, last_lr; int i; if (QP_DEBUG_MODE >= QP_LINEAR_SOLVER) { fprintf(stderr, "Arguments to conjugate gradient are:\n"); for (i = 0; i < nmatrices; i++) { fprintf(stderr, "A[%d] = \n", i); matr_print(A[i]); } fprintf(stderr, "b = "); vector_print(b); } vector_zero(x); //x_0 = 0 vector_multiply(b, -1.0, r); //r_0 = -b vector_copy(r, p); //p_0 = -b lr = norm2(r); last_lr = lr+1; i = 0; //this is the conjugate gradient method //it should never run more than x->dim times //note that this is no longer running to some accuracy //-it pretty much needs to either converge or blow up //or things get crazy while(lr > 0 && (norm2(x) < 1.0/SVM_EPSILON) && i < x->dim) { matr_vector_seq(A, nmatrices, maxrows, p, z); //Ap_i (used a lot) a = dot(r, r)/dot(p, z); //a_i if (QP_DEBUG_MODE >= QP_LINEAR_SOLVER) { fprintf(stderr, "Iteration %d: a = %f, beta = %f norm2(r) = %.11lf\n", i, a, beta, lr); fprintf(stderr, "x = "); vector_print(x); fprintf(stderr, "r = "); vector_print(r); fprintf(stderr, "p = "); vector_print(p); fprintf(stderr, "A*p = "); vector_print(z); } vector_copy(x, last_x); vector_add_multiple(x, p, a, x); vector_add_multiple(r, z, -1.0*a, r); last_lr = lr; lr = norm2(r); beta = lr/last_lr; //beta_{i+1} vector_multiply(p, beta, p); //beta_{i+1}p_i vector_add(p, r, p); //p_{i+1} i++; } if (QP_DEBUG_MODE >= QP_LINEAR_SOLVER) { fprintf(stderr, "Iteration %d: a = %f, beta = %f norm2(r) = %.11lf\nx = ", i, a, beta, lr); vector_print(x); } if (norm2(x) >= 1.0/SVM_EPSILON) { //A was positive semi-definite //return with the correct things going to infinity //and it will all work out :) if (QP_DEBUG_MODE >= QP_DEBUG) { fprintf(stderr, "Singular 
matrix detected.\n"); } if (i > 1) { vector_copy(last_x, x); } } if (QP_DEBUG_MODE >= QP_LINEAR_SOLVER) { fprintf(stderr, "Solution is x = "); vector_print(x); } vector_free(r); vector_free(p); vector_free(z); vector_free(ap); vector_free(last_x); } //solves equations of the form // Ax + b = 0 //for the LEAST-SQUARES solution //i am no longer using this //but i've left it in //since it is a useful function void run_linear(Matrix *A, Vector *b, Vector *x) { Matrix *T = matr_make(A->cols, A->rows, A->type, MATR_PRECISE), *list[2]; Vector *bt = vector_make(A->cols, A->type, MATR_PRECISE); int maxrows; matr_transpose(A, T); list[0] = A; list[1] = T; if (T->rows > A->rows) { maxrows = T->rows; } else { maxrows = A->rows; } matr_vector(T, b,bt); conjugate_gradient(list, 2, maxrows, bt, x); matr_free(T); vector_free(bt); } //solves Ax + b = 0 //slowly //use conjugate gradient instead void gradient_descent(Matrix **A, int nmatrices, int maxrows, Vector *b, Vector *x) { Vector *r = vector_make(b->dim, (A[0]->type), MATR_PRECISE); vector_zero(x); vector_copy(b, r); if (QP_DEBUG_MODE >= QP_LINEAR_SOLVER) { fprintf(stderr, "A = "); matr_print(A[0]); fprintf(stderr, "b = "); vector_print(b); } while (norm2(r) > SVM_EPSILON) { if (QP_DEBUG_MODE >= QP_LINEAR_SOLVER) { fprintf(stderr, "norm2(r) = %f, r = ", norm2(r)); vector_print(r); fprintf(stderr, "x = "); vector_print(x); } vector_multiply(r, -1.0, r); vector_add(x, r, x); matr_vector_seq(A, nmatrices, maxrows, x, r); vector_add(r, b, r); } } //#define SVM_QP_MAIN #ifdef SVM_QP_MAIN //sample main //to test these functions //and some of the matrix functions int main (int argc, char **argv) { int i, j, xrows, xcols, issparse = 1; Vector *f, *qb, *qx, *v = vector_make(2, issparse), *w = vector_make(2, issparse), *u = vector_make(3, issparse); Matrix *X, *Xt, *G, *A, *M1 = matr_make(3, 2, issparse), *M2 = matr_make(2, 3, issparse), *list[2], *M2M1 = matr_make(2,2, issparse); FILE *in; int *t = NULL; double tmp; if (!t) { 
fprintf(stderr, "testing null %p\n", &t); } //testing matr_vector_seq matr_set(M1, 0, 0, 1.2); matr_set(M1, 0, 1, 3); matr_set(M1, 1, 0, 1.4); matr_set(M1, 1, 1, 4); matr_set(M1, 2, 0, 6); matr_set(M1, 2, 1, -2); matr_set(M2, 0, 0, -2); matr_set(M2, 0, 1, 1); matr_set(M2, 0, 2, 13); matr_set(M2, 1, 0, -5); matr_set(M2, 1, 1, 1.7); matr_set(M2, 1, 2, -2.5); vector_set(v, 0, 3); vector_set(v, 1, -2); list[0] = M1; list[1] = M2; fprintf(stderr, "M1 = \n"); matr_print(M1); fprintf(stderr, "M2 = \n"); matr_print(M2); fprintf(stderr, "v = "); vector_print(v); matr_vector(M1, v, u); fprintf(stderr, "u = "); vector_print(u); matr_vector_seq(list, 2, 3, v, w); fprintf(stderr, "M2*M1*v = "); vector_print(w); matr_multiply(M2, M1, M2M1); matr_vector(M2M1, v, w); fprintf(stderr, "(M2*M1)*v = "); vector_print(w); fprintf(stderr, "v = "); vector_print(v); fprintf(stderr, "M1 = \n"); matr_print(M1); fprintf(stderr, "M2 = \n"); matr_print(M2); matr_free(M1); matr_free(M2); matr_free(M2M1); vector_free(v); vector_free(w); vector_free(u); //testing run_qp if (argc < 2) { return 0; } in = fopen(argv[1], "r"); if (!in) { fprintf(stderr, "Invalid file name"); exit(1); } fscanf(in, "%d %d", &xrows, &xcols); X = matr_make(xrows, xcols, issparse); for (i = 0; i < xrows; i++) { for (j = 0; j < xcols; j++) { fscanf(in, "%lf", &tmp); matr_set(X, i, j, tmp); } } //last row is f f = vector_make(xrows, issparse); for (i = 0; i < xrows; i++) { fscanf(in, "%lf", &tmp); vector_set(f, i, -1.0*tmp); } fclose(in); //file should list feature vectors as rows Xt = matr_make(xcols, xrows, issparse); matr_transpose(X, Xt); G = matr_make(xrows, xrows, issparse); matr_multiply(X, Xt, G); //make constraint matrix A = matr_make(xrows+1, xrows, issparse); for (i = 0; i < xrows; i++) { matr_set(A, 0, i, -1.0); } for (i = 1; i <= xrows; i++) { matr_set(A, i, i-1, 1); } qb = vector_make(xrows+1, issparse); vector_set(qb, 0, -MAX_X_VAL); qx = vector_make(xrows, issparse); fprintf(stderr, "Running qp.\n"); 
run_qp(G, A, f, qb, qx); fprintf(stderr, "QP solution is "); vector_print(qx); matr_free(X); matr_free(Xt); matr_free(G); matr_free(A); vector_free(f); vector_free(qb); vector_free(qx); return 0; } #endif crm114-20100106-BlameMichelson.src/escapetest.crm0000755000000000017500000000073411321154266017647 0ustar rootwsy#! /usr/bin/crm # # escapetest.crm - test backslash # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. { window output / :*:_nl: CRM114 testing backslash :*:_nl: :*:_nl:/ alter (:_dw:) /?? abcd ??/ match (:a:) /ab\cd/ output /\/-;-{-:*:a:-}-;-\/ =\/==\= :*:_nl:/ output / This text started out \ on three lines \ but it should appear on just one.:*:_nl:/ output / testing /; # comment \#; output /part-line comments\n/ } crm114-20100106-BlameMichelson.src/crm_stats.c0000644000000000017500000005777311321154266017163 0ustar rootwsy// crm_stats.c - statistical processing functions // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
// include some standard files #include "crm114_sysincludes.h" const double norm_cdf_lookup[] = { 9.865876e-10, 1.086112e-09, 1.195391e-09, 1.315351e-09, 1.447005e-09, 1.591458e-09, 1.749914e-09, 1.923689e-09, 2.114217e-09, 2.323062e-09, 2.551930e-09, 2.802679e-09, 3.077334e-09, 3.378100e-09, 3.707380e-09, 4.067789e-09, 4.462172e-09, 4.893629e-09, 5.365527e-09, 5.881533e-09, 6.445630e-09, 7.062151e-09, 7.735803e-09, 8.471701e-09, 9.275399e-09, 1.015293e-08, 1.111084e-08, 1.215625e-08, 1.329685e-08, 1.454102e-08, 1.589784e-08, 1.737713e-08, 1.898956e-08, 2.074669e-08, 2.266102e-08, 2.474613e-08, 2.701668e-08, 2.948856e-08, 3.217898e-08, 3.510653e-08, 3.829134e-08, 4.175518e-08, 4.552156e-08, 4.961591e-08, 5.406571e-08, 5.890064e-08, 6.415274e-08, 6.985661e-08, 7.604961e-08, 8.277203e-08, 9.006736e-08, 9.798248e-08, 1.065680e-07, 1.158783e-07, 1.259722e-07, 1.369130e-07, 1.487689e-07, 1.616131e-07, 1.755248e-07, 1.905889e-07, 2.068970e-07, 2.245475e-07, 2.436461e-07, 2.643067e-07, 2.866516e-07, 3.108121e-07, 3.369294e-07, 3.651551e-07, 3.956520e-07, 4.285948e-07, 4.641709e-07, 5.025815e-07, 5.440423e-07, 5.887845e-07, 6.370561e-07, 6.891229e-07, 7.452694e-07, 8.058005e-07, 8.710428e-07, 9.413457e-07, 1.017083e-06, 1.098656e-06, 1.186491e-06, 1.281047e-06, 1.382814e-06, 1.492313e-06, 1.610104e-06, 1.736785e-06, 1.872992e-06, 2.019406e-06, 2.176754e-06, 2.345812e-06, 2.527405e-06, 2.722416e-06, 2.931785e-06, 3.156515e-06, 3.397673e-06, 3.656398e-06, 3.933901e-06, 4.231473e-06, 4.550486e-06, 4.892403e-06, 5.258778e-06, 5.651266e-06, 6.071624e-06, 6.521722e-06, 7.003545e-06, 7.519206e-06, 8.070944e-06, 8.661140e-06, 9.292321e-06, 9.967168e-06, 1.068853e-05, 1.145941e-05, 1.228302e-05, 1.316276e-05, 1.410220e-05, 1.510517e-05, 1.617569e-05, 1.731804e-05, 1.853674e-05, 1.983657e-05, 2.122260e-05, 2.270018e-05, 2.427497e-05, 2.595297e-05, 2.774050e-05, 2.964423e-05, 3.167124e-05, 3.382898e-05, 3.612532e-05, 3.856856e-05, 4.116747e-05, 4.393129e-05, 4.686977e-05, 
4.999318e-05, 5.331235e-05, 5.683869e-05, 6.058422e-05, 6.456159e-05, 6.878411e-05, 7.326582e-05, 7.802144e-05, 8.306649e-05, 8.841729e-05, 9.409096e-05, 1.001055e-04, 1.064799e-04, 1.132340e-04, 1.203887e-04, 1.279659e-04, 1.359885e-04, 1.444807e-04, 1.534678e-04, 1.629763e-04, 1.730340e-04, 1.836700e-04, 1.949148e-04, 2.068003e-04, 2.193601e-04, 2.326291e-04, 2.466439e-04, 2.614429e-04, 2.770661e-04, 2.935554e-04, 3.109545e-04, 3.293092e-04, 3.486672e-04, 3.690785e-04, 3.905949e-04, 4.132709e-04, 4.371631e-04, 4.623306e-04, 4.888350e-04, 5.167405e-04, 5.461139e-04, 5.770250e-04, 6.095464e-04, 6.437534e-04, 6.797248e-04, 7.175423e-04, 7.572909e-04, 7.990591e-04, 8.429387e-04, 8.890253e-04, 9.374180e-04, 9.882198e-04, 1.041538e-03, 1.097482e-03, 1.156169e-03, 1.217718e-03, 1.282251e-03, 1.349898e-03, 1.420791e-03, 1.495069e-03, 1.572873e-03, 1.654351e-03, 1.739656e-03, 1.828945e-03, 1.922383e-03, 2.020137e-03, 2.122383e-03, 2.229301e-03, 2.341076e-03, 2.457901e-03, 2.579975e-03, 2.707501e-03, 2.840691e-03, 2.979763e-03, 3.124941e-03, 3.276456e-03, 3.434545e-03, 3.599455e-03, 3.771437e-03, 3.950751e-03, 4.137663e-03, 4.332448e-03, 4.535389e-03, 4.746775e-03, 4.966903e-03, 5.196079e-03, 5.434618e-03, 5.682840e-03, 5.941077e-03, 6.209665e-03, 6.488953e-03, 6.779295e-03, 7.081056e-03, 7.394607e-03, 7.720330e-03, 8.058616e-03, 8.409861e-03, 8.774475e-03, 9.152873e-03, 9.545482e-03, 9.952734e-03, 1.037507e-02, 1.081295e-02, 1.126683e-02, 1.173718e-02, 1.222447e-02, 1.272920e-02, 1.325187e-02, 1.379297e-02, 1.435302e-02, 1.493255e-02, 1.553207e-02, 1.615215e-02, 1.679331e-02, 1.745611e-02, 1.814113e-02, 1.884892e-02, 1.958008e-02, 2.033518e-02, 2.111482e-02, 2.191960e-02, 2.275013e-02, 2.360703e-02, 2.449090e-02, 2.540239e-02, 2.634213e-02, 2.731074e-02, 2.830889e-02, 2.933721e-02, 3.039636e-02, 3.148700e-02, 3.260979e-02, 3.376540e-02, 3.495449e-02, 3.617773e-02, 3.743581e-02, 3.872939e-02, 4.005916e-02, 4.142578e-02, 4.282995e-02, 4.427234e-02, 4.575362e-02, 
4.727449e-02, 4.883560e-02, 5.043764e-02, 5.208128e-02, 5.376718e-02, 5.549602e-02, 5.726845e-02, 5.908512e-02, 6.094670e-02, 6.285381e-02, 6.480710e-02, 6.680720e-02, 6.885473e-02, 7.095031e-02, 7.309453e-02, 7.528799e-02, 7.753127e-02, 7.982495e-02, 8.216959e-02, 8.456572e-02, 8.701389e-02, 8.951462e-02, 9.206841e-02, 9.467574e-02, 9.733710e-02, 1.000529e-01, 1.028237e-01, 1.056498e-01, 1.085316e-01, 1.114695e-01, 1.144640e-01, 1.175152e-01, 1.206236e-01, 1.237895e-01, 1.270130e-01, 1.302945e-01, 1.336342e-01, 1.370323e-01, 1.404890e-01, 1.440044e-01, 1.475786e-01, 1.512118e-01, 1.549040e-01, 1.586553e-01, 1.624656e-01, 1.663350e-01, 1.702634e-01, 1.742507e-01, 1.782969e-01, 1.824018e-01, 1.865652e-01, 1.907870e-01, 1.950668e-01, 1.994046e-01, 2.037999e-01, 2.082524e-01, 2.127618e-01, 2.173277e-01, 2.219497e-01, 2.266274e-01, 2.313601e-01, 2.361475e-01, 2.409889e-01, 2.458839e-01, 2.508316e-01, 2.558316e-01, 2.608832e-01, 2.659855e-01, 2.711380e-01, 2.763397e-01, 2.815899e-01, 2.868877e-01, 2.922323e-01, 2.976228e-01, 3.030582e-01, 3.085375e-01, 3.140599e-01, 3.196242e-01, 3.252294e-01, 3.308744e-01, 3.365581e-01, 3.422795e-01, 3.480372e-01, 3.538302e-01, 3.596573e-01, 3.655172e-01, 3.714086e-01, 3.773303e-01, 3.832810e-01, 3.892593e-01, 3.952640e-01, 4.012937e-01, 4.073469e-01, 4.134224e-01, 4.195187e-01, 4.256343e-01, 4.317679e-01, 4.379180e-01, 4.440831e-01, 4.502618e-01, 4.564525e-01, 4.626539e-01, 4.688643e-01, 4.750823e-01, 4.813064e-01, 4.875351e-01, 4.937668e-01, 5.000000e-01, 5.062332e-01, 5.124649e-01, 5.186936e-01, 5.249177e-01, 5.311357e-01, 5.373461e-01, 5.435475e-01, 5.497382e-01, 5.559169e-01, 5.620820e-01, 5.682321e-01, 5.743657e-01, 5.804813e-01, 5.865776e-01, 5.926531e-01, 5.987063e-01, 6.047360e-01, 6.107407e-01, 6.167190e-01, 6.226697e-01, 6.285914e-01, 6.344828e-01, 6.403427e-01, 6.461698e-01, 6.519628e-01, 6.577205e-01, 6.634419e-01, 6.691256e-01, 6.747706e-01, 6.803758e-01, 6.859401e-01, 6.914625e-01, 6.969418e-01, 7.023772e-01, 
7.077677e-01, 7.131123e-01, 7.184101e-01, 7.236603e-01, 7.288620e-01, 7.340145e-01, 7.391168e-01, 7.441684e-01, 7.491684e-01, 7.541161e-01, 7.590111e-01, 7.638525e-01, 7.686399e-01, 7.733726e-01, 7.780503e-01, 7.826723e-01, 7.872382e-01, 7.917476e-01, 7.962001e-01, 8.005954e-01, 8.049332e-01, 8.092130e-01, 8.134348e-01, 8.175982e-01, 8.217031e-01, 8.257493e-01, 8.297366e-01, 8.336650e-01, 8.375344e-01, 8.413447e-01, 8.450960e-01, 8.487882e-01, 8.524214e-01, 8.559956e-01, 8.595110e-01, 8.629677e-01, 8.663658e-01, 8.697055e-01, 8.729870e-01, 8.762105e-01, 8.793764e-01, 8.824848e-01, 8.855360e-01, 8.885305e-01, 8.914684e-01, 8.943502e-01, 8.971763e-01, 8.999471e-01, 9.026629e-01, 9.053243e-01, 9.079316e-01, 9.104854e-01, 9.129861e-01, 9.154343e-01, 9.178304e-01, 9.201750e-01, 9.224687e-01, 9.247120e-01, 9.269055e-01, 9.290497e-01, 9.311453e-01, 9.331928e-01, 9.351929e-01, 9.371462e-01, 9.390533e-01, 9.409149e-01, 9.427316e-01, 9.445040e-01, 9.462328e-01, 9.479187e-01, 9.495624e-01, 9.511644e-01, 9.527255e-01, 9.542464e-01, 9.557277e-01, 9.571700e-01, 9.585742e-01, 9.599408e-01, 9.612706e-01, 9.625642e-01, 9.638223e-01, 9.650455e-01, 9.662346e-01, 9.673902e-01, 9.685130e-01, 9.696036e-01, 9.706628e-01, 9.716911e-01, 9.726893e-01, 9.736579e-01, 9.745976e-01, 9.755091e-01, 9.763930e-01, 9.772499e-01, 9.780804e-01, 9.788852e-01, 9.796648e-01, 9.804199e-01, 9.811511e-01, 9.818589e-01, 9.825439e-01, 9.832067e-01, 9.838479e-01, 9.844679e-01, 9.850675e-01, 9.856470e-01, 9.862070e-01, 9.867481e-01, 9.872708e-01, 9.877755e-01, 9.882628e-01, 9.887332e-01, 9.891870e-01, 9.896249e-01, 9.900473e-01, 9.904545e-01, 9.908471e-01, 9.912255e-01, 9.915901e-01, 9.919414e-01, 9.922797e-01, 9.926054e-01, 9.929189e-01, 9.932207e-01, 9.935110e-01, 9.937903e-01, 9.940589e-01, 9.943172e-01, 9.945654e-01, 9.948039e-01, 9.950331e-01, 9.952532e-01, 9.954646e-01, 9.956676e-01, 9.958623e-01, 9.960492e-01, 9.962286e-01, 9.964005e-01, 9.965655e-01, 9.967235e-01, 9.968751e-01, 9.970202e-01, 
9.971593e-01, 9.972925e-01, 9.974200e-01, 9.975421e-01, 9.976589e-01, 9.977707e-01, 9.978776e-01, 9.979799e-01, 9.980776e-01, 9.981711e-01, 9.982603e-01, 9.983456e-01, 9.984271e-01, 9.985049e-01, 9.985792e-01, 9.986501e-01, 9.987177e-01, 9.987823e-01, 9.988438e-01, 9.989025e-01, 9.989585e-01, 9.990118e-01, 9.990626e-01, 9.991110e-01, 9.991571e-01, 9.992009e-01, 9.992427e-01, 9.992825e-01, 9.993203e-01, 9.993562e-01, 9.993905e-01, 9.994230e-01, 9.994539e-01, 9.994833e-01, 9.995112e-01, 9.995377e-01, 9.995628e-01, 9.995867e-01, 9.996094e-01, 9.996309e-01, 9.996513e-01, 9.996707e-01, 9.996890e-01, 9.997064e-01, 9.997229e-01, 9.997386e-01, 9.997534e-01, 9.997674e-01, 9.997806e-01, 9.997932e-01, 9.998051e-01, 9.998163e-01, 9.998270e-01, 9.998370e-01, 9.998465e-01, 9.998555e-01, 9.998640e-01, 9.998720e-01, 9.998796e-01, 9.998868e-01, 9.998935e-01, 9.998999e-01, 9.999059e-01, 9.999116e-01, 9.999169e-01, 9.999220e-01, 9.999267e-01, 9.999312e-01, 9.999354e-01, 9.999394e-01, 9.999432e-01, 9.999467e-01, 9.999500e-01, 9.999531e-01, 9.999561e-01, 9.999588e-01, 9.999614e-01, 9.999639e-01, 9.999662e-01, 9.999683e-01, 9.999704e-01, 9.999723e-01, 9.999740e-01, 9.999757e-01, 9.999773e-01, 9.999788e-01, 9.999802e-01, 9.999815e-01, 9.999827e-01, 9.999838e-01, 9.999849e-01, 9.999859e-01, 9.999868e-01, 9.999877e-01, 9.999885e-01, 9.999893e-01, 9.999900e-01, 9.999907e-01, 9.999913e-01, 9.999919e-01, 9.999925e-01, 9.999930e-01, 9.999935e-01, 9.999939e-01, 9.999943e-01, 9.999947e-01, 9.999951e-01, 9.999954e-01, 9.999958e-01, 9.999961e-01, 9.999963e-01, 9.999966e-01, 9.999968e-01, 9.999971e-01, 9.999973e-01, 9.999975e-01, 9.999977e-01, 9.999978e-01, 9.999980e-01, 9.999981e-01, 9.999983e-01, 9.999984e-01, 9.999985e-01, 9.999986e-01, 9.999987e-01, 9.999988e-01, 9.999989e-01, 9.999990e-01, 9.999991e-01, 9.999991e-01, 9.999992e-01, 9.999993e-01, 9.999993e-01, 9.999994e-01, 9.999994e-01, 9.999995e-01, 9.999995e-01, 9.999995e-01, 9.999996e-01, 9.999996e-01, 9.999996e-01, 9.999997e-01, 
9.999997e-01, 9.999997e-01, 9.999997e-01, 9.999998e-01, 9.999998e-01, 9.999998e-01, 9.999998e-01, 9.999998e-01, 9.999998e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 9.999999e-01, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00, 1.000000e-00 }; // there is currently no interpolation double crm_norm_cdf(double x) { long i = 0; if(x < -6.0) return 0.0; if(x >= 6.0) return 1.0; i = (long)((x + 6.0) * 32.0); return norm_cdf_lookup[i]; } // notice we put -7.0 in place of -inf, monotonicity is all that // matters for the algorithm, and huge extrema would only make // visualization harder const double log_lookup_table[] = { -7.000000e+00, -5.950643e+00, -5.257495e+00, -4.852030e+00, -4.564348e+00, -4.341205e+00, -4.158883e+00, -4.004732e+00, -3.871201e+00, -3.753418e+00, -3.648057e+00, -3.552747e+00, -3.465736e+00, -3.385693e+00, -3.311585e+00, -3.242592e+00, -3.178054e+00, -3.117429e+00, -3.060271e+00, -3.006204e+00, -2.954910e+00, -2.906120e+00, -2.859600e+00, -2.815148e+00, -2.772589e+00, -2.731767e+00, -2.692546e+00, -2.654806e+00, -2.618438e+00, -2.583347e+00, -2.549445e+00, -2.516655e+00, -2.484907e+00, -2.454135e+00, -2.424282e+00, -2.395294e+00, -2.367124e+00, -2.339725e+00, -2.313056e+00, -2.287081e+00, -2.261763e+00, -2.237070e+00, -2.212973e+00, -2.189442e+00, -2.166453e+00, -2.143980e+00, -2.122001e+00, 
-2.100495e+00, -2.079442e+00, -2.058822e+00, -2.038620e+00, -2.018817e+00, -1.999399e+00, -1.980351e+00, -1.961659e+00, -1.943309e+00, -1.925291e+00, -1.907591e+00, -1.890200e+00, -1.873105e+00, -1.856298e+00, -1.839769e+00, -1.823508e+00, -1.807508e+00, -1.791759e+00, -1.776255e+00, -1.760988e+00, -1.745950e+00, -1.731135e+00, -1.716536e+00, -1.702147e+00, -1.687963e+00, -1.673976e+00, -1.660183e+00, -1.646577e+00, -1.633154e+00, -1.619909e+00, -1.606837e+00, -1.593934e+00, -1.581195e+00, -1.568616e+00, -1.556193e+00, -1.543923e+00, -1.531802e+00, -1.519826e+00, -1.507991e+00, -1.496295e+00, -1.484734e+00, -1.473306e+00, -1.462006e+00, -1.450833e+00, -1.439783e+00, -1.428854e+00, -1.418043e+00, -1.407348e+00, -1.396766e+00, -1.386294e+00, -1.375932e+00, -1.365675e+00, -1.355523e+00, -1.345472e+00, -1.335522e+00, -1.325670e+00, -1.315914e+00, -1.306252e+00, -1.296682e+00, -1.287203e+00, -1.277814e+00, -1.268511e+00, -1.259295e+00, -1.250162e+00, -1.241112e+00, -1.232144e+00, -1.223255e+00, -1.214444e+00, -1.205710e+00, -1.197052e+00, -1.188469e+00, -1.179958e+00, -1.171519e+00, -1.163151e+00, -1.154852e+00, -1.146622e+00, -1.138458e+00, -1.130361e+00, -1.122329e+00, -1.114361e+00, -1.106455e+00, -1.098612e+00, -1.090830e+00, -1.083108e+00, -1.075445e+00, -1.067841e+00, -1.060293e+00, -1.052803e+00, -1.045368e+00, -1.037988e+00, -1.030662e+00, -1.023389e+00, -1.016169e+00, -1.009000e+00, -1.001883e+00, -9.948155e-01, -9.877979e-01, -9.808293e-01, -9.739088e-01, -9.670359e-01, -9.602100e-01, -9.534303e-01, -9.466962e-01, -9.400073e-01, -9.333627e-01, -9.267620e-01, -9.202046e-01, -9.136900e-01, -9.072174e-01, -9.007865e-01, -8.943967e-01, -8.880475e-01, -8.817384e-01, -8.754687e-01, -8.692382e-01, -8.630462e-01, -8.568924e-01, -8.507761e-01, -8.446971e-01, -8.386548e-01, -8.326487e-01, -8.266786e-01, -8.207438e-01, -8.148441e-01, -8.089790e-01, -8.031481e-01, -7.973510e-01, -7.915873e-01, -7.858566e-01, -7.801586e-01, -7.744928e-01, -7.688590e-01, -7.632567e-01, 
-7.576857e-01, -7.521455e-01, -7.466359e-01, -7.411564e-01, -7.357068e-01, -7.302867e-01, -7.248959e-01, -7.195339e-01, -7.142006e-01, -7.088955e-01, -7.036185e-01, -6.983691e-01, -6.931472e-01, -6.879524e-01, -6.827844e-01, -6.776430e-01, -6.725279e-01, -6.674388e-01, -6.623755e-01, -6.573377e-01, -6.523252e-01, -6.473376e-01, -6.423749e-01, -6.374366e-01, -6.325226e-01, -6.276326e-01, -6.227664e-01, -6.179238e-01, -6.131045e-01, -6.083083e-01, -6.035350e-01, -5.987844e-01, -5.940563e-01, -5.893504e-01, -5.846665e-01, -5.800045e-01, -5.753641e-01, -5.707452e-01, -5.661475e-01, -5.615708e-01, -5.570150e-01, -5.524799e-01, -5.479652e-01, -5.434708e-01, -5.389965e-01, -5.345422e-01, -5.301076e-01, -5.256925e-01, -5.212969e-01, -5.169205e-01, -5.125632e-01, -5.082248e-01, -5.039052e-01, -4.996041e-01, -4.953214e-01, -4.910570e-01, -4.868107e-01, -4.825824e-01, -4.783719e-01, -4.741790e-01, -4.700036e-01, -4.658456e-01, -4.617048e-01, -4.575811e-01, -4.534743e-01, -4.493843e-01, -4.453110e-01, -4.412542e-01, -4.372138e-01, -4.331897e-01, -4.291816e-01, -4.251896e-01, -4.212135e-01, -4.172531e-01, -4.133083e-01, -4.093790e-01, -4.054651e-01, -4.015665e-01, -3.976830e-01, -3.938145e-01, -3.899609e-01, -3.861221e-01, -3.822980e-01, -3.784885e-01, -3.746934e-01, -3.709127e-01, -3.671462e-01, -3.633939e-01, -3.596556e-01, -3.559312e-01, -3.522206e-01, -3.485237e-01, -3.448405e-01, -3.411708e-01, -3.375144e-01, -3.338715e-01, -3.302417e-01, -3.266250e-01, -3.230214e-01, -3.194308e-01, -3.158529e-01, -3.122879e-01, -3.087355e-01, -3.051957e-01, -3.016683e-01, -2.981534e-01, -2.946507e-01, -2.911603e-01, -2.876821e-01, -2.842159e-01, -2.807616e-01, -2.773193e-01, -2.738888e-01, -2.704699e-01, -2.670628e-01, -2.636672e-01, -2.602831e-01, -2.569104e-01, -2.535491e-01, -2.501990e-01, -2.468601e-01, -2.435323e-01, -2.402155e-01, -2.369097e-01, -2.336149e-01, -2.303308e-01, -2.270575e-01, -2.237948e-01, -2.205428e-01, -2.173013e-01, -2.140703e-01, -2.108496e-01, -2.076394e-01, 
-2.044394e-01, -2.012496e-01, -1.980699e-01, -1.949003e-01, -1.917408e-01, -1.885912e-01, -1.854514e-01, -1.823216e-01, -1.792014e-01, -1.760910e-01, -1.729902e-01, -1.698990e-01, -1.668174e-01, -1.637452e-01, -1.606824e-01, -1.576289e-01, -1.545848e-01, -1.515499e-01, -1.485242e-01, -1.455076e-01, -1.425001e-01, -1.395016e-01, -1.365120e-01, -1.335314e-01, -1.305596e-01, -1.275967e-01, -1.246424e-01, -1.216969e-01, -1.187601e-01, -1.158318e-01, -1.129121e-01, -1.100009e-01, -1.070981e-01, -1.042038e-01, -1.013178e-01, -9.844007e-02, -9.557063e-02, -9.270940e-02, -8.985633e-02, -8.701138e-02, -8.417450e-02, -8.134564e-02, -7.852476e-02, -7.571182e-02, -7.290677e-02, -7.010957e-02, -6.732016e-02, -6.453852e-02, -6.176459e-02, -5.899834e-02, -5.623972e-02, -5.348868e-02, -5.074520e-02, -4.800922e-02, -4.528070e-02, -4.255961e-02, -3.984591e-02, -3.713955e-02, -3.444049e-02, -3.174870e-02, -2.906413e-02, -2.638676e-02, -2.371653e-02, -2.105341e-02, -1.839737e-02, -1.574836e-02, -1.310635e-02, -1.047130e-02, -7.843177e-03, -5.221944e-03, -2.607563e-03, -5.773160e-15, 2.600782e-03, 5.194817e-03, 7.782140e-03, 1.036279e-02, 1.293679e-02, 1.550419e-02, 1.806501e-02, 2.061929e-02, 2.316706e-02, 2.570836e-02, 2.824321e-02, 3.077166e-02, 3.329373e-02, 3.580945e-02, 3.831886e-02, 4.082199e-02, 4.331887e-02, 4.580954e-02, 4.829401e-02, 5.077233e-02, 5.324451e-02, 5.571061e-02, 5.817063e-02, 6.062462e-02, 6.307260e-02, 6.551461e-02, 6.795066e-02, 7.038080e-02, 7.280504e-02, 7.522342e-02, 7.763597e-02, 8.004271e-02, 8.244367e-02, 8.483888e-02, 8.722837e-02, 8.961216e-02, 9.199028e-02, 9.436276e-02, 9.672963e-02, 9.909090e-02, 1.014466e-01, 1.037968e-01, 1.061415e-01, 1.084806e-01, 1.108144e-01, 1.131427e-01, 1.154655e-01, 1.177830e-01, 1.200952e-01, 1.224020e-01, 1.247035e-01, 1.269997e-01, 1.292906e-01, 1.315764e-01, 1.338569e-01, 1.361322e-01, 1.384023e-01, 1.406673e-01, 1.429272e-01, 1.451820e-01, 1.474317e-01, 1.496764e-01, 1.519160e-01, 1.541507e-01, 1.563803e-01, 
1.586050e-01, 1.608248e-01, 1.630396e-01, 1.652496e-01, 1.674546e-01, 1.696549e-01, 1.718503e-01, 1.740408e-01, 1.762266e-01, 1.784077e-01, 1.805839e-01, 1.827555e-01, 1.849223e-01, 1.870845e-01, 1.892420e-01, 1.913949e-01, 1.935431e-01, 1.956867e-01, 1.978257e-01, 1.999602e-01, 2.020901e-01, 2.042155e-01, 2.063364e-01, 2.084528e-01, 2.105648e-01, 2.126723e-01, 2.147753e-01, 2.168739e-01, 2.189682e-01, 2.210580e-01, 2.231436e-01, 2.252247e-01, 2.273016e-01, 2.293741e-01, 2.314424e-01, 2.335063e-01, 2.355661e-01, 2.376216e-01, 2.396729e-01, 2.417199e-01, 2.437628e-01, 2.458016e-01, 2.478362e-01, 2.498666e-01, 2.518930e-01, 2.539152e-01, 2.559334e-01, 2.579475e-01, 2.599575e-01, 2.619635e-01, 2.639655e-01, 2.659635e-01, 2.679576e-01, 2.699476e-01, 2.719337e-01, 2.739159e-01, 2.758941e-01, 2.778685e-01, 2.798389e-01, 2.818055e-01, 2.837682e-01, 2.857270e-01, 2.876821e-01, 2.896333e-01, 2.915807e-01, 2.935243e-01, 2.954642e-01, 2.974003e-01, 2.993327e-01, 3.012613e-01, 3.031863e-01, 3.051075e-01, 3.070250e-01, 3.089389e-01, 3.108491e-01, 3.127557e-01, 3.146587e-01, 3.165580e-01, 3.184537e-01, 3.203459e-01, 3.222345e-01, 3.241195e-01, 3.260009e-01, 3.278789e-01, 3.297533e-01, 3.316242e-01, 3.334916e-01, 3.353555e-01, 3.372160e-01, 3.390730e-01, 3.409266e-01, 3.427767e-01, 3.446234e-01, 3.464668e-01, 3.483067e-01, 3.501432e-01, 3.519764e-01, 3.538062e-01, 3.556327e-01, 3.574559e-01, 3.592757e-01, 3.610923e-01, 3.629055e-01, 3.647154e-01, 3.665221e-01, 3.683256e-01, 3.701257e-01, 3.719227e-01, 3.737164e-01, 3.755069e-01, 3.772942e-01, 3.790784e-01, 3.808593e-01, 3.826371e-01, 3.844117e-01, 3.861832e-01, 3.879515e-01, 3.897168e-01, 3.914789e-01, 3.932379e-01, 3.949938e-01, 3.967467e-01, 3.984964e-01, 4.002432e-01, 4.019868e-01, 4.037275e-01, 4.054651e-01, 4.071997e-01, 4.089313e-01, 4.106599e-01, 4.123856e-01, 4.141082e-01, 4.158279e-01, 4.175446e-01, 4.192584e-01, 4.209693e-01, 4.226772e-01, 4.243823e-01, 4.260844e-01, 4.277836e-01, 4.294800e-01, 4.311735e-01, 
4.328641e-01, 4.345518e-01, 4.362368e-01, 4.379189e-01, 4.395981e-01, 4.412746e-01, 4.429482e-01, 4.446190e-01, 4.462871e-01, 4.479524e-01, 4.496149e-01, 4.512746e-01, 4.529316e-01, 4.545859e-01, 4.562374e-01, 4.578862e-01, 4.595323e-01, 4.611757e-01, 4.628164e-01, 4.644544e-01, 4.660897e-01, 4.677224e-01, 4.693524e-01, 4.709797e-01, 4.726044e-01, 4.742265e-01, 4.758459e-01, 4.774627e-01, 4.790769e-01, 4.806885e-01, 4.822975e-01, 4.839040e-01, 4.855078e-01, 4.871091e-01, 4.887078e-01, 4.903040e-01, 4.918976e-01, 4.934887e-01, 4.950773e-01, 4.966633e-01, 4.982468e-01, 4.998279e-01, 5.014064e-01, 5.029824e-01, 5.045560e-01, 5.061271e-01, 5.076957e-01, 5.092619e-01, 5.108256e-01, 5.123869e-01, 5.139458e-01, 5.155022e-01, 5.170562e-01, 5.186078e-01, 5.201570e-01, 5.217037e-01, 5.232481e-01, 5.247902e-01, 5.263298e-01, 5.278671e-01, 5.294020e-01, 5.309346e-01, 5.324648e-01, 5.339927e-01, 5.355182e-01, 5.370415e-01, 5.385624e-01, 5.400810e-01, 5.415973e-01, 5.431113e-01, 5.446230e-01, 5.461324e-01, 5.476396e-01, 5.491445e-01, 5.506471e-01, 5.521475e-01, 5.536456e-01, 5.551415e-01, 5.566352e-01, 5.581266e-01, 5.596158e-01, 5.611028e-01, 5.625876e-01, 5.640701e-01, 5.655505e-01, 5.670287e-01, 5.685047e-01, 5.699786e-01, 5.714502e-01, 5.729198e-01, 5.743871e-01, 5.758523e-01, 5.773154e-01, 5.787763e-01, 5.802351e-01, 5.816917e-01, 5.831463e-01, 5.845987e-01, 5.860490e-01, 5.874973e-01, 5.889434e-01, 5.903874e-01, 5.918294e-01, 5.932693e-01, 5.947071e-01, 5.961429e-01, 5.975766e-01, 5.990082e-01, 6.004378e-01, 6.018653e-01, 6.032909e-01, 6.047143e-01, 6.061358e-01, 6.075553e-01, 6.089727e-01, 6.103881e-01, 6.118015e-01, 6.132130e-01, 6.146224e-01, 6.160299e-01, 6.174354e-01, 6.188389e-01, 6.202404e-01, 6.216400e-01, 6.230376e-01, 6.244333e-01, 6.258270e-01, 6.272188e-01, 6.286087e-01, 6.299966e-01, 6.313826e-01, 6.327667e-01, 6.341488e-01, 6.355291e-01, 6.369075e-01, 6.382839e-01, 6.396585e-01, 6.410312e-01, 6.424020e-01, 6.437709e-01, 6.451380e-01, 6.465031e-01, 
6.478665e-01, 6.492279e-01, 6.505876e-01, 6.519453e-01, 6.533013e-01, 6.546554e-01, 6.560076e-01, 6.573581e-01, 6.587067e-01, 6.600535e-01, 6.613985e-01, 6.627417e-01, 6.640830e-01, 6.654226e-01, 6.667604e-01, 6.680964e-01, 6.694307e-01, 6.707631e-01, 6.720938e-01, 6.734227e-01, 6.747498e-01, 6.760752e-01, 6.773988e-01, 6.787207e-01, 6.800408e-01, 6.813592e-01, 6.826759e-01, 6.839908e-01, 6.853040e-01, 6.866155e-01, 6.879252e-01, 6.892333e-01, 6.905396e-01, 6.918442e-01, 6.931472e-01 }; // this guy does linear interpolation, it's fun double crm_log(double x) { double r = 0, g; int i; while(x >= 2.0) { r += log_lookup_table[768]; //this is (log(2) x /= 2.0; } i = (int)(x * 384.0); g = x - ((double)i) / 384.0; r += (1.0 - g) * log_lookup_table[ i ] + g * log_lookup_table[ i + 1 ]; return r; } #define ONE_OVER_SQRT_2PI 0.3989422804014327 double norm_pdf(double x) { return ONE_OVER_SQRT_2PI * exp( -0.5 * x * x); } // this guy makes it so x = 0 yields 1, this is just for when you // want normal shaped curves double normalized_gauss(double x, double s) { return exp( -0.5 * x * x / (s * s)); } double crm_frand() { return (double)rand() / (double)RAND_MAX; } void print_histogram_float(float *f, int n, int n_buckets) { int *buckets, i; float min, max, s; buckets = malloc(sizeof(int) * n_buckets); min = max = f[0]; for(i = 1; i < n; i++) if(f[i] > max) max = f[i]; else if(f[i] < min) min = f[i]; s = ( (float)n_buckets - 0.01 ) / ( max - min); for(i = 0; i < n_buckets; i++) buckets[i] = 0; for(i = 0; i < n; i++) buckets[ (int)( s * (f[i] - min) ) ]++; fprintf(stderr, "min: %0.4f, max: %0.4f\n", min, max); for(i = 0; i < n_buckets; i++) fprintf(stderr, "( %0.4f - %0.4f ): %d\n", (float)i / s + min, (float)i / s + min + 1.0 / s, buckets[i]); free(buckets); } crm114-20100106-BlameMichelson.src/priolist.mfp0000644000000000017500000000033411321154266017346 0ustar rootwsy# # Comments are anything with a # in front, accept with +, reject with - # # Accept anything from my friends at 
MIT +mit\.edu # Reject anything from those idiots at WeAreIdiotSpammers.com -weareidiotspammers\.com crm114-20100106-BlameMichelson.src/tenfold_validate.crm0000755000000000017500000002646511321154266021024 0ustar rootwsy#! /usr/bin/crm # # tenfold_validate.crm - Do 10-fold validation on N classes # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # Program to do 10-fold validation on N classes using various classifiers. # # 1) the first user arg is the name of an index file, in TREC format, but # we allow arbitrary classnames and directories instead of just "ham" # and "spam", eg: # # arb_classname1 ./foos/file0001.txt # arb_classname2 ./bars/file0001.txt # # 2) The classnames are "skimmed" from the index file by doing a prepass, # then the class files are created by learning a tiny text into each one. # # 3) Then the index file is divided into ten parts; then the parts are # used for 10-fold validation. The results are concatenated into a # "results" file, also in TREC format, but note that this is NOT a # proper TREC style file as it doesn't preserve sequence information # (and will keep our training classnames, which may or may not match # work with the TREC analyzers.) # # 4) We consider the case of a result pR == 0.00 exactly to be WRONG for all # classes, and train whenever it occurs. 
window isolate (:verbose:) // isolate (:clf:) /osb unique microgroom/ isolate (:regex:) // isolate (:doublesided:) // isolate (:thickness:) /5.0/ isolate (:show_partitions:) // isolate (:decision_length:) /2048/ isolate (:results:) /results/ isolate (:input_filter:) // isolate (:show_pr:) // # isolate (:initial_text:) /The quick brown fox jumped over the lazy dog's back 012345679/ # # get the index file output /Reading from index file :*:_arg2:\n/ input [:*:_arg2:] (:index_file:) # # Scan the index file for classnames and .stat file names # isolate (:cnames:) // isolate (:cstatfiles:) // isolate (:s:) // { match [:index_file:] /([[:graph:]]+).*/ \ (:: :classname:) { match [:cnames:] /:*:classname:/ alter (:cnames:) /:*:cnames::*:classname:\n/ alter (:cstatfiles:) /:*:cstatfiles: :*:classname:.stat/ } liaf } { # Funny business for SVM and SKS soltion files match [:clf:] /sks|svm/ match [:cstatfiles:] /[[:graph:]]( )[[:graph:]]/ (:: :midspace:) alter (:midspace:) / | / alter (:cstatfiles:) /:*:cstatfiles: | versus.stat/ output /SVM\/SKS special form: :*:cstatfiles:\n/ } output /Classify\/Learn Flags: :*:clf:\n/ output /Classes found:\n:*:cnames:/ # # # Divide filenames into 10 groups (for 10-fold validation) match [:index_file:] // isolate (:f0: :f1: :f2: :f3: :f4: :f5: :f6: :f7: :f8: :f9:) { match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter (:f0:) /:*:f0:\n:*:filename:/ match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter (:f1:) /:*:f1:\n:*:filename:/ match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter (:f2:) /:*:f2:\n:*:filename:/ match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter (:f3:) /:*:f3:\n:*:filename:/ match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter (:f4:) /:*:f4:\n:*:filename:/ match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter (:f5:) /:*:f5:\n:*:filename:/ match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter 
(:f6:) /:*:f6:\n:*:filename:/ match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter (:f7:) /:*:f7:\n:*:filename:/ match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter (:f8:) /:*:f8:\n:*:filename:/ match [:index_file:] \ /[[:graph:]]+ ([[:graph:]]+)/ (:filename:) alter (:f9:) /:*:f9:\n:*:filename:/ liaf } { match [:show_partitions:] /SET/ output /F0: \n:*:f0:\n\n/ output /F1: \n:*:f1:\n\n/ output /F2: \n:*:f2:\n\n/ output /F3: \n:*:f3:\n\n/ output /F4: \n:*:f4:\n\n/ output /F5: \n:*:f5:\n\n/ output /F6: \n:*:f6:\n\n/ output /F7: \n:*:f7:\n\n/ output /F8: \n:*:f8:\n\n/ output /F9: \n:*:f9:\n\n/ } # # Create the filenames vector match [:cnames:] // isolate (:filenames:) // { match /[[:graph:]]+/ [:cnames:] (:name:) alter (:filenames:) /:*:filenames: :*:name:.stat / liaf } # Now the big part of the work. # # Run the first validation batch { call /:clean_the_files:/ call /:learn_files:/ [:*:f0:] call /:learn_files:/ [:*:f1:] call /:learn_files:/ [:*:f2:] call /:learn_files:/ [:*:f3:] call /:learn_files:/ [:*:f4:] call /:learn_files:/ [:*:f5:] call /:learn_files:/ [:*:f6:] call /:learn_files:/ [:*:f7:] call /:learn_files:/ [:*:f8:] call /:check_files:/ [:*:f9:] } # # Run the second validation batch { call /:clean_the_files:/ call /:learn_files:/ [:*:f1:] call /:learn_files:/ [:*:f2:] call /:learn_files:/ [:*:f3:] call /:learn_files:/ [:*:f4:] call /:learn_files:/ [:*:f5:] call /:learn_files:/ [:*:f6:] call /:learn_files:/ [:*:f7:] call /:learn_files:/ [:*:f8:] call /:learn_files:/ [:*:f9:] call /:check_files:/ [:*:f0:] } # # # Run the third validation batch { call /:clean_the_files:/ call /:learn_files:/ [:*:f2:] call /:learn_files:/ [:*:f3:] call /:learn_files:/ [:*:f4:] call /:learn_files:/ [:*:f5:] call /:learn_files:/ [:*:f6:] call /:learn_files:/ [:*:f7:] call /:learn_files:/ [:*:f8:] call /:learn_files:/ [:*:f9:] call /:learn_files:/ [:*:f0:] call /:check_files:/ [:*:f1:] } # # Run the fourth validation batch { call 
/:clean_the_files:/ call /:learn_files:/ [:*:f3:] call /:learn_files:/ [:*:f4:] call /:learn_files:/ [:*:f5:] call /:learn_files:/ [:*:f6:] call /:learn_files:/ [:*:f7:] call /:learn_files:/ [:*:f8:] call /:learn_files:/ [:*:f9:] call /:learn_files:/ [:*:f0:] call /:learn_files:/ [:*:f1:] call /:check_files:/ [:*:f2:] } # # Run the fifth validation batch { call /:clean_the_files:/ call /:learn_files:/ [:*:f4:] call /:learn_files:/ [:*:f5:] call /:learn_files:/ [:*:f6:] call /:learn_files:/ [:*:f7:] call /:learn_files:/ [:*:f8:] call /:learn_files:/ [:*:f9:] call /:learn_files:/ [:*:f0:] call /:learn_files:/ [:*:f1:] call /:learn_files:/ [:*:f2:] call /:check_files:/ [:*:f3:] } # # Run the sixth validation batch { call /:clean_the_files:/ call /:learn_files:/ [:*:f5:] call /:learn_files:/ [:*:f6:] call /:learn_files:/ [:*:f7:] call /:learn_files:/ [:*:f8:] call /:learn_files:/ [:*:f9:] call /:learn_files:/ [:*:f0:] call /:learn_files:/ [:*:f1:] call /:learn_files:/ [:*:f2:] call /:learn_files:/ [:*:f3:] call /:check_files:/ [:*:f4:] } # # Run the seventh validation batch { call /:clean_the_files:/ call /:learn_files:/ [:*:f6:] call /:learn_files:/ [:*:f7:] call /:learn_files:/ [:*:f8:] call /:learn_files:/ [:*:f9:] call /:learn_files:/ [:*:f0:] call /:learn_files:/ [:*:f1:] call /:learn_files:/ [:*:f2:] call /:learn_files:/ [:*:f3:] call /:learn_files:/ [:*:f4:] call /:check_files:/ [:*:f5:] } # # Run the eighth validation batch { call /:clean_the_files:/ call /:learn_files:/ [:*:f7:] call /:learn_files:/ [:*:f8:] call /:learn_files:/ [:*:f9:] call /:learn_files:/ [:*:f0:] call /:learn_files:/ [:*:f1:] call /:learn_files:/ [:*:f2:] call /:learn_files:/ [:*:f3:] call /:learn_files:/ [:*:f4:] call /:learn_files:/ [:*:f5:] call /:check_files:/ [:*:f6:] } # # Run the ninth validation batch { call /:clean_the_files:/ call /:learn_files:/ [:*:f8:] call /:learn_files:/ [:*:f9:] call /:learn_files:/ [:*:f0:] call /:learn_files:/ [:*:f1:] call /:learn_files:/ [:*:f2:] call 
/:learn_files:/ [:*:f3:] call /:learn_files:/ [:*:f4:] call /:learn_files:/ [:*:f5:] call /:learn_files:/ [:*:f6:] call /:check_files:/ [:*:f7:] } # # Run the tenth validation batch { call /:clean_the_files:/ call /:learn_files:/ [:*:f9:] call /:learn_files:/ [:*:f0:] call /:learn_files:/ [:*:f1:] call /:learn_files:/ [:*:f2:] call /:learn_files:/ [:*:f3:] call /:learn_files:/ [:*:f4:] call /:learn_files:/ [:*:f5:] call /:learn_files:/ [:*:f6:] call /:learn_files:/ [:*:f7:] call /:check_files:/ [:*:f8:] } exit # :clean_the_files: # Empty the statistics files, create fresh empty ones # output /\n/ match [:cnames:] // { match /[[:graph:]]+/ [:cnames:] (:name:) output /Deleting old :*:name:.stat\n/ syscall /rm -rf :*:name:.stat / output /CREATING :*:name:.stat with :*:initial_text: \n/ learn <:*:clf:> /:*:regex:/ [:initial_text:] (:*:name:.stat) # syscall /ls -la 1>&2 / liaf { match [:clf:] /svm|sks/ syscall /rm -rf versus.stat/ } } return # :learn_files: (:file_list:) # match [:file_list:] // { # output /Cstatfiles: ":*:cstatfiles:" \n/ match [:file_list:] \ /([[:graph:]]+)[[:blank:]]+([[:graph:]]+)/ (:: :cnam: :fnam:) #output /\nExample file: :*:fnam: (:*:cnam:) / input [ :*:fnam: 0 :*:decision_length: ] (:ftext:) { # is there an input filter? match [:input_filter:] /./ syscall /:*:input_filter:/ (:*:ftext:) (:ftext:) #output /text: :*:ftext:\n/ } { classify <:*:clf:> /:*:regex:/ [:ftext:] (:*:cstatfiles:) (:s:) } #output /:*:s:\n/ # Did our classify result say we're good? { match [:s:] (:L: :pr:) \ /^\#. 
\(:*:cnam:.*pR:[[:blank:]]+([[:graph:]]+)/ { output /./ eval /:@: :*:pr: <= :*:thickness: :/ { eval /:@: :*:pr: < 0 :/ output /\nX/ } { { match [:verbose:] /./ output /\nExample file: :*:fnam: (:*:cnam:) / output /(pR: :*:pr:) learning into :*:cnam:.stat / } alius { match [:show_pr:] /./ output / :*:cnam: :*:pr: / } } learn <:*:clf:> [:ftext:] /:*:regex:/ (:*:cnam:.stat) { # if doublesided, go through the list of all # classfiles and anti-learn if it's not # our class. match [:doublesided:] /./ match [:cstatfiles:] /.*/ (:cs_alt:) { match [:cs_alt:] \ /[[:graph:]]+/ (:csfil:) { match [:csfil:] \ /:*:cnam:/ output \ /learn-out: :*:csfil:\n/ learn <:*:clf: refute> \ [:ftext:] (:*:csfil:) /:*:regex:/ } liaf } } { # fixup for versus-file anomaly in svm/sks match [:clf:] /svm|sks/ learn <:*:clf:> /:*:regex:/ \ (:*:cstatfiles: ) } { match [:verbose:] /./ output / trained./ } } } liaf } return :check_files: (:file_list:) # output /\nNow the final testing 10%: / output [:*:results:] /=-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=\n/ match [:file_list:] // { match [:file_list:] \ /([[:graph:]]+)[[:blank:]]+([[:graph:]]+)/ (:: :cnam: :fnam:) output [:*:results:] /File: :*:fnam: class: :*:cnam: / input [:*:fnam: 0 :*:decision_length:] (:ftext:) { classify <:*:clf:> /:*:regex:/ [:ftext:] (:*:cstatfiles:) (:s:) # output /:*:s:\n/ } # Get our :*:results: back { { match [:s:] (:: :pr:) \ /^\#. \(:*:cnam:.*pR:[[:blank:]]+([[:graph:]]+)/ } alius output / BOGUS!!!!!\n/ } # Did our classify result say we're good? 
{ { eval /:@: :*:pr: > 0.0 :/ output /-/ output [:*:results:] \ / pR: :*:pr: CORRECT.\n/ } alius { output /X/ output [:*:results:] \ / pR: :*:pr: WRONG.\n/ } } liaf } return crm114-20100106-BlameMichelson.src/crm114.spec0000644000000000017500000000322511321154266016662 0ustar rootwsySummary: CRM114 Bayesian Spam Detector Name: crm114 Version: 20031215RC12 Release: 1 URL: http://crm114.sourceforge.net/ License: GPL Group: Applications/CPAN Source0: http://crm114.sourceforge.net/%{name}-%{version}.src.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-root BuildPreReq: tre-devel %description CRM114 is a system to examine incoming e-mail, system log streams, data files or other data streams, and to sort, filter, or alter the incoming files or data streams according to the user's wildest desires. Criteria for categorization of data can be by satisfaction of regexes, by sparse binary polynomial matching with a Bayesian Chain Rule evaluator, or by other means. %prep %setup -q -n %{name}-%{version}.src %build make INSTALL_DIR=$RPM_BUILD_ROOT%{_bindir} %clean rm -rf $RPM_BUILD_ROOT %install rm -rf $RPM_BUILD_ROOT mkdir -p $RPM_BUILD_ROOT%{_bindir} make BINDIR=${RPM_BUILD_ROOT}%{_bindir} install [ -x /usr/lib/rpm/brp-compress ] && /usr/lib/rpm/brp-compress %files %defattr(-,root,root) %{_bindir}/* %doc *.txt *.recipe %changelog * Mon Dec 15 2003 Bill Yerazunis - removed -RCx stuff, now version contains it. - updated for version 20031215-RC12 - License is GPL, not Artistic, so I corrected that. * Sat Dec 13 2003 Kevin Fenzi - Converted line endings from dos format to unix. - Changed BuildPreReq to be 'tre-devel' - Fixed install to install into rpm build root. - tested on redhat 9 with latest tre. 
* Tue Oct 22 2003 Nico Kadel-Garcia - Created RedHat compatible .spec file - Added libtre dependency to avoid building second package - Hard-coded "INSTALL_DIR" in build/install setups crm114-20100106-BlameMichelson.src/crm_var_hash_table.c0000644000000000017500000015015011321154266020746 0ustar rootwsy// crm_var_hash_table.c - handle variable hash tables // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *tempbuf; // initialize the variable hash table (the vht) // and stuff in the "standards" (:_vars:, environment vars) // void crm_vht_init (int argc, char **argv) { long i, j, k; long uvstart = 0; // uvstart is the arg that the user sees (post "--") long uvlist = 0; char uvset[MAX_VARNAME]; extern char **environ; char posvars[MAX_VARNAME]; // create the variable hash table (one big one, shared ) vht = (VHT_CELL **) malloc (sizeof (VHT_CELL *) * vht_size); if (!vht) untrappableerror5("Couldn't malloc VHT cell.\n", "No VHT cells, no variables, so no can run. Sorry.", CRM_ENGINE_HERE); for (i = 0; i < vht_size; i++) vht[i] = NULL; // initialize the temporary (non-data-window) area... tdw = malloc (sizeof (CSL_CELL)); if (!tdw) untrappableerror5 ("Couldn't malloc tdw.\n" "We need the TDW for isolated variables." "Can't continue. 
Sorry.\n","", CRM_ENGINE_HERE); tdw->filename = NULL; tdw->rdwr = 1; tdw->filedes = -1; tdw->filetext = malloc (sizeof (char) * data_window_size); if (!tdw->filetext) untrappableerror5("Couldn't malloc tdw->filetext.\n" "Without this space, you can't have any isolated " "variables,\n and we're stuck. Sorry.","", CRM_ENGINE_HERE); tdw->filetext[0] = '\000'; tdw->nchars = 0; tdw->hash = 0; tdw->mct = NULL; tdw->nstmts = -1; tdw->cstmt = -1; tdw->caller = NULL; // install a few constants. crm_set_temp_var (":_nl:", "\n"); crm_set_temp_var (":_ht:", "\t"); crm_set_temp_var (":_bs:", "\b"); crm_set_temp_var (":_sl:", "/"); crm_set_temp_var (":_sc:", ";"); crm_set_temp_var (":_cd:", "0"); crm_set_temp_var ("::", " "); // put the version string in as a variable. { char verstr[1025]; verstr[0] = 0; strcat (verstr, VERSION); strcat (verstr, " ( "); strcat (verstr, crm_regversion()); strcat (verstr, " )"); crm_set_temp_var (":_crm_version:", verstr); }; // // install the argc and argv values; restart argv values from [2] // if a "--" metaflag is seen. // // argv[0] and argv[1] are not overrideable by "--". crm_set_temp_var ( ":_arg0:", argv[0] ); crm_set_temp_var ( ":_arg1:", argv[1] ); // Check to see if there's a "--" arg. If so, mark uvstart // (that is, "user var start" at that point)... but only the first "--". { long i, j; uvstart = 2; i = 0; j = 0; for (i = 2; argc > i; i++) { // Check for the "--" metaflag if (strlen (argv[i]) == 2 && strncmp (argv[i], "--", 2) == 0 && uvstart == 2) { if (internal_trace) fprintf (stderr, "Resetting uvstart counter to 2\n"); uvstart = i+1; }; } }; // The user variables start at argv[uvstart] { long i, j; char anamebuf [255]; j = 2; for ( i = uvstart; argc > i; i++ ) { sprintf (anamebuf, ":_arg%ld:", j); crm_set_temp_var ( anamebuf, argv[i] ); j++; }; // // and put the "user-visible" argc into a var as well. 
sprintf (anamebuf, "%ld", j ); crm_set_temp_var (":_argc:", anamebuf); // // Go through argv, and place positional arguments (that is, // arguments that don't contain any '-' preambles) into // :_pos0:, :_pos1:, ... // // :_pos0: is always the name of the CRM114 engine. // :_pos1: is always the name of the program being run. // :_pos2: and so on are the command line args. // // prepare to treasure up the positional args posvars[0] = '\000'; j = 0; for ( i = uvstart; i < argc ; i++ ) { // // check for the "-" sign; this is a positional argument only // if there is no "-" sign. if (argv[i][0] != '-') { sprintf (anamebuf, ":_pos%ld:", j); crm_set_temp_var ( anamebuf, argv[i] ); j++; if (j>0) strcat (posvars, " "); strcat (posvars, argv[i]); }; }; sprintf (anamebuf, "%ld", j); crm_set_temp_var (":_posc:", anamebuf); crm_set_temp_var (":_pos_str:", posvars); // // and set the fault to be a null string for now. crm_set_temp_var (":_fault:", ""); // // set the current line number to a set of zeroes... crm_set_temp_var (":_cs:", "00000000"); // // Set the "lazy" intermediate variable to just a space. // This will get rebound to point to the active lazy var. crm_set_temp_var (":_lazy:", " "); // set the current pid and parent pid. { char pidstr [32]; long pid; pid = (long) getpid(); sprintf (pidstr, "%ld", pid); crm_set_temp_var (":_pid:", pidstr); #ifndef CRM_WINDOWS pid = (long) getppid(); sprintf (pidstr, "%ld", pid); crm_set_temp_var (":_ppid:", pidstr); #endif // !CRM_WINDOWS } }; // now, we shove the whole contents of the ENVIRON // vector into the VHT. i = 0; tempbuf[0] = '\000'; if ( ! ignore_environment_vars) while (environ [i]) { char *name; char *value ; j = 0; if (strlen (tempbuf) + strlen (environ[i]) < (data_window_size - 1000)) { strcat (tempbuf, environ[i]); strcat (tempbuf, "\n"); } else untrappableerror5 ("The ENVIRONMENT variables don't fit into the " "available space. \nThis is very broken. 
Try " "a larger data window (with flag -w NNNNN), \nor " "drop the environment vars with " "the (with flag -e)", "", CRM_ENGINE_HERE); while (environ[i][j] != '=') j++; name = (char *) malloc ((sizeof (char)) * (j+200)); if (!name) untrappableerror5("Couldn't malloc :_env_ space." "Can't continue.\n","", CRM_ENGINE_HERE); strcpy (name, ":_env_"); memmove (&(name[strlen(name)]), &(environ[i][0]), j); name[j+6] = '\000'; strcat (name, ":"); j++; // step past the equals sign. k = 0; value = strdup (&(environ[i][j+k])); crm_set_temp_var (name, value); free (name); free (value); i++; // and do the next environment variable }; crm_set_temp_var (":_env_string:", tempbuf); // see if argv [1] is a '-( whatever) arg, which limits the // set of runtime parameters allowed on the command line. // If so, we have the limit list. We put spaces around the // args so we can just use strstr(3) to see if an arg is permitted // or if we should fault out. Note that at this point, // we've trashed the contents of uvlist (the parens and the // trailing '--', if there was one. // if (strncmp (argv[1], "-(", 2) == 0) { long closepos; uvlist = 1; strcpy (uvset, " "); strncat (uvset, &argv[1][2], strlen (argv[1]) - 3); // nuke the closing paren closepos = 2; while (uvset[closepos] != ')' && uvset[closepos] != '\000') closepos++; uvset[closepos] = '\000'; strcat (uvset, " "); if (user_trace) fprintf (stderr, "UVset: =%s=\n", uvset); } // // // go through argv again, but this time look for "--foo" // and "--foo=bar" args. 
// { long i, j, k; char anamebuf [MAX_VARNAME]; char avalbuf [MAX_VARNAME]; long isok; i = 0; j = 0; k = 0; for ( i = uvstart; argc > i; i++ ) { // check for the "--" metaflag preamble if (strlen ( argv[i] ) > 2 && strncmp (argv[i], "--", 2) == 0) { isok = 1; if (uvlist == 1) { isok = 0; // build a testable name out of the -- flagname strcpy (anamebuf, " "); j=2; k = 1; while (argv[i][j] != '\000' && argv[i][j] != '=') { anamebuf[k] = argv[i][j]; j++; k++; }; anamebuf[k] = 0; strcat (anamebuf, " "); // // now we have the var name, surrounded by spaces // we strstr() it to see if it's allowed or not. if (strstr(uvset, anamebuf)) isok = 1; // // Well, maybe the name by itself is too loose; // also allow name=value strcpy (anamebuf, " "); strcat (anamebuf, &argv[i][2]); strcat (anamebuf, " "); if (strstr(uvset, anamebuf)) isok = 1; } if (isok) { if (internal_trace) fprintf (stderr, "setting cmdline string %s", argv[i]); strcpy (avalbuf, "SET"); j = 2; k = 0; // copy the varname into anamebuf anamebuf[k] = ':'; k++; while (argv[i][j] != '\000' && argv[i][j] != '=') { anamebuf[k] = argv[i][j]; j++; k++; }; anamebuf[k] = ':'; k++; anamebuf[k] = '\000'; if (argv[i][j] == '=') { j++; // skip over the = sign k = 0; while (argv[i][j] != '\000') { avalbuf[k] = argv[i][j]; j++; k++; } avalbuf [k] = '\000'; } if (user_trace) fprintf (stderr, "\n Setting cmdline var '%s' to '%s'\n", anamebuf, avalbuf); crm_set_temp_var ( anamebuf, avalbuf ); } else { fprintf (stderr, "\n ***Warning*** " "This program does not accept the " "flag '%s' , \n", anamebuf); fprintf (stderr, " so we'll just ignore it for now. \n"); }; }; }; }; } // routine to put a variable into the temporary (tdw) // buffer. names and values end up interleaved // sequentially, separated by newlines. TDW really should have // been called the idw (Isolated Data Window) but it's too // late to fix it now. 
// // void crm_set_temp_nvar (char *varname, char *value, long vallen) { long namestart, namelen; long valstart; long i; long vnidx, vnlen; // do the internal_trace thing if (internal_trace) fprintf (stderr, " setting temp-area variable %s to value %s\n", varname, value); i = crm_nextword (varname,strlen (varname), 0, &vnidx, &vnlen); if ( i == 0) { nonfatalerror5 ("Somehow, you are assigning a value to a variable with", "an unprintable name. I'll permit it for now, but" "your program is probably broken.", CRM_ENGINE_HERE); }; if ( (strlen (varname) + vallen + tdw->nchars + 1024) > data_window_size) { nonfatalerror5 ("This program has overflowed the ISOLATEd data " "area with a variable that's just too big. We'll " "clip the tail off the string to fit in available memory. " "The big bad variable was named: ", varname, CRM_ENGINE_HERE); vallen = data_window_size - (strlen (varname)) - tdw->nchars - 1024; if (vallen < 1) fatalerror5 ("Your program is so low on memory that it could not " "even clip the big variable. This is really bad. " "The evil variable was named: " , varname, CRM_ENGINE_HERE); }; // check- is this the first time we've seen this variable? Or // are we re-assigning a previous variable? i = crm_vht_lookup (vht, &varname[vnidx], vnlen); if (vht[i] == NULL) { // never assigned this variable before, so we stick it in the // tdr window. // // do the name first. Start with a newline. // GROT GROT GROT tdw->filetext[tdw->nchars] = '\n'; tdw->nchars++; namestart = tdw->nchars; namelen = vnlen; memmove (&(tdw->filetext[tdw->nchars]), &(varname[vnidx]), namelen); tdw->nchars = tdw->nchars + namelen; // // and add a separator to prevent the varname from sharing // an endpoint offset with the var value. 
tdw->filetext[tdw->nchars] = '='; tdw->nchars++; // // and the value second valstart = tdw->nchars; memmove (&tdw->filetext [tdw->nchars], value, vallen); tdw->nchars = tdw->nchars + vallen; // // add a separator again, so we don't get strings with overlapped // ranges into the var hash table tdw->filetext[tdw->nchars] = ' '; tdw->nchars++; // // and put a NUL at the end of the tdw, so debuggers won't get // all bent out of shape. tdw->filetext[tdw->nchars] = '\000'; // // now, we whack the actual VHT. crm_setvar (NULL, 0, tdw->filetext, namestart, namelen, tdw->filetext, valstart, vallen, 0, 0); // that's it. } else { // This variable is preexisting. Perform an ALTER on it. // crm_destructive_alter_nvariable ( &varname[vnidx], vnlen, value, vallen ); }; } // GROT GROT GROT this routine needs to replaced for 8-bit-safeness. // Use ONLY where you can be sure no embedded NULs will be seen (i.e. // fixed strings in the early startup. // void crm_set_temp_var (char *varname, char *value) { crm_set_temp_nvar (varname, value, strlen (value)); } // routine to put a data-window-based (the cdw, that is) // variable into the VHT. The text of the variable's name // goes into the tdw buffer, and the value stays in the main // data window (cdw) buffer. // // This is equivalent to a "bind" operation - that is, the // pointers move around, but the data window doesn't get // changed. // // Note - if you rebind a var, you should consider if your // routine should also evaluate the old area for reclamation. 
// (reclamation uses "crm_compress_tdw_section", see comments // further down in the code here) void crm_set_windowed_nvar ( char *varname, long varlen, char *valtext, long start, long len, long stmtnum) { long i; long namestart, namelen; // do the internal_trace thing if (internal_trace) { long i; fprintf (stderr, " setting data-window variable %s to value ", varname); for (i = start; i < start+len; i++) fprintf (stderr, "%c", valtext[i]); fprintf (stderr, "\n"); }; // check and see if the variable is already in the VHT i = crm_vht_lookup (vht, varname, varlen); if (vht[i] == NULL) { // nope, never seen this var before, add it into the VHT // namestart is where we are now. if (internal_trace) fprintf (stderr, "... new var\n"); // // Put the name into the tdw memory area, add a & after it. // // do the name first. Start on a newline. tdw->filetext[tdw->nchars] = '\n'; tdw->nchars++; namestart = tdw->nchars; namelen = varlen; memmove (&tdw->filetext[namestart], varname, varlen); tdw->nchars = tdw->nchars + namelen; // // put in an "&" separator tdw->filetext[tdw->nchars] = '&'; tdw->nchars++; // // now, we whack the actual VHT. crm_setvar (NULL, 0, tdw->filetext, namestart, namelen, valtext, start, len, stmtnum, 0); // that's it. } else { // We've seen this var before. But, there's a gotcha. // If the var _was_ in the tdw, but is now being moved back // to the cdw, or being rebound inside another tdw var, // then the prior var value might now be dead- that is, "leaked // memory", and now inaccessible. // { // move the text/start/len values around to accomodate the new // value. // if (internal_trace) fprintf (stderr, "... old var\n"); crm_setvar (NULL, 0, vht[i]->nametxt, vht[i]->nstart, vht[i]->nlen, valtext, start, len, stmtnum, 0); // Do we need to repair the leaked memory? 
Only necessary if the // old text was in the tdw area; this is harmless if the area // is in use by another var, but if we have removed the last // reference to any tdw-based vars, we ought to reclaim them.. // // NOTE - we don't do it here since synchronicity issues // between a var being rebound, reclamation happening, // and then another var _in the same match_ being bound // (to a old, unupdated set of offsets) is such a pain. // // Instead, routines using this routine should also be sure // to call crm_compress_tdw_section if there's a chance they // should be releasing TDW memory. AFTER they've done ALL the // rebinding. That way, all indices and offsets are in the VHT // where they can be safely updated. // }; }; } //#define RECLAIM_ALL_EVERY_TIME 1 #ifdef RECLAIM_ALL_EVERY_TIME // // How we compress out an area that might no longer be in use. static long crm_recursive_compress_tdw_section (char *oldtext, long oldstart, long oldend); long crm_compress_tdw_section (char *oldtext, long oldstart, long oldend) { // let's court death, and do a FULL compress. return (crm_recursive_compress_tdw_section (tdw->filetext, 0, tdw->nchars + 1)); } long crm_recursive_compress_tdw_section (char *oldtext, long oldstart, long oldend) #else // !RECLAIM_ALL_EVERY_TIME long crm_compress_tdw_section (char *oldtext, long oldstart, long oldend) #endif // !RECLAIM_ALL_EVERY_TIME { // The algorithm basically checks to see if there is any region of // the given tdw space that is not currently used by another var. // All such regions are reclaimed with a slice-n-splice. We return // the number of reclaimed characters. // // The algorithm starts out with start and end of the tenatively // unused "to be killed" region. It checks each member of the VHT // in the TDW. If the region overlaps, don't kill the overlapping // part of the region. If at any time the region length goes to 0, // we know that there's no region left to kill. 
(Option- if the // gap is less than MAX_RECLAIMER_GAP chars, we don't bother moving // it; we retain it as buffer. This minimizes thrashing // // NOTE that the END VALUES ONLY "oldend" and "newend" vars are // NON-inclusive, they index the first NON-involved character // (oldstart and newstart index "involved" characters, that we _do_ // include in our strings) // // BIG ISSUE: As coded, this routine needs to leave a _buffer_ of at // least one UNUSED character between each used (but isolated) string // area. Knowing when to get rid of extra copies of this character // has been a big hassle. Right now there may be a small leak here // so if you can find it, please let me know! Note that any fix that // does not keep two adjacent isolated regions from merging (including // when the first or second becomes a zero-length string!) will get // the submittor a gentle smile and a pointer to this very comment. // (the reason being that prior code that did not leave a buffer // exhibited the property that if A and B were isolated but adjacent, // and then A shrank to 0 length, then B would share the same start // point, and an alteration to A would then *also* insert at the start // point of B, causing A and B to become NONisolated and space-sharing. // That said- enjoy the bug hunt. :) long j, newstart, newend, reclaimed; j = newstart = newend = reclaimed = 0; // return (0); j = newstart = newend = reclaimed = 0; if (internal_trace) fprintf (stderr, " [ Compressing isolated data. Length %ld chars, " "start %ld, len %ld ]\n", tdw->nchars, oldstart, oldend - oldstart); // If oldstart >= oldend, then there's no compression to be done. // if (oldstart >= oldend ) { if (internal_trace) fprintf (stderr, " [ Zero-length compression string... don't do this! ]\n"); return (0); } if (oldtext != tdw->filetext) { fatalerror5 (" Request to compress non-TDW data. This is bogus. 
", " Please file a bug report", CRM_ENGINE_HERE); return ( 0 ); }; // Look one character further to before and after; //if (oldstart > 3) oldstart --; //if (oldend < data_window_size - 1) oldend ++; for (j = 0; j < vht_size; j++) { if (vht[j] // is this slot in use? && vht[j]->valtxt == tdw->filetext // Note that being part of :_iso: does NOT exclude from reclamation && 0 != strncmp (&vht[j]->nametxt[vht[j]->nstart], ":_iso:", 6 )) { // for convenience, we get nice short names: newstart = vht[j]->vstart - 1; newend = newstart + vht[j]->vlen + 2; // leave some space no matter what... if (newend < newstart + 2) newend = newstart + 2; // 6 Possible cases: // dead zone entirely before current var // dead zone entirely after current var // dead zone entirely inside current var // dead zone overlaps front of current var // dead zone overlaps back of current var // dead zone split by current var // // 1: dead zone entirely before current var // // // // if ( oldend <= newstart) { // nothing to be done here - not overlapping goto end_of_vstring_tests; }; // 2: dead zone entirely after current var // // // // if ( newend <= oldstart ) { // nothing to be done here - not overlapping goto end_of_vstring_tests; }; // If we get this far, the dead zone in some way overlaps with // our current variable. // 3: dead zone entirely inside a currently live var // // // // // So we terminate this procedure (nothing can be reclaimed) // if (oldstart >= newstart && oldend <= newend) { // the dead zone is inside a non-dead var, so // we can terminate our search right now. if ( internal_trace) fprintf (stderr, " [ Compression not needed after all. ]\n"); return ( 0 ); }; // 4: dead zone overlaps front of current var; we trim the // dead zone to not include the current var. // // // // if ( oldstart < newstart && oldend <= newend ) { // The dead zone should not include the part that's // also new variable. So, we clip out the part // that's still active. 
if ( internal_trace) fprintf (stderr, " [ Trimming tail off of compression. ]\n"); // // newstart is a "good" char, but since oldend is // noninclusive, this is right. oldend = newstart; goto end_of_vstring_tests; }; // 5: dead zone overlaps back of current var; trim the front off // the dead zone. // // // // if (newstart <= oldstart && newend <= oldend) { if (internal_trace) fprintf (stderr, " [ Trimming head off of compression. ]\n"); // // Newend is the first char that ISN'T in the var, so this // is correct. oldstart = newend ; goto end_of_vstring_tests; }; // 6: dead zone split by current var - the dead zone is actually // split into two distinct pieces. In this case, we need to // recurse on the two pieces. // // // // if ( oldstart <= newstart && newend <= oldend ) { if (internal_trace) { fprintf (stderr, " [ Compression split ]\n"); fprintf (stderr, " [ First part will be %ld to %ld .]\n", oldstart, newstart); fprintf (stderr, " [ Second part will be %ld to %ld .]\n", newend, oldend); }; // // Tricky bit here - we have to do the aft (ne-oe // section) first, so we don't move the os-ns // section offsets. // // was newend - 1, but should be same as case 3 // above (dead zone overlaps tail) #ifdef RECLAIM_ALL_EVERY_TIME reclaimed = crm_recursive_compress_tdw_section (oldtext, newend, oldend); reclaimed +=crm_recursive_compress_tdw_section(oldtext, oldstart, newstart); #else // ! RECLAIM_ALL_EVERY_TIME reclaimed = crm_compress_tdw_section (oldtext, newend, oldend); reclaimed +=crm_compress_tdw_section(oldtext, oldstart, newstart); #endif // ! RECLAIM_ALL_EVERY_TIME // Return here instead of executing common slice-and-splice // tail, because each of our recursive children will do // that for us. 
return (reclaimed); } } // and the semicolon to keep some compilers happy end_of_vstring_tests: ; // Now, repeat with the name string - all name strings are protected if (vht[j] && vht[j]->nametxt == tdw->filetext) { newstart = vht[j]->nstart - 1 ; newend = newstart + vht[j]->nlen + 2; // leave some space no matter what... if (newend < newstart + 4) newend = newstart + 2; // Possible cases: // dead zone entirely before current var // dead zone entirely after current var // dead zone entirely inside current var // dead zone overlaps front of current var // dead zone overlaps back of current var // dead zone split by current var // // dead zone entirely before current var // // // // OK if ( oldend <= newstart) { // nothing to be done here - not overlapping goto end_of_nstring_tests; }; // dead zone entirely after current var // // // // if ( newend <= oldstart ) { // nothing to be done here - not overlapping goto end_of_nstring_tests; }; // If we get this far, the dead zone in some way overlaps with // our current variable. // dead zone entirely inside a currently live var // // // // // So we terminate this procedure (nothing can be reclaimed) // if (oldstart >= newstart && oldend <= newend) { // the dead zone is inside a non-dead var, so // we can terminate our search right now. if ( internal_trace) fprintf (stderr, " [ Compression not needed after all. ]\n"); return ( 0 ); }; // dead zone overlaps front of current var; we trim the // dead zone to not include the current var. // // // // if ( oldstart < newstart && oldend <= newend ) { // The dead zone should not include the part that's // also new variable. So, we clip out the part // that's still active. if ( internal_trace) fprintf (stderr, " [ Trimming tail off of compression. ]\n"); // // newstart is a "good" char, but since oldend is // noninclusive, this is right. oldend = newstart; goto end_of_nstring_tests; }; // dead zone overlaps back of current var; trim the front off // the dead zone. 
// // // // if (newstart <= oldstart && newend <= oldend) { if (internal_trace) fprintf (stderr, " [ Trimming head off of compression. ]\n"); // // Newend is the first char that ISN'T in the var, so this // is correct. oldstart = newend ; goto end_of_nstring_tests; }; // dead zone split by current var - the dead zone is actually // split into two distinct pieces. In this case, we need to // recurse on the two pieces. // // // // if ( oldstart <= newstart && newend <= oldend ) { if (internal_trace) { fprintf (stderr, " [ Compression split ]\n"); fprintf (stderr, " [ First part will be %ld to %ld .]\n", oldstart, newstart); fprintf (stderr, " [ Second part will be %ld to %ld .]\n", newend, oldend); }; // // Tricky bit here - we have to do the aft (ne-oe // section) first, so we don't move the os-ns // section offsets. // // was newend - 1, but should be same as case 3 // above (dead zone overlaps tail) #ifdef RECLAIM_ALL_EVERY_TIME reclaimed = crm_recursive_compress_tdw_section (oldtext, newend, oldend); reclaimed +=crm_recursive_compress_tdw_section (oldtext, oldstart, newstart); #else // ! RECLAIM_ALL_EVERY_TIME reclaimed = crm_compress_tdw_section (oldtext, newend, oldend); reclaimed +=crm_compress_tdw_section (oldtext, oldstart, newstart); #endif // ! RECLAIM_ALL_EVERY_TIME // Return here instead of executing common slice-and-splice // tail, because each of our recursive children will do // that for us. return (reclaimed); } // and the semicolon to keep some compilers happy end_of_nstring_tests: ; }; }; // // Well, we've now scanned the VHT, and oldstart/oldend are the // actual dead zone (storage that really isn't used). // // So, we can compress this storage out with a slice-and-splice // return how many character cells we were able to reclaim. 
// { long cutlen; // cutlen is supposed to be negative for compress cutlen = oldstart - oldend - 1; if (cutlen > 0) fatalerror5 ("Internal cut-length error in isolated var reclamation.", " Please file a bug report", CRM_ENGINE_HERE); // Future Enhancement - dead zones of some small size should be // allowed to stay. This would speed up WINDOW a lot. (but we // would need to expand the range of oldstart and oldend to // actually reclaim those areas if storage really ran low. // Maybe this should be compile-time or command-line parameter?) if (cutlen < 0) { if (internal_trace) { fprintf (stderr, " [ compression slice-splice at %ld for %ld chars. ]\n", oldstart, cutlen); } crm_slice_and_splice_window (tdw, oldstart, cutlen); if (internal_trace) { fprintf (stderr, " [ new isolated area will be %ld bytes. ]\n", tdw->nchars); }; }; return (- (cutlen)); }; } // // Destructive alteration of a preexisting variable, which can be // anywhere. If the variable is not preexisting, we create it and // toss a nonfatal error. // void crm_destructive_alter_nvariable (char *varname, long varlen, char *newstr, long newlen) { long i; long vhtindex, oldlen, delta; // get the first variable name and verify it exists. // GROT GROT GROT this should use nextword!!! i = 0; while (varname[i] < 0x021 && i < varlen) i++; vhtindex = crm_vht_lookup (vht, &(varname[i]), varlen); if (vht[vhtindex] == NULL) { // IGNORE FOR NOW nonfatalerror5 (" Attempt to alter the value of a nonexistent " "variable, so I'm creating an ISOLATED variable. " "I hope that's OK. The nonexistent variable is: ", &(varname[i]), CRM_ENGINE_HERE); crm_set_temp_var (&varname[i], ""); }; // make enough space in the input buffer to accept the new value oldlen = vht[vhtindex]->vlen; delta = newlen - oldlen; mdw = NULL; if (tdw->filetext == vht[vhtindex]->valtxt) mdw = tdw; if (cdw->filetext == vht[vhtindex]->valtxt) mdw = cdw; // GROT GROT GROT get rid of this if we go to MAPped file vars. 
if (mdw == NULL) { fatalerror5 (" Bogus text bloc containing variable : ", varname, CRM_ENGINE_HERE); goto bailout; }; // if (user_trace) // major debug { long i; // fprintf (stderr, "\n surgery on the var %s\n ", varname); fprintf (stderr, " surgery on the var >"); for (i = 0; i < varlen; i++ ) fprintf (stderr, "%c", varname[i]); fprintf (stderr, "<\n"); //fprintf (stderr, "new value is: \n***%s***\n", newstr); fprintf (stderr, " new value is ***>"); for (i = 0; i < newlen; i++ ) fprintf (stderr, "%c", newstr[i]); fprintf (stderr, "<***\n"); } // slice and splice the mdw text area, to make the right amount of // space... crm_slice_and_splice_window (mdw, vht[vhtindex]->vstart, delta); // // Zap the mstart and mlen markers so that searches are reset to start // of the variable. Note that we have to do this _after_ we slice // and splice, otherwise we mangle our own mstart and mlen. vht[vhtindex]->mstart = vht[vhtindex]->vstart; vht[vhtindex]->mlen = 0; // // now we have space, and we can put in the characters from // the new pattern memmove (&(mdw->filetext[vht[vhtindex]->vstart]), newstr, newlen); // semicolon (null stmt) on next line to keep some compilers happy: // bailout: ; }; // Surgically lengthen or shorten a window. The window pointed // to by mdw gets delta extra characters added or cut at "where". // (more precisely, just _before_ "where" - the insert/delet // point is just before the "where'th" character, and the // where'th character will be the first one moved. If the // allocated length is not enough, additional space can be // malloced. Finally, the vht is fixed up so everything still // points "correctly". // void crm_slice_and_splice_window ( CSL_CELL *mdw, long where, long delta) { char *taildest; char *tailsrc; long taillen; // these are to keep the compiler quiet. 
taildest = NULL; tailsrc = NULL; taillen = 0; if (delta + mdw->nchars > data_window_size - 10) { fatalerror5 (" Data window trying to get too long.", " Try increasing the data window maximum size.", CRM_ENGINE_HERE); goto bailout; }; if (delta == 0) { if (internal_trace) { fprintf (stderr, " zero delta, no buffer hackery required\n"); }; return; }; // bump chars in input window delta places if (internal_trace) { fprintf (stderr, "moving text in window %lx,", (long int) mdw->filetext); fprintf (stderr, " starting at %ld, ", where); fprintf (stderr, "delta length is %ld\n", delta); }; if (delta > 0) { // lengthening alteration... taildest = &(mdw->filetext[where + delta]); tailsrc = &(mdw->filetext[where]); taillen = mdw->nchars - where; }; if (delta < 0) // shortening alteration { taildest = &(mdw->filetext[where]); tailsrc = &(mdw->filetext[where - delta]); // delta is minus already!! taillen = mdw->nchars - where + delta; // taillen = mdw->nchars + 1 - where; } if (internal_trace) fprintf (stderr, "buffer sliding, tailsrc: %lx, taildest: %lx, length: %ld\n", (long int) tailsrc, (long int) taildest, taillen); // and move the actual data if (taillen + 1 > 0) memmove ( taildest, tailsrc, taillen + 1 ); // update the length of the window as well. mdw->nchars = mdw->nchars + delta; // and update all of our captured variables to have the right ptrs. crm_updatecaptures (mdw->filetext, where, delta); bailout: // GROT GROT GROT // The following bit of absolutely meaningless code is just there // so that some versions of the C compiler don't complain. It does // nothing. { delta = 0; } } // allow_data_window_to_grow #ifdef no_dont_do_this_yet // Grow the window to hold the incoming text, if needed. // Grow it by 4x each time. while (delta + mdw->nchars > data_window_size - 1) { char *ndw; long odws, i; odws = data_window_size; data_window_size = 4 * data_window_size; nonfatalerror5 (" Data window trying to get too long.", " increasing data window... 
", CRM_ENGINE_HERE); ndw = (char *) malloc ( data_window_size); if (!ndw) untrappableerror5("Couldn't malloc ndw. This is bad too.\n","", CRM_ENGINE_HERE); // now copy the old data window into the new one memmove (ndw, mdw->filetext, odws); // and update the outstanding pointers, like the ones in the // vht... for (i = 0; i < vht_size; i++) if (vht[i] != NULL) { if (vht[i]->nametxt == mdw->filetext) vht[i]->nametxt = ndw; if (vht[i]->valtxt == mdw->filetext) vht[i]->valtxt = ndw; }; // and lastly, point the cdw or tdw to the new larger window. free (mdw->filetext); mdw->filetext = ndw; }; #endif // no_dont_do_this_yet // // crm_vht_lookup - given a char *start, long len, varnam // finds and returns the vht index of the variable // or the index of the appropriate NULL slot to put // the var in, if not found. long crm_vht_lookup (VHT_CELL **vht, char *vname, long vlen) { unsigned long hc; unsigned long i, j, k; int done; long vsidx; long vslen; j = 0; // just so J is used. // Consistency scan - look for those null varnames! Do this every // time! 
if (1) { long i, j; long corrupted; for (i = 0; i < vht_size; i++) { corrupted = 0; if (vht[i] != NULL && vht[i]->nlen < 2) fprintf (stderr, "Short length %ld ", i); if (vht[i] !=NULL && vht[i]->nlen > 1) { if (vht[i]->nametxt[vht[i]->nstart] != ':') { fprintf (stderr, "Ztart corrupted "); corrupted = 1; }; if (vht[i]->nametxt[vht[i]->nstart + vht[i]->nlen - 1] != ':') { fprintf (stderr, "Zend corrupted "); corrupted = 1; }; if (corrupted) { fprintf (stderr, " i %ld len %ld name = -", i, vht[i]->nlen ); for (j = 0; j < vht[i]->nlen; j++) fprintf (stderr, "%c", vht[i]->nametxt[vht[i]->nstart + j]); fprintf (stderr, "- "); } }; } }; crm_nextword ( vname, vlen, 0, &vsidx, &vslen); if (internal_trace) { fprintf (stderr, " variable len %ld, name is -", vslen); for (k = vsidx; k < vsidx+vslen; k++) fprintf (stderr, "%c", vname[k]); fprintf (stderr, "- .\n"); }; hc = (strnhash ( &vname[vsidx], vslen)) % vht_size; // go exploring - find either an empty cell (meaning that this // is the first time this variable name has been entered into the // vht) or find the variable already entered. Or find that we've // gone the whole way 'round the vht, in which case the vht is full // and we should print ut a message and fatal error away (or maybe // even build a bigger vht?) i = hc; // consider a "wrap" to have occurred if we even think about // the slot just before the hashcoded slot done = 0; while ( ! done ) { // is there anything here yet? if (vht[i] == NULL) { if (internal_trace) { int ic; fprintf (stderr, " var "); for (ic = 0; ic < vlen; ic++) fprintf (stderr, "%c", vname[ic]); fprintf (stderr, "(len %ld) not at %ld (empty)\n", vlen, i); fprintf (stderr, "Returning the index where it belonged.\n"); }; return (i); }; // there's something here - is it what we have been seeking if ( vlen == vht[i]->nlen && memcmp (&((vht[i]->nametxt)[vht[i]->nstart]), vname, vlen) == 0) { // Yes, we found it. 
if (internal_trace) { int ic; fprintf (stderr, " var '"); for (ic = 0; ic < vht[i]->nlen; ic++) fprintf (stderr, "%c", (vht[i]->nametxt)[ic+vht[i]->nstart] ); fprintf (stderr, " (len %ld) found at %ld (", vlen, i); if (vht[i]->valtxt == cdw->filetext) { fprintf (stderr, "(main)"); } else { fprintf (stderr, "(isol)"); }; fprintf (stderr, " s: %ld, l:%ld)\n", vht[i]->vstart, vht[i]->vlen); }; return (i); } else { if (internal_trace) { int ic; fprintf (stderr, "\n Hash clash (at %ld): wanted %s (len %ld)", i, vname, vlen); fprintf (stderr, " but found '"); for (ic = 0; ic < vht[i]->nlen; ic++) fprintf (stderr, "%c", (vht[i]->nametxt)[ic+vht[i]->nstart] ); fprintf (stderr, "' instead."); }; }; i++; // check wraparound if (i >= vht_size) i = 0; // check for hash table full - if it is, right now we // do a fatal error. Eventually we should just resize the // hash table. Even better- we should keep track of the number // of variables, and thereby resize automatically whenever we // get close to overflow. if (i == (hc - 1)) { static char badvarname [MAX_VARNAME]; strncpy (badvarname, &vname[vsidx], vslen); badvarname[vslen+1] = 0; { long index; fprintf (stderr, "Variable Hash Table Dump\n"); for (index = 0; index < vht_size; index++) { int ic; fprintf (stderr, " var '"); for (ic = 0; ic < vht[index]->nlen; ic++) fprintf (stderr, "%c", (vht[index]->nametxt) [ic+vht[index]->nstart] ); fprintf (stderr, "'[%ld] found at %ld (", vht[index]->nlen, index); if (vht[index]->valtxt == cdw->filetext) { fprintf (stderr, "(main)"); } else { fprintf (stderr, "(isol)"); }; fprintf (stderr, " s: %ld, l:%ld)\n", vht[index]->vstart, vht[index]->vlen); ; } }; fatalerror5 (" Variable hash table overflow while looking " "for variable: " , badvarname, CRM_ENGINE_HERE); done = 1; return (0); }; }; return (0); } // // crm_setvar - set the value of a variable into the VHT, putting a // new cell in if necessary. Note that this ONLY modifies the VHT // data itself. 
It does NOT do any of the background work like // copying data at all, copying varnames into the tdw, keeping track // of the cdw and tdw usage, etc. // void crm_setvar (char *filename, int filedesc, char *nametxt, long nstart, long nlen, char *valtxt, long vstart, long vlen, long linenumber, long lazy_redirects) { int i, j; // some indices to bang on // first off, see if the variable is already stored. i = crm_vht_lookup (vht, &(nametxt[nstart]), nlen); if (vht[i] == NULL) { // Nope, this is an empty VHT slot // allocate a fresh, empty VHT cell vht[i] = (VHT_CELL *) malloc (sizeof (VHT_CELL)); if (!vht[i]) untrappableerror5("Couldn't malloc space for VHT cell. We need VHT cells for variables. We can't continue.","", CRM_ENGINE_HERE); // fill in the name info data vht[i]->filename = filename; vht[i]->filedesc = filedesc; vht[i]->nametxt = nametxt; vht[i]->nstart = nstart; vht[i]->nlen = nlen; vht[i]->vstart = 0 ; vht[i]->vlen = 0;\ vht[i]->lazy_redirects = lazy_redirects; // and now that the slot has proper initial information, // we can use the same code as is used in an update to do // the initial setting of values. This is good because // if we someday change the way variable values are stored, // we need change it only in one place. } else { // The cell is already here. :) }; // Either way, the cell is now here, so we can set the value. 
// vht[i]->valtxt = valtxt; vht[i]->vstart = vstart; vht[i]->vlen = vlen; vht[i]->mstart = vstart; vht[i]->mlen = 0; vht[i]->linenumber = linenumber; vht[i]->lazy_redirects = lazy_redirects; if(internal_trace) { j = 0; fprintf (stderr, " Successful set value of "); //for (j = 0; j < vht[i]->nlen; j++) // fprintf (stderr, "%c", vht[i]->nametxt[vht[i]->nstart+j]); dontcare = fwrite (&(vht[i]->nametxt[vht[i]->nstart]), vht[i]->nlen, 1, stderr); fprintf (stderr, " at vht entry %d ", i); fprintf (stderr, " with value -"); // for (j = 0; j < vht[i]->vlen; j++) // fprintf (stderr, "%c", vht[i]->valtxt[vht[i]->vstart+j]); dontcare = fwrite (&(vht[i]->valtxt[vht[i]->vstart]), vht[i]->vlen, 1, stderr); fprintf (stderr, "- (start %ld, length %ld)", vht[i]->vstart, vht[i]->vlen); fprintf (stderr, "and %ld lazy redirects", vht[i]->lazy_redirects); fprintf (stderr, "\n"); }; } // look up what the line number is of a variable. // long crm_lookupvarline (VHT_CELL **vht, char *text, long start, long len) { int i; // some indices to bang on i = crm_vht_lookup (vht, &(text[start]), len); // GROT GROT GROT // We should check here for GOTOing a label that isn't in // the current file (i.e. the equivalent of a C "longjmp"). if (vht[i] != NULL) { // Yes, we found it. Return the line number if (internal_trace) fprintf (stderr, " looked up ... line number %ld\n", vht[i]->linenumber); return (vht[i]->linenumber); } else { // long q; // char *deathfu ; // deathfu = (char *) malloc ( len+10); // if (!deathfu) // untrappableerror("Couldn't malloc 'deathfu'.\n Time to die. ",""); // strncpy (deathfu, &(csl->filetext[start]), len); // q = fatalerror ("Control Referencinge a non-existent variable- this" // "is almost always a very _bad_ thing", // deathfu); // If fatalerror found a TRAP for this error, cstmt now points to // the TRAP - 1. We want to go to the trap itself, no auto-incr... 
// if ( q == 0) // return ( csl->cstmt + 1); }; return (-1); } // Update the start and length of all captured variables whenever // a buffer gets mangled. Mangles are all expressed in // the form of a start point and a delta. // // Note to the Reader - yes, I consider the nonlinearity of this // function to be a grossitude. Not quite an obscenity, but definitely // a wart. void crm_updatecaptures (char *text, long loc, long delta) { long vht_index; long i; long ostart = 0, oend = 0 ; long nstart = 0, nend = 0 ; if (internal_trace) fprintf (stderr, "\n updating captured values start %ld len %ld \n", loc, delta); // check each VHT entry for a need to relocate for (vht_index = 0; vht_index < vht_size; vht_index++) { // is this an actual entry? if (vht[vht_index] != NULL) { if (vht[vht_index]->valtxt == text) { // start of valtext block check // value text area if (internal_trace > 1) { fprintf (stderr, "\n checking var "); for (i = 0; i < vht[vht_index]->nlen; i++) fprintf (stderr, "%c", vht[vht_index]->nametxt[vht[vht_index]->nstart+i]); fprintf (stderr, " "); fprintf (stderr, " s: %ld, l:%ld, e:%ld n:%ld ~ %ld ...", vht[vht_index]->vstart, vht[vht_index]->vlen, vht[vht_index]->vstart+vht[vht_index]->vlen, vht[vht_index]->nstart, vht[vht_index]->nstart + vht[vht_index]->nlen ); }; ostart = nstart = vht[vht_index]->vstart; oend = nstart = ostart + vht[vht_index]->vlen; nstart = crm_mangle_offset (ostart, loc, delta, 0); nend = crm_mangle_offset (oend, loc, delta, 1); if (internal_trace) fprintf (stderr, "\n index %ld vstart/vlen upd: %ld, %ld ", vht_index, vht[vht_index]->vstart, vht[vht_index]->vlen); vht[vht_index]->vstart = nstart; vht[vht_index]->vlen = nend - nstart; if (internal_trace) fprintf (stderr, "to %ld, %ld.\n", vht[vht_index]->vstart, vht[vht_index]->vlen); // // And do the same for mstart/mlen (match start/length) ostart = vht[vht_index]->mstart; oend = ostart + vht[vht_index]->mlen; nstart = crm_mangle_offset (ostart, loc, delta, 0); nend = 
crm_mangle_offset (oend, loc, delta, 1); if (internal_trace) fprintf (stderr, "\n index %ld mstart/mlen upd: %ld, %ld ", vht_index, vht[vht_index]->mstart, vht[vht_index]->mlen); vht[vht_index]->mstart = nstart; vht[vht_index]->mlen = nend - nstart; if (internal_trace) fprintf (stderr, "to %ld, %ld.\n", vht[vht_index]->mstart, vht[vht_index]->mlen); }; // Don't forget entries that may be varNAMES, not just // var values! if (vht[vht_index]->nametxt == text) { long orig_len; // // Same thing here... // ostart = nstart = vht[vht_index]->nstart; orig_len = vht[vht_index]->nlen; oend = nend = ostart + orig_len; if (orig_len == 0) fprintf (stderr, "CRUD on %ld", vht_index); nstart = crm_mangle_offset (ostart, loc, delta, 0); nend = crm_mangle_offset (oend, loc, delta, 1); if (oend - ostart != orig_len) fprintf (stderr, "Length change on %ld! Was %ld, now %ld ", vht_index, orig_len, oend-ostart); if (internal_trace) fprintf (stderr, "\n index %ld nstart/nlen upd: %ld, %ld ", vht_index, vht[vht_index]->nstart, vht[vht_index]->nlen); vht[vht_index]->nstart = nstart; vht[vht_index]->nlen = nend - nstart; if (internal_trace) fprintf (stderr, "to %ld, %ld.\n", vht[vht_index]->nstart, vht[vht_index]->nlen); } } } if (internal_trace) fprintf (stderr, "\n end of updates\n"); } // // How to calculate the new offsets of the start and end // (that is, a "mark"), given a location (dot) and a delta of that // location. Dot doesn't move... only mark does. // // sl is Start v. End - do we treat this mangle as altering the // _start_ of a var, or the _end_ ? (this is because we don't move // a Start if Dot is the same, but we do move an End. 
Alternatively, // this is "is "dot considered to be before or after a mark with the // same value) // long crm_mangle_offset ( long mark, long dot, long delta, long sl) { long absdelta; absdelta = delta; if (absdelta < 0) absdelta = -absdelta; if (sl == 0) { // HOW WE DEAL WITH START POINTS // (that is, "dot" is considered to follow "mark") // // are we earlier than dot? If so, we can't be changed by dot. // // edge condition for start: // // Mark ==> Mark // Dot Dot // if (mark <= dot) return (mark); // are we beyond the reach of dot and delta? If so, we just slide. // // edge condition: // // Mark ==> Mark // Dot+Delta Dot // if ((dot + absdelta) < mark ) return (mark + delta); // Neither - we're in the range where dot and mark can affect us // // If delta is positive, we can just slide further out. if (delta > 0) return (mark + delta); // // but, if delta is negative (a deletion) then we can move toward // dot, but not earlier than dot. mark = mark + delta; // delta is negative, so we ADD it to subtract! if (mark < dot) mark = dot; return (mark); } else { // HOW WE DEAL WITH END POINTS // (that is, "dot" is considered to be in front of "mark") // // are we earlier than dot? If so, we can't be changed by dot. // // edge condition for finish points: // // Mark ==> Mark // Dot Dot // if (mark < dot) return (mark); // are we beyond the reach of dot and delta? If so, we just slide. // // edge condition: // // Mark ==> Mark // Dot+Delta Dot // if ((dot + absdelta) <= mark ) return (mark + delta); // Neither - we're in the range where dot and mark can affect us // // If delta is positive, we can just slide further out. if (delta > 0) return (mark + delta); // // but, if delta is negative (a deletion) then we can move toward // dot, but not earlier than dot. mark = mark + delta; // delta is negative, so we ADD it to subtract! if (mark < dot) mark = dot; return (mark); }; } /// // // crm_buffer_gc - garbage-collect a buffer. 
This isn't a perfect // solution, but it will work. (i.e. it's slow and annoying)// // // The algorithm: // - find the lowest index currently used (takes 1 pass thru VHT) // - find the highest user of that index (takes 1 pass thru VHT) // * - see if any block overlaps that block // - find the next lowest starting block // int crm_buffer_gc ( CSL_CELL *zdw) { fprintf (stderr, "Sorry, GC is not yet implemented"); exit (EXIT_FAILURE); return (0); } crm114-20100106-BlameMichelson.src/paolo_ov2.crm0000755000000000017500000000653311321154266017412 0ustar rootwsy#! /usr/bin/crm # # paolo_ov2.crm - paolo test script for MIME messages # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. #On Fri, Aug 26, 2005 at 03:14:54PM +0200, Hilko Bengen wrote: #> I am running Debian/unstable on an x86 box with version #> 20050721-BlameNeilArmstrong and have encountered a segmentation fault #> while working on some code that digests MIME messages. I have also #> reproduced the problem with 20050415.BlameTheIRS installed on a #> Debian/sarge box. #> #> The script (with part of a spam mail included into the code) is #> attached at the end of the message. #> #> crm114 apparently only crashes if :headers: is ISOLATEd between the #> first and the second MATCH and if :headers: is then re-used as a #> target in the last MATCH. 
#> #> Cheers, #> -Hilko #> window alter (:_dw:) /Return-path: \nEnvelope-to: bengen@hilluzination.de\nDelivery-date: Mon, 08 Aug 2005 12:09:51 +0200\nReceived: from quechua.inka.de\n ([193.197.184.2] helo=mail.inka.de ident=mail)\n by paranoia with esmtp (Exim 4.50)\n id 1E24Zm-0008Pl-Bq\n for bengen+bengen.vdst-ka.inka.de@hilluzination.de; Mon, 08 Aug 2005 12:09:51 +0200\nReceived: from adsl-67-118-169-226.dsl.lsan03.pacbell.net (adsl-67-118-169-226.dsl.lsan03.pacbell.net [67.118.169.226])\n by mail.inka.de with smtp \n id 1E24Ze-0001qC-00; Mon, 08 Aug 2005 12:09:39 +0200\nFCC: mailbox:\/\/tteqq@yahoo.com\/Sent\nX-Identity-Key: id1\nDate: Mon, 08 Aug 2005 04:11:18 -0700\nFrom: International Medical Corporation \nX-Accept-Language: en-us, en\nMIME-Version: 1.0\nTo: bengen@vdst-ka.inka.de\nSubject: You have some free time? - Use it to make yourself wealthier!\nContent-Type: multipart\/related; boundary="------------000708090009050006030006"\nMessage-Id: \nX-CRM114-Version: 20050415.BlameTheIRS ( TRE 0.7.2 (GPL) ) MF-A10FFB4C\nX-CRM114-Status: SPAM ( pR: -149.1499 )\nContent-Length: 21549\n\nThis is a multi-part message in MIME format.\n--------------000708090009050006030006\nContent-Type: text\/html; charset=us-ascii\nContent-Transfer-Encoding: 7bit\n\n<\/head>

<\/a><\/p>

The Sims in 1960 Martha Stewart in 1948<\/font><\/p>

What number? Music<\/font><\/p><\/body><\/html>\n\n--------------000708090009050006030006\nContent-Type: image\/gif;\n name="clonic.GIF"\nContent-Transfer-Encoding: base64\nContent-ID: \nContent-Disposition: inline;\n filename="clonic.GIF"\n\nR0lGODlh1wIhAvH7AAYGAAAA\/\/\/\/\/wAAACH5BAQAAAAALAAAAADIAhYCAAL\/lI+py+0Po5y02ouz3rz7D4biSJbmiabqyrbuC8fyTNf2jef6zvf+DwwKh8Si8YhMKpfMpvMJjUqn1Kr1WgUktC8AV8H9PrxIMVgS\nxpgj3rWgDX833Q26zC7fHunxKB778\/VXgpcGMehTiHaByCBYh9AYSCFpUKl2ZunkJnYJ+Nl14OmhmDem\n...more garbage follows...\n\n--------------000708090009050006030006--\n\n\n/ { match (:: :headers: :body:) /(.*?)\n\n(.*)/ } isolate (:headers:) { match [:headers:] /^Content-Type: .* boundary="(.+)"/ (:: :boundary:) } { match [:body:] (:: :headers:) /\n--:*:boundary:\n(.+?)\n\n/ output /:*:headers:\n\n/ liaf } crm114-20100106-BlameMichelson.src/windowtest.crm0000755000000000017500000000206111321154266017711 0ustar rootwsy#! /usr/bin/crm # # windowtest.crm - testing windonwing on windows nad variables # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window { output /:*:_nl: CRM114: testing windowing on windows and variables :*:_nl: :*:_nl:/ output /:*:_nl: test one- input by chars, delimited by 'A' / window /A/ /A/ match <> (:big:) /.*/ output /:*:_nl: Got: :*:_nl: :*:big:/ output /:*:_nl: test two- input by EOFs, delimited by 'A' / window /A/ /A/ match <> (:big:) /.*/ output /:*:_nl: Got: :*:_nl: :*:big:/ output /:*:_nl: test three- window an isolated var by chars, delimited by 'A' / isolate (:frib:) / *** this is the initial value *** / window <> (:frib:) /A/ /A/ match <> [:frib:] (:big:) /.*/ output /:*:_nl: Got: :*:_nl: :*:big:/ output /:*:_nl: test four- isolated var, input by EOFs, delimited by 'A' / window (:frib:) /A/ /A/ match <> [:frib:] (:big:) /.*/ output /:*:_nl: Got: :*:_nl: :*:big:/ output /:*:_nl: and lastly- did the data window stay constant? 
:*:_nl:/ accept output /:*:_nl:/ } crm114-20100106-BlameMichelson.src/README0000644000000000017500000024616511321154266015673 0ustar rootwsy# # README - master README for CRM114 # # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # Congratulations!!! You got this far. First things first. THIS SOFTWARE IS LICENSED UNDER THE GNU PUBLIC LICENSE IT MAY BE POORLY TESTED. IT MAY CONTAIN VERY NASTY BUGS OR MISFEATURES. THERE IS NO WARRANTY. THERE IS NO WARRANTY WHATSOEVER! A TOTAL, ALMOST KAFKA-ESQUE LACK OF WARRANTY. Y O U A R E W A R N E D ! ! ! Now that we're clear on that, let's begin. ----- News This Release: ----- August 07, 2009 - BlameThorstenAndJenny We now have a new SVM algorithm in place. This algorithm uses Thorsten Joachims' structural SVM to achieve O(n) runtimes in the solver, and can solve 10,000 examples in about 5 minutes (assuming you APPEND them all and then do one FROMSTART). It also does incremental training if you do _not_ specify FROMSTART (as before, APPEND doesn't run the solver, it assumes you will be adding more examples before needing to classify). As we're shifting to the new libcrm114 style of one-file-many-classes, the calling sequence for SVM (and _only_ SVM) has changed; instead of the previous many-file situation, you specify ONE file for both positive and negative examples (you learn negative examples with the REFUTE flag - there is now an ERASE flag to forget things). For a (screamingly fast) demo of the SVM, run (and read) the alternating_example_svm.crm demo file, which shows using APPEND and the new single-file-many-classes format. Naturally, old SVM files (both the data files and the "versus" file) are utterly incompatible and you will get nothing but ridicule if you try to use them here. March 27, 2009 - BlameSteveJobs This is a large set of changes. Old .css files will not be compatible with this software, and must be rebuilt. The code is now fully 64-bit compatibile. 
Support GNU regex has been entirely discarded. The copyright has been updated to GPLv3, except for those functions moved to the library, which are copyrighted as LGPLv3 to allow use in non-GPL applications. March 26, 2008 - BlameSentansoken This release is basically a bugfix release and includes some (but not all) fixes to the Hyperspace classifier, the vector tokenizer (VT), and the neural network. Many thanks to Paolo, Ger, and the other mailing list denizens who found the bugs. The neural network can now learn nonlinear problems in reasonable times (such as the classic XOR problem). It also includes a new example - alternating_example_neural.crm. This is a good example of how to turn a single big example file into a number of small ones (which is recommended). If you run it as "crm -t alternating_example_neural.crm" you also get an amusing look at the neural net as it trains (and no, I did not intentionally make it into a "The Matrix" style output- that was actually the natural way to express the state of the training). Upgrade to this version only if you are using Hyperspace or the Neural Network. Feb 13, 2008 - BlameJoeLangeway_VT This release includes an improved vector tokenizer (that is, VT), VT in the Hyperspace classifier, and a VT-enabled and much-improved neural net as well. Bug fixes include the pR=5.0 change to work versus microspams better, as well as some other bugfixes in both the code and the test set. The neural network is now 8192 x 8 x 8 x 2 (that is 8192 retina slots feeding 8 input neurons; each of the input neurons feeds all 8 of the hidden layer neurons, all 8 hidden layer neurons feed both of the two output neurons) for minimum disk footprint of just over half a megabyte before you staart adding documents. For large data sets you should expand the network with -s NNN where NNN is the number of retina slots (the rest of the net will scale reasonably; the minimum is 1024 retina slots feeding 4 x 4 x 2). 
The algorithm now uses both backpropagation and stochastic updates to avoid getting caught in local minima. As before, the neural net works best if you train in both in-class and out-of-class (i.e. REFUTE) examples. Default is to update weights only on the end of each training loop epoch; use BYCHUNK to update after each example document. APPEND turns off autotraining after a LEARN. The default cycle limits for LEARN are 5000 cycles if FROMSTART is used and 250 cycles incrementally (experimentally determined to work OK). To run another 250 cycles, just LEARN with a null text to learn- this won't add a document but _will_ run 250 more backprop cycles. One trick that's not in the C code yet but that users can implement in the CRM114 program is to alternate positive and negative chunks of example text. By alternating examples, the BYCHUNK epoch learner can converge on this rather difficult problem in not-unreasonable time (an hour or three, depending on your CPU). Running the neural net with -t can be illuminating; each line starts with E (for epoch) and then the epoch number. After that, each "." means one chunk of text that was successfully classified "strongly" (3:1 neural response ratio). A # means a positive (in-class) example that was classified as out of class, while a + means positive that was classified positive but not strongly; an X means a negative (out-of-class) example that was classified as a positive example, and an x (lower case!) as an out-of-class example that was classified "correctly" but not strongly so. It's rather fun to watch the system as it learns the small differences between two very similar files (the INTRO and QUICKREF files, taken in chunks of 1Kbyte+ characters rather than as two big chunks as done in megatest. This takes a while, versus the same problem run as only two examples, which runs in about fifteen seconds.) May 5, 2007 - BlameSpamConf This is a mixed enhancement/bugfix release. 
You can now use INSERT localfile and also INSERT [var_expanded_file] ; note that the expansion only uses command-line and environment variables such as :_env_HOME: (this isn't run-time expansion). The algebraic math evaluator has now been switched over to a much nicer system. Several other bugs are also stomped. Feb 16, 2007 - BlameBaltar This is a bugfix version. The much-maligned :datadir: is now gone; everything is expected to be in either the :fileprefix: or the :mailreaver: directories (don't combine them, otherwise if you reave old files out of the reaver cache by age, your configuration files will be the first to die!). You can now use start/len on INPUT on stdin. Arithmetic now respects >=, <=, and != . Hyperspace has been de-normalized, entropic classification is now somewhat smarter. Mailreaver now really does put links in for prob_good and prob_spam. Megatest is now more detailed and captures runtime errors better. The "--" args have been removed from mailfilter, mailreaver, and mailtrainer, because of problems they caused for non-X86 Linux users. A PPC GCC compiler bug was found; PPC users need to use at least a 3.4+ GCC and preferably a 4.0++ GCC (sorry!). A cute little bug in SYSCALL got stepped on (mostly), as well as one that affected any program using two or more classifiers. A bad initialization in the bit-entropy classifier was fixed, and entropy disk space required has now been cut by another factor of almost 2. An incorrect result variable parsing in TRANSLATE was fixed. Megatest now runs the local version of crm114 (as built) rather than whatever's in /usr/bin, and the variable :_pgm_text: contains a matchable copy of the program source, post-preprocessing. Matthias's bugfixes are in. The highly experimental Gatling sorter is included, but may exercise bugs (be warned!) November 3, 2006 - BlameDalkey This version is yet further bugfixes. 
It has Paolo's "BSD doesn't have logl() so do this instead" bugfix, as well as the entropy sixfold segfault bugfix, the missing mailfilter.cf backslash bugfix, the DEFAULT doesn't default bugfix, and the bit-entropy FIR-prior threshold is now automagically scaled to work decently for different sized .ben files. The .BEN structure has been cleaned up (uses less disk space at no accuracy cost!), and both Hyperspace and bit-entropy pR values have been "rescaled" so that a pR within +10 to -10 gives good results for thick threshold training (so now you don't have to change thresholds when you change classifiers). The only remaining hitch is that multi-class OSB sometimes gives wierdish results. September 20, 2006 - BlameNico This version is mostly a bugfix version - Hyperspace is now "fixed", and some typos are cleaned up. This also introduces the _very_ experimental bit-entropic classifier. This classifier uses bits as features, forms the Markov model of the features that minimizes entropy of the system, and then uses the compression of that model as the relative gauge of match quality. It's very VERY experimental, but seems to work very well. It's not as fast as Hyperspace or OSB (1/10th as fast), and uses significantly more memory (default is 64 megs per class) but in long runs like TREC06-P at 90+Kmsgs it is about two or three times as accurate. Currently it works best as and trained with SSTTR (single-sided thick threshold training) with a threshold of just 0.03 to 0.04 (because pR for bit-entropic is not yet calibrated to the nominal scale.) Read the long comments at the top of crm_bit_entropy.c to see how the classifier works inside and how it's different from Andrej Bratko's and Gordon Cormack's designs (in short, it uses < 1/50th the memory and never overflows the node list thereby never needing to be dumped and restarted; it also runs much faster because of this). It's still HIGHLY experimental, so use bit-entropy only if you want to be a lab rat. 
:-) The .ben (Bit-ENtropic) files are NOT compatible with .css files or with any other classifier. July 4, 2006 - BlameRobert Mailreaver is working very well; this is a final cleanup release for Mailreaver to go public as "supported and recommended". Robert wins the award for this one as he whacked the sole remaining known bug in the runtime system (and found another nearby!). The only thing I'm contemplating further at this point is putting "autotrain anything in the following region" back in. So, speak now if you find a bug, else this one goes onto the webpage. June 16, 2006 - ButterBeast Release status: Mailreaver is now mostly debugged. No other changes from BlameTheBeast except I don't know of anything that doesn't work now, hence this is the Version of the Beast, with yummy butter added. Lots of little twiddles and documentation fixes, too. (thanks, Regis!) June 6, 2006 - BlameTheBeast Release Status: First testable "mailreaver-by-default" release This is the first test release with a usable (we think) version of mailreaver. As far as the user is concerned, mailreaver is pretty much interchangeable with mailfilter, except that mailreaver by default caches all email, tags all email with an sfid (Spam Filter ID) which is all that's needed to recover the original, un-mangled text, and thus doesn't need intact headers or mail editing in order to train. This will make it much easier to write plugins and use CRM114-based filters with MTAs and MUAs that insist on screwing with the headers. That said, this is *still* an experimental release; be aware of that if you install it. There will be bugs and rough spots; be prepared to endure, discuss, and help solve. [[ the one-character memory leak is still here in crm_var_hash_table.c, if you find it, please let me know!!! 
The bug is completely benign in any but an intentional "tickler" program, and at worst simply forces an error after a few million reclamations, so it is highly unlikely to affect real apps, but it's an ugly wart. -- WSY ]] ------------ HOW TO USE MAILREAVER ( instead of mailfilter ) ----- Mailreaver.crm is the "next generation" mailfilter; use it instead of mailfilter.crm. (and, please note, right now mailreaver.crm is "field test" but the plan is that it will eventually become the recommended production system, and mailfilter.crm will become nothing more than a good-looking corpse.) Mailreaver.crm takes all of the same flags and command lines (or at least it should), and the default is to use cache. It also has the new option --dontstore which means "do NOT put this text into the reavercache". Mailreaver.crm also has the possibility of becoming faster than mailfilter.crm, because it doesn't need to JIT any of the learning code unless learning is actually needed. (future plan: mailreaver will become a "wrapper" program for a set of mail service programs) Mailreaver.crm uses the same old mailfilter.cf configuration file; those things that don't make sense don't get used, and a few new options (like :thickness:) do get used. IT IS RECOMMENDED that you save your old mailfilter.cf file as mailfilter.cf.old, and use the NEW one which has the new options already set up. (defaults are OSB Unique Microgroom for the classifier configuration, a thick threshold of 10 pR units, simple word tokenization, a decision length of 16000 bytes, 0/0/1 exit codes for good/spam/program fault, caching enabled, rewrites enabled, a spam-flag subject string of ADV:, and an unsure flag subject string of UNS:.) The big advantage of mailreaver.crm is that now all mailtraining goes through mailtrainer.crm, which opens the door to some very powerful training techniques. 
Note - if you use the -u option, you must make sure that mailtrainer.crm, maillib.crm and shuffle.crm are in the directory you are -u'ing into. ( --fileprefix is not used for this location). Note 2 - this version does not include a "cache flusher" option; the full text of emails stored in the cache will remain there until you manually delete them; one month is probably OK for emails not in the knownspam or knowngood directory but keep anything in those directories (don't worry, we hardlink to the files on *nix systems, and make copies on Windozen). You can do this cache flushing with a cron job easily enough, if you really need the disk space that badly.. April 22, 2006 - ReaverReturn Release Status: for testing and bug chasing. This release is primarily to verify we've whacked a few bugs in the "css version" and reserved-bucket code, as well as improved the mailtrainer code and documentation. This new version also has the DEBUG statement (drops you immediately to the debugger from program code). It is suggested you install this release ONLY if you have an outstanding issue or bug. If you are currently happy and are not out "looking for a fight", this is not the release for you. Feb 6, 2006 - ReaversSecondBreakfast Release status: Looks good, but has major new (mal)features! This is the transition release to the new "Reaver" format for storing and caching the learned texts. The "Reaver" format keeps all incoming mail in a maildir-like cache (but _not_ your maildir; we don't touch that, and if you don't use maildir, that's just fine). The advantage is that the incoming mail is saved exactly as it needs to be trained, and a CacheID is added to the mail that you finally see. 
As long as the CacheID header is intact, you can just reference this stored copy (assuming it hasn't been purged away) You say "use the cached text" with --cache for command line users as in: bash: crm mailfilter.crm --cache --learnspam < text_with_cacheid_header or (for mail-to-myself users - forward to yourself with the CacheID header intact): command my_secret_password learnspam cache and you will train *exactly* the text should be trained, with no worry as to whether something in the system added headers, deleted suspicious text, or whatever. The "Reaver" cache hardlinks your trained data into "known spam" and "known nonspam" subdirectories, so even when the cache gets purged, you don't lose training data. Note that the :thickness: parameter now is meaningful! Anything scoring less than this amount (plus or minus) is tagged as UNSURE and will be delivered along the "good" mail path with the extra header of "Please train me!" Default thickness is 10 pR units, which gives really good results for me. Another advantage of the Reaver cache is that you can use mailtrainer.crm to run advanced training (like DSTTTR) on the full contents of your "known" and "probably" directories, to get really fast accurate .css files. After testing, we've changed the defaults on mailtrainer to only do one pass of DSTTTR. Notes to MeetTheReavers users who have already used mailtrainer.crm - we've added the --reload option and the default repeat count is now 1 pass (rather than 5). ------ How To Use mailtrainer.crm ----------- Mailtrainer.crm is a bulk mail statistics file trainer. It allows you to feed in bulk training of your own example email and get an optimized filter in a very few minutes - or to test variations if you want to play around with the different classifiers, thickness settings, etc. 
Mailtrainer by default uses whatever settings are in your current mailfilter.cf file, so you'll get .css files that are optimized for your standard setup including mime decoding, normalization, classifier flags, etc. Mailtrainer.crm uses DSTTTTR (Double Sided Thick Threshold Training with Testing Refutation) which is something I didn't come up with (Fidelis is the top of the list of suspects for this). The good news is that this can more than double accuracy of OSB and similar classifiers. I'm seeing better than 99.7% accuracy with mailtrainer's DSTTTTR on the 2005 TREC SA test corpus, with 10-fold validation and the default thick threshold of 5.0 pR units and the classifier set to OSB Unique Microgroom. This is substantially better than any other result I've gotten. Six of the ten runs completed with only ONE error out of the 600 test cases. It is safe to run mailtrainer.crm repeatedly on a .css fileset and training data; if the data doesn't need to be trained in, it won't be. All you will waste is CPU time. The examples need to be one example per file. The closer these files are to what mailfilter.crm will see in real life the better your training will be. Preferably the headers and text will be complete, intact, and unmutilated. The mailtrainer.crm options are as follows. You *must* provide --spamdir and --gooddir; the other flags are optional. Required: --spamdir=/directory/full/of/spam/files/one/per/file --gooddir=/directory/full/of/good/files/one/per/file Optional: --thick=N - thickness for thick-threshold training- this overrides the thickness in your mailfilter.cf file. --streak=N - how many successive correct classifications before we conclude we're done. Default is 10000. --repeat=N - how many times to pass the training set through the DSTTTR training algorithm. --reload - if marked, then whenever either the spam or good mail training set is exhausted, reload it immediately from the full training set. 
The default is no reload- to run alternate texts and when either good or spam texts are exhausted, run only the other type until all of those have been run as well. --reload works a little better for accuracy but takes up to twice as long. --worst=N - run the entire training set, then train only the N worst offenders, and repeat. This is excruciatingly slow but produces very compact .css files. Use this only if your host machine is dreadfully short of memory. Default is not to use worst-offender training. N=5% of your total corpus works pretty well. --validate=regex_no_slashes - Any file with a name that matches the regex supplied is not used for training; instead it's held back and used for validation of the new .css files. The result will give you an idea of how well your .css files will work. Example: Here's an example. We want to run mailtrainer.crm against a bunch of examples in the directory ../SA2/spam/ and ../SA2/good/. Quit when you get 4000 tests in a row correct, or if you go through the entire corpus 5 times. Use DSTTTR, with a training thickness of just .05 pR units. Don't train on any filename that contains a "*3.*" in the filename; instead, save those up and use them as a "test corpus" for N-fold validation, and print out the expected accuracy. For this particular corpus, that's about 10% of the messages. Here's the command: crm mailtrainer.crm --spamdir=../SA2/spam/ --gooddir=../SA2/good/ \ --repeat=5 --streak=4000 --validate=[3][.] --thick=0.05 This will take about eight minutes to run on the SA2 (== TREC SA) corpus of about 6000 messages; 1000 messages a minute is a good estimate for 5 passes of DSTTTTR training. Notes: * If the .css statistics files don't exist, they will be created for you. * If the first test file comes back with a pR of 0.0000 exactly, it is assumed that these are empty .css statistics files, and that ONE file will be trained in, simply to get the system "off center" enough that normal training can occur. 
If there is anything already in the files, this won't happen. * When running N-fold validation, if the filenames are named as in the SA2 corpus, there's an easy trick: use a regex like [0][.] for the first run, [1][.] for the second run, [2][.] for the third, and so on. Notice that this is a CRM114-style regex, and _not_ a BASH-style file globbing as *3.* would be. * If you want to run N-fold validation, you must remember to delete the .css files after each run, otherwise you will not get valid results. * N-fold validation does NOT run training at all on the validation set, so if you decide you like the results, you can do still better by running mailtrainer.crm once again, but not specifying --validate. That will train in the validation test set as well, and hopefully improve your accuracy still more. December 31, 2005 - BlameBarryAndPaolo This is a bugfix/bugchase release; it should remedy or at least yield information on the strange var-name bug that a few people with very-long-running demons have encountered. It also has some bugfixes (especially in the W32 code, from both Barry and JesusFreke) and in the microgroomer, bugfixes, and accuracy improvements. Upgrade is advised if you are having bug problems, otherwise, it's not that big an issue. October 2, 2005 - BlameRaulMiller This is a new-features release. The new feature is the TRANSLATE statement- it is like tr() but allows up- and down- ranges, repeated ranges, inversion, deletion, and uniquing. The Big Book has been updated with TRANSLATE and Asian Language stuff (it shows as "highlighted in yellow" in the PDF but it hasn't been indexed yet..) Next version of the code will have JesusFreke's Windows latest bugfixes, and the fixes to microgrooming (they didn't make this release; sorry) September 10, 2005 - BlameToyotomi This is mostly a documentation/bugfix release. 
New features are that the Big Book is now very close to "final quality", some improvements in speed and orthogonality of the code, bugfixes in the execution engine and in mailfilter.crm, and allowing hex input and output in EVAL math computations ( the x and X formats are now allowed, but are limited to 32-bit integers; this is for future integration of libSVM to allow SVM-based classifiers). Upgrade is recommended only if you're coding your own filters (to get the new documentation) or if you are experiencing buggy behavior with prior releases. July 21, 2005 - BlameNeilArmstrong This release is an upgrade for two new classification options - UNIGRAM and HYPERSPACE. It also contains some bugfixes (including hopefully a bugfix for the mmap error-catching problem), and the new flag DEFAULT for conditionally setting an ISOLATED var only if it hasn't had any value yet. The DEFAULT flag for ISOLATE is designed to be an executable statement, rather than a compile-time default value. If a variable has never been set in the program, ISOLATE will set it; otherwise that ISOLATE statement does nothing. (This is inspired by JesusFreke's patch.) Using UNIGRAM classification effectively turns CRM114 into a normal Bayesian classifier, as UNIGRAM tells the classifiers to use only unigrams (single words) as features. This is so people can do an apples-to-apples comparison of CRM114's Markovian, OSB Winnow, and OSB Hyperspace classifiers versus Typical Bayesian classification and not have to write tons of glue code; just change your classify/learn flag. (note - UNIGRAM does not cause a top-N decision list as used in A Plan For Spam - CRM114 still does not throw anything away) The other (and maybe bigger) news is the new hyperspatial classifier. The hyperspatial classifier is most easily explained by analogy to astronomy. Each known-type example document represents a single star in a hyperspace of about four billion dimensions. Similar documents form clusters of light sources, like galaxies of stars. 
Each class of document is therefore represented by several galaxies of stars (each galaxy being documents that are hyperspatially very similar). The unknown document is an observer dropped somewhere into this four-billion-dimensional hyperspace; whichever set of galaxies appears to be brighter (hence closer) to this observer is the class of the unknown document. What's amazing is that this hyperspace approach is not only faster than Bayesian OSB, it's also more accurate (26 errors total in the final 5000 texts in the SA ten-way shuffle, better even than Winnow) and uses only about 1/40th the disk space of Markovian (300 Kbytes v. 12 Mbytes for the SA test corpus) and about 6 times faster ( 26 seconds versus 3min 11 sec on the same Pentium-M 1.6 GHz for one pass, in "full demonized" mode). It should be pointed out that the fully demonized hyperspace code takes just 26 seconds for the 4147-text SA corpus, which is 6.2 milliseconds per text classified, including time to do all learning. The only downside of Hyperspace math is that it needs single-sided thick threshold training (SSTTT), with a thickness of 0.5 pR units. With straight TOE, it's still barely faster than Markovian but not as accurate. It still only uses 400K per .css file though. Best accuracy and speed on the SA corpus is achieved with only two terms of the OSB feature set- digram and trigram; that's the default for hyperspace, and in the code. The only downside is that this is still an experimental design; use it for fun, not for production, as the file format will undoubtedly change in the future and if you don't keep your training set as individual disjoint documents you'll have to start again. Activate it by using HYPERSPACE as the classify/learn control flag; you can also use UNIQUE and MICROGROOM if you want. As usual, "prime" each of the Hyperspace statistics files by LEARNing a very short example text into each one. This creates the files with the proper headings, etc. 
Alternatively, create a null hyperspace file with the commands: head /dev/zero --bytes=4 > spam.css head /dev/zero --bytes=4 > nonspam.css ---- What YOU Should Do Now ----- Contents: 1) "What Do You Want?" 2) If you want to write programs... 3) How to "make" CRM114 1) "What Do You Want?" **** If you just want to use CRM114 Mailfiltering, print out the CRM114_Mailfilter HOWTO and read _THAT_. Really; we will help a LOT. The instructions in the HOWTO are much more in-depth and up to date than whatever you can glean from here. 2) If you want to write programs, read the introduction file INTRO.txt. That file will get you started. Remember, this is a weird-ass language, you _don't_ understand it yet. (okay, wiseguy, what does a "LIAF" statement do? :-) ) Then, print out and read the QUICKREF.txt (quick reference card). You'll want this by your side as you write code until you get used to the language. 3) CRM114 (as of this writing) does not have a fully functional .config file. There is a beta version, but it doesn't work on all systems. Until that work is finished, you have a couple of recommended options: 1) run the pre-built binary release, or 2) use the pre-built Makefile to build from sources. Caution: if you are building from sources, you should install the TRE regex library ***first***. TRE is the recommended regex library for CRM114 (fewer bugs and more features than Gnu Regex). You will need to give the TRE ./configure the --enable-static argument, i.e. " ./configure --enable-static " . The reason for the "static" linking recommendation is that many people don't have root on their site's mail server and so cannot install the TRE regex library there. By making the default standard CRM114 binary standalone (static linked), it's possible for a non-root user to run CRM114 on the host without deep magic. Here are some useful Makefile targets: "make clean" -- cleans up all of the binaries that you have that may or may not be out of date. 
DO NOT do a "make clean" if you're using a binary-only distribution, as you'll delete your binaries! "make all" -- makes all the utilities (both flavors of crm114, cssutil, cssdiff, cssmerge), leaving them in the local directory. "make install" -- as root will build and install CRM114 with the TRE REGEX libraries as /usr/bin/crm . If you want a "stripped install" (cuts the binary sizes down by almost a factor of two) you will need to edit the Makefile- look at the options for "INSTALLFLAGS" "make uninstall" -- undoes the installation from "make install" "make megatest" -- this runs the complete confidence test of your installed CRM114. Not every code path can be tested this way (consider- how do you test multiple untrappable fatal errors? :) ), but it's a good confidence test anyway. "make install_gnu -- as root will build and install CRM114 with the older GNU REGEX libraries. This is obsolete but still provided for those of us with a good sense of paranoid self-preservation. Not all valid CRM114 programs will run under GNU; the GNU regex library has... painful issues. "make install_binary_only -- as root, if you have the binary-only tarball, will install the pre-built, statically linked CRM114 and utilities. This is very handy if you are installing on a security-through-minimalism server that doesn't have a compiler installed. "make install_utils" -- will build the css utilities "cssutil", "cssdiff", and "cssmerge". cssutil gives you some insight into the state of a .css file, cssdiff lets you check the differences between two .css files, and cssmerge lets you merge two css files. "make cssfiles" - given the files "spamtext.txt" and "nonspamtext.txt", builds BRAND NEW spam.css and nonspam.css files. Be patient- this can take about 30seconds per 100Kbytes of input text! It's also destructive in a sense - repeating this command with the same .txt files will make the classifier a little "overconfident". 
If your .txt files are bigger than a megabyte, use the -w option to increase the window size to hold the entire input. ******************************************************* *** Utilities for looking into .css files. This release also contains the cssutil utility to take a look at and manage .css spectral files used in the mailfilter. Section 8 of the CRM114_Mailfilter_HOWTO tells how to use these utilities; you _should_ read that if you are going to use the CLASSIFY function in your own programs. If you are using the OSB classifier, you can use the cssutil program because the file formats of default SBPH Markov and OSB Markov are compatible. However, the OSBF classifier, the Winnow classifier, and the Correlative classifier all have their own (incompatible) file formats. For the OSBF classifier, you can use the osbf-util program to look inside; for Winnow you can use the cssutil program but only the bucket use counts and the chain length counts will be correct. *** How to configure the mailfilter.crm mail filter: The instructions given here are just a synopsis- refer to the CRM114 Mailfilter HOWTO, included in your distribution kit. You will need to edit mailfilter.cf , and perhaps a few other files. The edits are quite simple, usually just inserting a username, a password, or choosing one of several given options. *** The actual filtering pipeline: - If you have requested a safety copy file of all incoming mail, the safety copy is made. - An in-memory copy of the incoming mail is made; all mutilations below are performed on this copy (so you don't get a ravaged tattered sham of email, you get the real thing) - If you have specified BASE64 expansion (default ON), any base64 attachments are decoded. - If you have specified undo-interruptus, then HTML comments are removed. - The rewrites specified in "rewrites.mfp" get applied. 
These are strictly "from>->to" rewrites, so that your mail headers will look exactly like the "canonical" mail headers that were used when the distribution .css files were built. If you build your own .css files from scratch, you can ignore this. - Filtration itself starts with the file "priolist.mfp" . Column 1 is a '+' or '-' and indicates if the regex (which starts in column 2) should force 'accept' or 'reject' the email. - Whitelisting happens next, with "whitelist.mfp" . No need for a + or a - here; every regex is on its own line and all are whitelisting. - Blacklisting happens next, with "blacklist.mfp" . No need for + or - here either- if the regex matches, the mail is blacklisted. - Failing _that_, the sparse binary polynomial hash with Markovian weights (SBPH/Markov) matching system kicks in, and tries to figure out whether the mail is good or not. SBPH/Markov matching can occasionally make mistakes, since it's statistical in nature. You actually have four matchers available- the default is SBPH/Markov, but there's also an OSB/Markov, an OSB/Winnow, and a full correlator. - The mailfilter can be remotely commanded. Commands start in column 1 and go like this (yes, command is just that- the letters c o m m a n d, right at the start of the line). You mail a message with the word command, the command password, and then a command word with arguments, and the mailfilter does what you told it. command yourmailfilterpassword whitelist string - auto-accepts mail containing the whitelist string. command yourmailfilterpassword blacklist string - auto-rejects mail containing the blacklisted string command yourmailfilterpassword spam - "learns" all the text following this command line as spam, and will reject anything it gets that is "like" it. It doesn't "learn" from anything above this command, so your headers (and any incoming headers) above the command are not considered part of the text learned. 
It's up to your judgement what part of that text you want to use or not. command yourmailfilterpassword nonspam - "learns" all the text following this line as NOT spam, and will accept any mail that it gets that is "like" it. Like learning spam, it excludes anything above it in the file from learning. The included five files (priolist.mfp, whitelist.mfp, blacklist.mfp, spam.css and nonspam.css) are meant for example, mostly. - rewrites.mfp is a set of rewrites to be applied to the incoming mail to put it in "canonical" form. You don't _need_ to edit this file to match your local system names, but your out-of-the-box accuracy will be improved greatly if you do. - priolist.mfp is a set of very specific regexes, prefixed by + or -. These are done first, as highest priority. - whitelist.mfp is mailfilterpatterns that are "good". No line-spans allowed- the pattern must match on one line. - blacklist.mfp is mailfilterpatterns that are "bad". Likewise, linespanning is not allowed (by default). Entries in this file are all people who spam me so much I started to recognize their addresses... so I've black-holed them. If you like them, you might want to unblackhole them. - spam.css and nonspam.css: These are large files and as of 2003-09-20, are included only in the .css kits. CRM .css files are "Sparse Spectra" files and they contain "fingerprints" of phrases commonly seen in spam and nonspam mail. The "fingerprint pipeline" is currently configured at five words, so a little spam matches a whole lot of other spam. It is difficult but not impossible to reverse-engineer the spam and nonspam phrases in these two files if you really want to know. To understand the sparse spectrum algorithm, read the source code (or the file "classify_details.txt"); the basic principle is that each word is hashed, words are conglomerated into phrases, and the hash values of these phrases are stored in the css file. 
Matching a hash means a word or phrase under consideration is "similar to" a message that has been previously hashed. It's usually quite accurate, though not infallible. The filter also keeps three logs: one is "alltext.txt", containing a complete transcript of all incoming mail, the others are spamtext.txt and nonspamtext.txt; these contain all of the text learned as spam and as nonspam, respectively (quite handy if you ever migrate between versions, let me assure you). Some users have asked why I don't distribute my learning text, just the derivative .css files: it's because I don't own the copyright on them! They're all real mail messages, and the sender (whoever that is) owns the copyright, not me (the recipient). So, I can't publish them. But never fear, if you don't trust my .css files to be clean, you can build your own with just a few days' spam and nonspam traffic. Your .css files will be slightly different than mine, but they will _precisely_ match your incoming message profile, and probably be more accurate for you too. A few words on accuracy: there is no warranty- but I'm seeing typical accuracies > 99% with only 12 hours worth of incoming mail as example text. With the old (weak, buggy, only 4 terms) polynomials, I got a best case of 99.87% accuracy over a one-week timespan. I now see quality averaging > 99.9% accuracy (that is, in a week of ~ 3000 messages, I will have 1 or 2 errors, usually none of them significant). Of course, this is tuned to MY spam and non-spam email mixes; your mileage will almost certainly be lower until you teach the system what your mail stream looks like. ============== stop here stop here stop here ========== ----- Old News ----- Jan 17, 2006 - BlameTheReavers This is a big new functionality release- we include mailtrainer.crm as well as changing the default mailfilter.crm from Markovian to OSB. 
This new mailtrainer program is fed directories of example texts (one example per file), and produces optimized statistics files matched to your particular mailfilter.cf setup (each 1meg of example takes about a minute of CPU). It even does N-fold validation. Default training is 5-pass DSTTTTR (a Fidelis-inspired improvement of TUNE) with a thick threshold of 5.0 pR units. Worst-offender DSTTTTR training is available as a (very slow) option. There are also speedups and bugfixes throughout the code. Unless you really like Markovian, now is a good time to think about saving your old .css files and switching over to the new default mailfilter.crm config that uses OSB unique microgroom. Then run mailtrainer.crm on your saved spam and good mail files, and see how your accuracy jumps. I'm seeing a four-fold increase in accuracy on the TREC SA corpus; this is hot stuff indeed. Version CRM114-20050511.BlameMercury This version is a documentation primary release - CRM114 Revealed (.pdf.gz, 240 pages) is now available for download. BlameMercury has lots of bugfixes and only three extensions - you can now demonize minions onto pipes, you can re-execute failing commands from a TRAP finish, and you can now use a regex directly as a var-restriction subscripting action, so [ :my_var: /abc.*xyz/ ] gets you the matching substring in the :my_var: variable. Var-restriction matches do NOT change the "previous MATCH" data on a variable. Version 20050415.BlameTheIRS (TRE 0.7.2) Math expressions can now set whether they are algebraic or RPN by a leading A or N as the first character of the string. Listings are now controllable with -l N, from simple prettyprinting to full JIT parse. A bug in the microgroomer that causes problems when both microgrooming and unique were used in the same learning scenario was squashed in Markovian, OSB, and Winnow learners (it remains in OSBF). Dependency on formail (part of procmail) in the default mailfilter.crm has been removed. 
A cleaner method of call-by-name, the :+: indirection-fetch operator, has been activated. The var :_cd: gives the call depth for non-tail-recursive routines. Minor bugs have been fixed and minor speedups added. "make uninstall" works. Documentation of regexes has been improved. Cached .css mapping activated. Win32 mmap adapters inserted. Version 20041231.BlameSanAndreas (TRE 0.7.2) Major topics: New highest-accuracy voodoo OSBF classifier (from Fidelis Assis), CALL/RETURN now work less unintuitively, SYSCALL can now fork the local process with very low overhead (excellent for demons that spawn a new instance for each connection), and of course bug fixes around the block. Floating point now accepts exponential notation (i.e. 6.02E23 is now valid) and you can specify output formatting. MICROGROOM is now much smarter (thanks again to Fidelis) and you can now do windowing BYCHUNK. This new revision has Fidelis Assis' new OSBF local confidence factor generator; with the OSB front end and single-sided threshold training with pR of roughly 10, it is more than 3 times more accurate and 6 times faster than straight SBPH Markovian and uses 1/10th the file space. The only downsides are that the OSBF file format is incompatible and not interconvertible between .css files and OSBF .cfc files, and that you _must_ use single-sided threshold training to achieve this accuracy. Single-sided threshold training means that if a particular text didn't score above a certain pR value, it gets trained even if it was classified correctly. 
For the current formulation of OSBF, training all nonspams with pR's less than 10, and all spams with pR's greater than -10 yields a very impressive 17 errors on the SA torture test, versus 42 errors with Winnow (doublesided threshold with a threshold of 0.5) and straight Markovian (54 errors with Train Only Errors training, equivalent to singlesided training with a threshold of zero) We also have several improvement in OSB, which gets down to 22 errors on the same torture test, again with the same training regimen (train if you aren't at least 10 pR units "sure", and _is_ upward compatible with prior OSB and Markovian .css files, and with the same speed as OSBF. It also doesn't use the "voodoo exponential confidence factor", so it may be a more general solution (on parsimony grounds); it has similar properties to OSB. (though there is a known bug that feature counts are all == 1 for now, but this doesn't hurt anything) CLASSIFY and LEARN both default to the obvious tokenize regex of /[[:graph:]]+/. CALL now takes three parameters: CALL /:routine_to_call:/ [:downcall_concat:] (:return_var:) The routine itself gets one parameter, which is the concatenation of all downcall_concat args (use a MATCH to shred it any way you want). RETURN now has one parameter: RETURN /:return_concat:/ The values in the :return_concat: are concatenated, and returned to the CALL statement as the new value of :return_var: ; they replace whatever was in :return_var: SYSCALL can now fork the local process and just keep running; this saves new process invocation time, setup time, and time to run the first pass of the microcompiler. See the examples in call_return_test.crm for these new hoop-jumping tricks. WINDOW now has a new capability flag - BYCHUNK mode, specifically for users WINDOWing through large blocks of data. BYCHUNK reads as large a block of incoming data in as is available (modulo limits of the available buffer space), then applies the regex. 
BYCHUNK assumes it's read all that will be available (and therefore sets EOF), so repeated reads will need to use EOFRETRY as well. Version 20041110.BlameFidelisMore (TRE 0.7.0) This new revision has Fidelis Assis' new OSBF local confidence factor generator; with the OSB front end and single-sided threshold training with pR of roughly 10, it is more than 3 times more accurate and 6 times faster than straight SBPH Markovian and uses 1/10th the file space. The only downsides are that the OSBF file format is incompatible and not interconvertable between .css files and OSBF .cfc files, and that you _must_ use single-sided threshold training to achieve this accuracy. Single-sided threshold training means that if a particular text didn't score above a certain pR value, it gets trained even if it was classified correctly. For the current formulation of OSBF, training all nonspams with pR's less than 10, and all spams with pR's greater than -10 yields a very impressive 17 errors on the SA torture test, versus 42 errors with Winnow (doublesided threshold with a threshold of 0.5) and straight Markovian (54 errors with Train Only Errors training, equivalent to singlesided training with a threshold of zero) We also have several improvement in OSB, which gets down to 22 errors on the same torture test, again with the same training regimen (train if you aren't at least 10 pR units "sure", and _is_ upward compatible with prior OSB and Markovian .css files, and with the same speed as OSBF. It also doesn't use the "voodoo exponential confidence factor", so it may be a more general solution (on parsimony grounds); it has similar properties to OSB. (though there is a known bug that feature counts are all == 1 for now, but this doesn't hurt anything) CLASSIFY and LEARN both default to the obvious tokenize regex of /[[:graph:]]+/. Version 20040921.BlameResidentWeasel Bugs stomped: several in the exit routine code, as well as fixes for detecting minion creation failures. 
SYSCALL can now do forking of the currently executing code; CALL is now implemented and provides CORBA-like argument transfer. A missing INSERT file in a .crm program is now a trappable error Version 20040815.BlameClockworkOrange Start/Length operators in match qualification are now working (same syntax as seek/length operators in file I/O), -v (and :_crm_version:) now also ID the regex engine type and version, and several bugs (including two different reclaimer bugs) have now been stomped. Other code cleanups and documentation corrections have been done. Version 20040808.BlamePekingAcrobats This is a bugfix/performance improvement release. The bugs are minor edge cases, but better _is_ better. SYSCALL now has better code for and (async is now truly "fire and forget"; keep keeps the process around without losing state, and default processes will now not hang forever if they overrun the buffer.) Documentation has been improved. Both OSB and Winnow are now both faster and more accurate (as bugs were removed). A particularly nasty bug that mashed isolated vars of zero length was quashed. -D and -R (Dump and Restore) are available in cssutil for moving .css files between different-endian architectures. Version 20040723.BlameNashville This is a major bugfix release with significant impact on accuracy, especially for OSB users. There's now a working incremental reclaimer, so there's no more ISOLATE-MATCH-repeat bug (feel free to isolate and match without fear of memory leakage). The "exit 9" bug has been fixed (at least I can no longer coerce it to appear)- users of versions after 20040606-BlameTamar should upgrade to this version. Version CRM114-20040625-BlameSeifkes Besides the usual minor bugfixes (thanks!) there are two big new features in this revision: 1) We now test against ( and ship with ) TRE version 0.6.8 . Better, faster, all that. :) 2) A fourth new classifier with very impressive statistics is now available. 
This is the OSB-Winnow classifier, originally designed by Christian Siefkes. It combines the OSB frontend with a balanced Winnow backend. But it may well be twice as accurate as SBPH Markovian and four times more accurate than Bayesian. Like correlative matching, it does NOT produce a direct probability, but it does produce a pR, and it's integrated into the CLASSIFY statement. You invoke it with the flag: classify (file1.cow | file2.cow) /token_regex/ and learn (file1.cow) /token_regex/ learn (file2.cow) /token_regex/ Note that you MUST do two learns on a Winnow .cow files- one "positive" one on the correct class, and a "refute" learn on the incorrect class (actually, it's more complicated than that and I'm still working out the details.) Being experimental, the OSB-Winnow file format is NOT compatible with Markovian, OSB, nor correlator matching, and there's no functional checking mechanism to verify you haven't mixed up a .cow file with a .css file. Cssutil, cssdiff, and cssmerge think they can handle the new format- but they can't. Further, you currently have to train it in a two-step process, learning it into one file, and refuting it in all other files: LEARN (file1.cow) /regex/ then LEARN (file2.cow) /regex/ which will do the right thing. If the OSB-winnow system works as well as we hope, we may put the work into adding CLASSIFY-like multifile syntax into the LEARN statement so you don't have to do this two-step dance. Version 20040601-BlameKyoto 1) the whitelist.mfp, blacklist.mfp, and priolist.mfp files shipped are now "empty", the prior lists are now shipped as *list.mfp.example files. Since people should be very careful setting up their black and white lists, this is (hopefully!) an improvement and people won't get stale .mfp's . 2) The CLASSIFY statement, running in Markovian mode, now uses Arne's speedup, and thus runs about 2x faster. Note that this speedup is currently incompatible with , and so you should use either one or the other. 
Once a file has been trained one way, you should continue to use that same way. This is _not_ enforced yet in the software; if you get it wrong you will get a slightly higher error rate, but nothing apocalyptic will happen. 3) the CLASSIFY statement now supports Orthogonal Sparse Bigram features. These are mostly up- and down-compatible with the standard Markovian filter, but about 2x faster than Markovian even with Arne's speedup. Even though there is up- and down-compatibility, you really should pick one or the other and stick with it, to gain the real speed improvement and best accuracy. 4) The CLASSIFY (that is, full correlative) matcher has been improved. It now gives less counterintuitive pR results and doesn't barf if the test string is longer than the archetype texts (it still isn't _right_, but at least it's not totally _wrong_. :) Using correlative matching will approach maximal accuracy, but it's _slow_ (call it 1/100th the speed of Markovian). We're still working on the information theoretic aspects of correlative matching, but it may be that correlative matching may be even more powerful than Markovian or OSB matching. However, it's so slow (and completely incompatible with Markovian and OSB) that a statistically significant test has yet to be done. Note: this version (and prior versions) are NOT compatible with TRE version 0.6.7. The top TRE person has been notified; so use TRE version 0.6.8 (which is included in the source kit) or drop back to TRE-0.6.6 as a fallback. Documentation is (as usual) cleaned up yet further. Work continues on the full neural recognizer. It's unlikely that the neural recognizer will use a compatible file format, so keep around your training sets! Version 20040418-BlameEasterBunny This is the new bleeding edge release. It has several submitted bugfixes (attachments, windowing), major speedups in data I/O, and now allows random-access file I/O (detailed syntax can be found on the QUICKREF text). 
For example, if you wanted to read 16 bytes starting at byte 32 of an MP3 file (to grab one of the ID3 tags), you could say input [myfile.mp3 32 16] (:some_tag:) Likewise, you can specify an fseek and count on output as well; to overwrite the above ID3 tag, use: output [myfile.mp3 32 16] /My New Tag Text/ As usual for a bleeding-edge release, this code -is- poorly tested yet. Caution is advised. There's still a known memory leak if you reassign (via MATCH) a variable that was isolated; the short-term fix is to MATCH with another var and then ALTER the isolated copy. March 27, 2004 - BlameStPatrick This is the new bleeding edge release. A complete rewrite of the WINDOW code has been done (byline and eofends are gone, eofretry and eofaccepts are in), we're integrating with TRE 0.6.6 now, and a bunch of bugs have been stomped. For those poor victims who have mailreader pipelines that alter headers, you can now put "--force" on the BASH line or "force" on the mailer command line, e.g. you can now say command mysecretpassword spam force to force learning when CRM114 thinks it doesn't need to learn. However, this code -is- poorly tested yet. Caution is advised. There's still a known memory leak if you reassign (via MATCH) a variable that was isolated; the short-term fix is to MATCH with another var and then ALTER the isolated copy. February 2, 2004 - V1.000 This is the V1.0 release of CRM114 and Mailfilter. The last few known bugs have been stomped (including a moderately good infinite loop detector for string rewrites, and a "you-didn't-set-your-password" safety check), the classifier algorithms have been tuned (default is full Markovian), and it's been moderately well tested. Accuracies over 99.95% are documented on real-time mail streams, and the overall speed is 3 to 4x faster than SpamAssassin. My thanks to all of you whose contributions of brain-cycles made this code as good as it is. 20040118 (final tweaks?) 
It turns out that CAMRAM needs (as in is a virtual showstopper) the ability to specify which user directory all of the files are to be found in. Since #insert _cannot_ do this (it's compile time, not run time), mailfilter.crm (and classifymail.crm) now have a new --fileprefix=/somewhere/ option. To use it, put all of the files (the .css's, the .mfp's etc) that are on a per-user basis in one directory, then specify mailfilter.crm --fileprefix=/where/the/files/are/ Note that this is a true prefix- you must put a trailing slash on to specify a directory by that name. On the other hand, you can specify a particular prefix on a per-user basis, e.g.: mailfilter.crm --fileprefix=/var/spool/mail/crm.conf/joe- so that user "joe" will use mailfilter.crm with these files: /var/spool/mail/crm.conf/joe-mailfilter.cf /var/spool/mail/crm.conf/joe-rewrites.mfp /var/spool/mail/crm.conf/joe-spam.css /var/spool/mail/crm.conf/joe-nonspam.css and so on. Note that this does NOT override --spamcss and --nonspamcss options; rather, the actual .css filenames are the concatenation of the fileprefix and spamcss (or nonspamcss) names. Version 20040105 (recheck) Version 1.00, at last! The only fixes here are to make the Makefile a little more bulletproof and lets you know how to fix a messed-up /etc/ld.so.conf, and of course this document has been updated. Otherwise this version should be the same as the December 27 2003 (SanityCheck) version, which has no reported reproducible bugs higher than a P4 (documentation and feature request). For the last two weeks, I had _one_ outright error and two that I myself found borderline out of about 5000 messages. That's 2x better than a human at the same task. My thanks to all of you whose contributions of brain-cycles made this code as good as it is. -Bill Yerazunis Version 20031227 (SanityCheck) This is (hopefully) the last test version before V1.0, and bug fixes are minimal. This is really a sanity check release for V1.0 . 
It is now time to triage what needs to be fixed versus what doesn't, and very few things NEED to be fixed. Things that changed (or not) are: 1) BUGS ACTUALLY FIXED: removed the arglist feature from mailfilter.crm; there's a poorly understood bug in NetBSD versus Linux that breaks things. allmail.txt flag control wasn't being done correctly. That's fixed. a couple of misleading comments in the code are fixed. 2) THINGS THAT ARE NOT CHANGED IN THIS VERSION BUT ARE V1.1 CANDIDATES: the install location fix is NOT in V1.0. This will move the location of the actual binary (/usr/bin/crm versus /usr/local/bin/crm- and then add a symlink /usr/bin/crm --> /usr/local/bin/crm- ) the --mydir feature of mailfilter.crm is not yet implemented and won't be in V1.0 . Expect it in V1.1 Other than that and a few documentation fixes, this version is identical to 20031217. It's just the final sanity check before we do V1.0 Version 20031215-RC11 Minor bugs smashed. Math evaluation now works decently (but be nice to it). Mailfilter accuracy is up past 99.9% (less than 1 error per thousand, usually when a spammer joins a well-credentialed list and spams the list, or a seldom-heard-from friend sends a one-line message with a URL wrapped in HTML). Command line features for CAMRAM added ("--spamcss" and "--nonspamcss"; these will probably become unified to a --mydir). Lots of documentation updates; if it says something in the documentation, there's actually a good chance it works as described. Version 20031111-RC7 More bugs smashed- there are still a few outstanding bugs here and there, but you aren't likely to find them unless you're really pushing the limits. Improvements are everywhere; You can now embed the classical C escape chars in a var-expanded string (e.g. \n for a newline) as well as hex and octal characters like \xFF and \o132.) 
EVAL now can do string length and some RPN arithmetic/comparisons; approximate regexing is now available by default, and the command line input is improved. Version 20031101-RC4 (November 1, 2003) The only changes this release are some edge-condition bugfixes (thanks to Paolo and JSkud, among others) and the inclusion of Ville Laurikari's new TRE 0.6.0-PRE3 regex module. This regex module is tres-cool because it actually has a useful approximate matcher built right in, dovetailed into the REGEX syntax for #-of-matches. Consider the regex /aaa(foo){1,3}zzz/ . This matches "foo", "foofoo", or "foofoofoo". Cognitively anything in a regex's {} doesn't say what to match, just how to match it. The cognitive jump you have to take here is /foo{bar}/ can have a {bar} that says _how accurately_ to match foo. For instance: foo{~} finds the _closest_ match to "foo" (and it always succeeds). The full details of approximate matching are in the quickref. Read and Enjoy. (for your convenience, we also include the well-proven 0.5.3 TRE library, so you should install ONE and ONLY one of these. Realize that 0.6.0-PRE3 is still a fairly moderately tested library; install whichever one meets your need to bleed. :-) ) Oct 23, 2003 ( version 20031023-RC3 ) Yes, we're now at RC3. Changes are that EVAL now works right, lots of bugfixes, and the latent code for RFC-compliant inoculation is now in the shipped mailfilter.crm (but turned off in mailfilter.cf) All big changes are being deferred to V1.1 now; this is bugfix city. Make it bleed, folks, make it _bleed_. -Bill Yerazunis October 15, 2003 It's been a long road, but here it is - RC1, as in Release Candidate 1. WINDOW and HASH have been made symmetrical, the polynomials have been optimized, and it's ready. Accuracy is steady at around 3 nines. Because of all the bugfixes, upgrading to this version (compatible with the BETA series) is recommended. 
-Bill Yerazunis This is the September 25th 2003 BETA-2 What's new: a few dozen bugs stomped, and new functionality everywhere. Command line args can now be restricted to acceptable sets; will keep your .css files nicely trimmed; ISOLATE will copy preexisting captures, --learnspam and --learnnonspam in mailfilter.crm will perform exactly the same configured mucking as filtering would, and then learn; --stats_only will generate ONLY the 'pR' value (this is mostly for CAMRAM users), positional args will be assigned :_posN: variables, the kit has been split so you don't have to download 8 megs of .css if you are building your .css locally, and it's working well enough that this is a full BETA release. 'August 07, 2003 bugfix release. Changes: lots and lots of bugfixes. Really. The only new code is experimental code in mailfilter (to add 'append verbosity as attachment') and getting WINDOW to work on any variable, everything else is bugstomping or enhanced testing (megatest.sh runs a lot of tests automatically now). There's still a bug or dozen out there, so keep sending me bug reports! (and has anyone else done the cssutil --> cssmerge to build small .css files for fast running?) This is the July 23, 2003 alpha release. This release is a bugfix release for the July 20 secret release. Fixes include: configuration toggles for allmail.txt and rejected_mail.txt, execution time profiling works, (-p generates an execution time profile, -P now limits number of statements in program), Good news: the new .css file format seems to be working very well; although we spend a little more time in .css evaluation, the accuracy increase is well worth it (I've had _one_ error since 07-20, a false accept to a mailing list that came back as "marginally nonspam" because the mailing list is usually squeaky clean). 
Merging works well; you can now make your .css files as big (or small) as you dare (within reason; you'll need to throw away features if you want to compress the heck out of it and you'll use lots of memory or page like crazy if you make them too big). If experiment shows that this memory usage is excessive, let me know and I'll see if I can do a less-space-for-more-time tradeoff. Profiling indicates that we spend more time in blacklist processing than in the whole SBPH/BCR evaluator, (which isn't that surprising, when you get down to it), so maybe trimming the blacklist to people who spam _you_ would be a good performance improvement. Anyway, here you go; this is a _recommended_ release. Grab it and have fun. :) As usual, prior news and updates are at the end of this file. --------- This is the July 19, 2003 SECRET alpha release. It won't be linked on the webpage- the only people who will know about it are the ones who get this email. Y'all are special, you know that? :-) Since this is a SECRET release, you all have a "need to know". That need is simple: I'd like to get a little more intense testing on this new setup before I put it out for general release. Enough has changed that you _need_ to read ALL the news before you go off and install this version. Be AFRAID. :) LOTS of changes have occurred - the biggest being that the new, totally incompatible but far better .css format has been implemented. The new version has everything you all wanted- both for people who want huge .css files, and for people who want _smaller_ .css files. This new stuff has necessitated scouring cssutil and cssdiff so don't use the old versions for the new format files. Lastly, because the old bucket max was 255 and the new is 4 gigs, the renormalization math changed a little. Expect pRs to be closer to 0 until you train some more. Accuracy should be better, even _before_ training, so overall it's a net win. There's also string rewriting in the pre-classification stage (who wanted that? 
Somebody did....) and since term rewriting is so darn useful, I'm releasing an expurgated version of the string rewriter I use to scrub my spam and nonspam text of words that should not be learned. This scrubber automatically gets used if you "make cssfiles". Here's the details: 1) The format of the .css files has changed drastically. What used to be a collisionful (and error-accepting) hash is now a 64-bit hash that is (probably) nearly error free, as it's also tagged with the full 64-bit feature value; if two values clash as to what bucket they would like to use, proper overflow techniques keep them from both using the same bucket. Bucket values were maxxed at 255 (they were bytes) now they're 32-bit longs, so you are _highly_ _unlikely_ to max out a bucket. These two changes make things significantly more robust. These changes also make it possible (in fact, trivial) to resize (both upward and downward!), compress, optimize, and do other very useful things to .css files. Right now, the only supported operation is to _merge_ one .css file onto another... but the good news is that now these files can be of different sizes! So, the VERY good news is that you can look at your .css files with cssutil, decide if (or where) you want to zero out less significant data, and then use dd to create a blank, new outfile.css file that will be about half to 2/3 full, then use cssmerge outfile.css infile.css to merge your infile.css into the outfile.css. This will be a real help for people who have (or need) very large OR very small .css files. :) You can create the blank .css file with the command 'dd' as in: dd bs=12 count= if=/dev/zero of=mynew.css (the bs=12 is because the new feature buckets are 12 bytes long) Because chain overflowing is done "in table, in sequence" you can't have more features than your table has feature buckets. You'll get a trappable error if you try to exceed it. 
Minor nit- right now, feature bucket 0 is reserved for version info- but it's never used (left as all 0's). That's no major hassle, but just-so-you-know... :) 2) A major error in error trapping has been corrected. TRAPs can now nest at least vaguely correctly; a nonfatal trap that is bounced does not turn into a fatal. Also, the :_fault: variable is gone, each TRAP now specifies its own fault code. This isn't to say that error trapping is now perfect, but it's a darn sight better than it was before. 3) term rewriting on the matched or learned text is now supported; this will mean significant gains in out-of-the-box accuracy as well as keeping your mail gateway name from becoming a spam word. :) Far more fancy rewritings can be implemented, if you should choose. The rewriting rules are in rewrites.mfp - YOU must edit this to match your local and network mailer service configuration, so that your email address, email name, local email router, and local mail router IP all get mapped to the same strings as the ones I built the distribution .css files with. 4) Minor bugs - a minor bug (inaccurate edge on matching) for the polynomial; annoying segfault on insert files that ended with '#' that were immediately followed by a { in the main program was fixed; 5) a new utility is provided - rewriteutil.crm. This utility can do string rewriting for whatever purpose you need. I personally use it to "scrub" the spam and nonspam text files; the file rewrites.mfp contains an (expurgated) set of rewrite rules that I use. You will need to edit rewrites.mfp to put your account name and server nodes in, otherwise you'll be using mine (and losing accuracy) For examples on the term rewriting, both in the mailfilter and in the standalone utility rewriteutil.crm, just look at the example/test code in rewritetest.crm (which uses the rewrite rules in test_rewrites.mfp) This is the July 1, 2003 alpha release. This is a further major bugstomping release. 
The .css files are expanded to 8 megabytes to decrease the massive hash-clashing that has occurred. UNION and INTERSECTION now work as described in the (updated) quickref.txt, with the (:out:) var in parens and the [:in1: :in2: ...] vars in boxes. A major bug in LEARN and CLASSIFY has been stomped; however this is a "sorta incompatible" change and you are encouraged to rebuild your .css files with a hundred Kbytes or so of prime-grade spam and nonspam (which has been stored for you in spamtext.txt and nonspamtext.txt). The included spam.css and nonspam.css files are already rebuilt for the corrected bug in LEARN and CLASSIFY. These .css files are also completely fresh and new; I restarted learning about a week ago and they're well into the 99.5% accuracy range. This is the June 23, 2003 alpha release. This is a major bugstomping release. and now seem to work more like they are described to work. The backslash escapes now are cleaner; you may find your programs work "differently" but it _should_ be backward_compatible. The preprocessor no longer inserts random carriage returns. A '\' at the end of a line is a continuation onto the next line. Mailfilter now can be configured for separate exit codes on "nonspam", "spam" and "problem with the program". Exit codes on CRM114 itself have been made more appropriate; compiler errors and untrapped fatal faults now give an error exit code. Additionally, FAULT and TRAP are scrubbed, and the documentation made more accurate. June 10 news: This new version implements the new FAULT / TRAP semantics, so user programs can now do their own error catching and hopefully error fixups. Incomplete statements are now flagged a (little bit) better. Texts are now Base64-expanded and decommented before being learned. There's a bunch of other bugfixes as well. Default window size is dropped to 8 megs, for compatibility with HPUX (change this in crm114_config.h). 
June 01, 2003 news: the ALIUS statement - provides if/then/else and switch/case capabilities to CRM114 programmers. See the example code in aliustest.crm to get some understanding of the ALIUS statement. the ISOLATE statement - now takes a /:*:initial: value / for the freshly isolated variable. Mailfilter.crm is now MUCH more configurable, including inserting X-CRM114-Status: headers and passthru modes for Procmail, configurable verbosity on statistics and expansions, inserting trigger 'ADV:' tags into the subject line, and other good integration stuff. Overall speed has improved significantly - mailfilter is now about four times FASTER than SpamAssassin with no loss of accuracy. bugfix - we now include Ville Laurikari's TRE regexlib version 0.5.3 with CRM114; using it is still optional ("make experimental") but it's the recommended system if your inputs include NULL bytes. bugfix - OUTPUT to non-local files now goes where it claims, it should no longer be necessary to pad with a bunch of spaces. yet more additions to the .css files April 7th version: 0) We're now up to "beta test quality"... no more "alpha" quality level. This is good. :-) 1) As always, lots of bugfixes. And LOTS of thanks from all of you poor victims out there. We've reached critical mass to the point now where I'm even getting bug _fix_ suggestions; this is great! If you do make a bug report or a bugfix suggestion, please include not only the version of CRM114 you're running, but also the OS and version of that OS you're running. I've seen people porting CRM114 to Debian, to BSD, to Solaris, and even to VMS... so please let me know what you're running when you make a bug report. PLEASE PUT AT LEAST THE CRM114 VERSION IN THE SUBJECT LINE. 2) We now have an even better 'mailfilter.crm' . Even with the highly evolved spam in the last couple of months, we're still solidly above 99% (averaging around 99.5%). (it's clear that the evolution is due to the pressures brought by Bayesian filters like CRM114)... 
some of these new spams are very, VERY good. But we chomp 'em anyway. :-) 3) The new metaflag "--" in a CRM114 command line flags the demarcation between "flags for CRM114" and "flags for the user program to see as :_argN:". Command line arguments before the "--" are seen only by CRM114; arguments after the "--" are seen only by the user program. 4) EXPERIMENTAL DEPARTMENT: We now have better support for the 8-bit-clean, approximate-capable TRE regex engine. It's still experimental, but we now include the TRE 0.5.1 directory in this kit; you can just go into that subdirectory, do a .configure, a make, and a make install there, and you'll have the TRE regex engine installed onto your machine (you need to be root to do this). Then go back up to the main install directory, and do a "make experimental" to compile and install the experimental version as /usr/bin/crma (the 'a' is for 'approximate regex support'. Using the experimental version 'crma' WILL NOT AFFECT the main-line version 'crm'; both can coexist without any problems. To use the approximate regex support (only in version 'crma') just add a second slashed string to the MATCH command. This string should contain four numbers, in the order SIMD (which every computer hacker should remember easily enough). The four integers are the: Substitution cost, Insertion cost Maximum cost Deletion cost in an approximate regex match. If you don't add the second slash-delimited string, you get ordinary matching. Example: match /foobar/ /1 1 1 1/ means match for the string "foobar" with at most one substitution, insertion, or deletion. This syntax will eventually improve- like the makefile says, this is an experimental option. DO NOT ASSUME that this syntax will not change TOTALLY in the near future. DO NOT USE THIS for production code. 4) Yet further improvements to the debugger. 5) Further improvements to the classifier and the shipped .css files. 
6) The "stats" variable in a CLASSIFY statement now gives you an extra value- the pR number. It's pR for the same reason pH is pH - it gives an easy way to express very large numeric ratios conveniently. The pR number is the log base 10 of the .css matchfile signal strength ratios; it typically ranges from +350 or so to -350 or so. If you're writing a system that uses CRM114 as a classifier, you should use pR as your decision criterion ( as used by mailfilter.crm and classifymail.crm, pR values > 0 indicate nonspam, <0 indicates spam ) If you want to add a third classification, say "SPAM/UNSURE/NONSPAM", use something like pR > 100 for nonspam, between +100 and -100 for unsure, and < -100 for spam. CAMRAM users, take note. :) 6) The functionality of 'procmailfilter.crm' has been merged back into mailfilter.crm, classifymail.crm, learnspam.crm and learnnonspam.crm. Do NOT use the old "procmailfilter.crm" any more - it's buggy, booger-filled, and unsupported from now on. PLEASE PLEASE PLEASE don't use it, and if you have been using it, please stop now! Jan 28th release news Many thanks to all of you who sent in fixes, and taught me some nice programming tricks on the side. 0) INCOMPATIBLE CHANGES: a) INCOMPATIBLE (but regularizing) change: Input took from the file [this-file.txt] but output went to (that-file.txt); this was a wart and is now fixed; INPUT and OUTPUT both now use the form of INPUT [the-file-in-boxes.txt] and OUTPUT [the-file-in-boxes.txt] b) INCOMPATIBLE (but often-requested) change: You don't need to say "#insert" any more. Now it's just ' insert ', with no '#' . Too many people were saying that #insert was bogus, and it was too easy to get it wrong. Now, insert looks like all other statements; insert yourfilenamehere.crm c) The gzip file no longer unpacks into "installdir", but into a directory named crm114- . 
1) BUGFIXES: bugs stomped all over the place - debugger bugs (now the debugger doesn't go into lalaland if an error occurs in a batch file), infinite loop on bogus statements fixed, debugger "n" not doing the right thing), window statement cleaned and now works better, '\' now works correctly even in /match patterns/, default buffer length is now 16 megabytes (!), the program source file is now opened readonly. 2) 8-BIT-CLEAN: code cleanups and reorganizations to make CRM114 8-bit-cleaner; There may be bugs in this (may? MAY?) but it's a start. (note- you won't get much use of this unless you also turn on the TRE engine, see next item.) 3) REGEX ENGINES: the default regex engine is still GNU REGEX (which is not 8-bit-clean) but we include the TRE regex engine as well (which is not only 8-bit-clean, but also does approximate regexes. TRE is still experimental, you will need to edit crm114_config.h to turn it on and then rebuild from sources. Do searches of www.freshmeat.net to see when the next rev of TRE comes out. 4) SUBPROCESSES: Spawned minion buffers now set as a fraction of the data window size, so programs don't die on overlength buffers if they copy a full minion output buffer into a non-empty main data window. The current default size is scaled to the size of the main data buffers, currently 1/8th of a data buffer, with the new default of a 16-meg allocate-on-the-fly data buffer that means your subprocesses can spout up to 2 megs of data before you need to think about using asynchronous processes. 5) The debugger now talks to your tty even if you've redirected stdin to come from a data file. EOF on the controlling tty exits the program, so -d nnnn sets an upper limit on the number of cycles an unattended batch process will run before it exits. (this added because I totally hosed my mailserver with an infinite loop. Quite the "learning experience", but I advise against it. ) 6) An improved tokenizer for mail filtering. 
You can pick any of 7) Option for exit codes for easy ProcMail integration, so the old "procmailfilter.crm" file goes away, it's no longer necessary to have that code fork. 8) For those of you who want easier integration with your local mail delivery software, without all the hassle of configuring mailfilter.crm, there's three new very bare-bones programs, meant to be called from Procmail. These do NOT use the blacklist or whitelist files, nor can they be remotely commanded like the full mailfilter.crm: learnspam.crm learnnonspam.crm classifymail.crm * learnspam.crm < some-spam.txt will learn that spam into your current spam.css database. Old spam stays there, so this is an "incremental" learn. * learnnonspam.crm < some-non-spam.txt will learn that nonspam into your current nonspam.css database. Old nonspam stays there, so this is an "incremental" learn. * classifymail.crm < mail-message.txt will do basic classification of text. This code doesn't do all the advanced things like base-64 armor-piercing nor html comment removal that mailfilter.crm does, and so it isn't as accurate, but it's easier to understand how to set it up and use it. Classifymail.crm returns a 0 exit code on nonspam, and a 1 exit code on spam. Simple, eh? Classifymail does NOT return the full text of the message, you need to get that another way (or modify classifymail.crm to output it- just put an "accept" statement right before the two "output ..." statements and you'll get the full incoming text, unaltered.) November 26, 2002: NEW Built-in Debugger - the "-d" flag at the end of the command line puts you into a line-oriented high-level debugger for CRM114 programs. Improved Classifier - the new classifier math is giving me > 99.92% accuracy (N+1 scaling). In other words, once the classifier is trained to your errors, you should see less than one spam per thousand sneak through. Bug fixes - the code base now should compile more cleanly on newer systems that have IEEE float.h defined. 
Security fix- a non-exploitable buffer overflow fixed Documentation fixes - Serious doc errors were fixed Nov 8th, 2002 version *) Procmail users: a version of mailfilter.crm specifically set up for calling from inside procmail is included- see the file "procmailfilter.crm" for the filter, and "procmailrc.recipe" for an example recipe of how to call it. (courtesy Craig Hagan) *) Bayesian Chain Rule implemented - scoring is now done in a much more mathematically well-founded way. Because of this, you may see some retraining required, but it shouldn't be a lot. Users that couldn't use my pre-supplied .css files should delete the supplied .css files and retrain from their own spamtext.txt and nonspamtext.txt files. *) classifier polynomial calculation has been improved but is compatible with previous .css files. *) -s will let you change the default size for creating new .css files (needed only if you have HUGE training sets.) Rule of thumb: the .css files should be at least 4x the size of the training set. *) Multiple .css files will now combine correctly - that is, if you have categorized your mail into more than "spam" and "nonspam", it now works correctly. Ex: You might create categories "beer", "flames", "rants", "kernel", "parties", and "spam", and all of these categories will plug-and-play together in a reasonable way. *) speed and correctness improvements - some previously fatal errors can now be corrected automagically. Oct 31, 2002: Bayesian Chain Rule implemented - scoring is now done in a much more mathematically well-founded way. Because of this, you may see some retraining required, but it shouldn't be a lot. Users that couldn't use my pre-supplied .css files should delete the supplied .css files and retrain from their own spamtext.txt and nonspamtext.txt files. Classifier polynomial calculation has been improved but is compatible with previous .css files. 
-s will let you change the default size for creating new .css files (needed only if you have HUGE training sets.) Rule of thumb: the .css files should be at least 4x the size of the training set. Multiple .css files will now combine correctly - that is, if you have categorized your mail into more than "spam" and "nonspam", it now works correctly. Ex: You might create categories "beer", "flames", "rants", "kernel", "parties", and "spam", and all of these categories will plug-and-play together in a reasonable way, e.g. classify (flames.css rants.css spam.css | beer.css parties.css kernel.css) will split out flames, rants, and spam from beer, parties, and linux-kernel email. (I don't supply .css files for anything but spam and nonspam, though.) Lastly, there are some new speed and correctness improvements - some previously fatal errors can now be corrected automagically. Oct 21: Improvements everywhere - a new symmetric declensional parser, a much more powerful and accurate sparse binary polynomial hash system ( sadly, incompatible; - if you LEARNed new data into the .css files, you must use learntest.crm to LEARN the new data into the new .css files as the old file used a less effective polynomial.) Also, many bugfixes including buffer overflows fixed, -u to change user, -e to ignore environment variables, optional [:domain:] restrictions allowed on LEARN and CLASSIFY, status output on CLASSIFY, and exit return codes. Grotty code has been removed, the Remote LEARN invocation now cleaned up, and CSSUTIL has been scrubbed up. Oct 5: Craig Rowland points out a possible buffer exploit- it's been fixed. In the process, the -w flag now boosts all intermediate calculation text buffers as well, so you can do some big big things without blowiing the gaskets. :) crm114-20100106-BlameMichelson.src/inserttest_c.crm0000755000000000017500000000030611321154266020210 0ustar rootwsy#! /usr/bin/crm # # inserttest_c.crm - test insert processor # Copyright 2009 William S. Yerazunis. 
# This file is under GPLv3, as described in COPYING. output / the really middle-middle bit .../ crm114-20100106-BlameMichelson.src/crm_pca.c0000644000000000017500000017322011321154266016552 0ustar rootwsy// crm_pca.c - Principal Component Analysis //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. #include "crm_pca.h" //static function declarations static Vector *convert_document(char *text, long text_len, unsigned int *features, ARGPARSE_BLOCK *apb); static int compare_features(const void *a, const void *b); static void *pca_map_file(crm_pca_block *blck, char *filename); static void pca_get_meta_data(char *filename, crm_pca_block *blck); static int has_new_vectors(char *filename); static PCA_Solution *get_solution_from_pca_file(char *filename, void **st_addr); //these are around for the times when we want to read in the file //without overwriting what we have in memory (ie in a learn during // a classify) static int pca_read_file(crm_pca_block *blck, char *filename); static int pca_read_file_fp(crm_pca_block *blck, FILE *fp); //these always use fwrite. they have to be called sometimes even //though we try to use mmap to grow the file size. 
static size_t pca_write_file(crm_pca_block *blck, char *filename); static size_t pca_write_file_fp(crm_pca_block *blck, FILE *fp); static size_t pca_write_theta(Vector *theta, FILE *fp); //this writes to the mmap'd file in memory if there's room or //forces an unmap and calls append static size_t append_vector_to_pca_file(Vector *v, char *filename); //this writes everything back to disk using fwrite or unmap as //appropriate. if the file was read, it always uses fwrite. if //the file was mapped in, it tries to alter that memory to have the //correct new values in it and, if it can't, fwrites it. static size_t crm_pca_save_changes(crm_pca_block *blck, void *addr, char *filename); static void crm_pca_block_init(crm_pca_block *blck); static void crm_pca_block_free_data(crm_pca_block blck); static void crm_pca_learn_new_examples(crm_pca_block *blck, int microgroom); int pca_trace = 0; /**********************CONVERTING TEXT TO FEATURES***************************/ //function to be passed to qsort that compares two features //the sort will be in INCREASING feature value order static int compare_features(const void *a, const void *b) { unsigned int *c = (unsigned int *)a; unsigned int *d = (unsigned int *)b; if (*c < *d) { return -1; } if (*c > *d) { return 1; } return 0; } /******************************************************************* *Helper function to convert text to features. * *INPUT: text: text to convert * text_len: number of characters in text * apb: the argparse block. * *OUTPUT: (features: as pass by reference contains the exact features.) * A vector of the features as a MATR_COMPACT, SPARSE_ARRAY. This * feature vector adds in the class magnitude if it is non-zero using the * CRM_REFUTE flag to set the sign. * *WARNINGS: *1) You need to free the returned vector (using vector_free) * once you are done with it. *2) The returned vector is NOT just a vector of the features. 
We do * PCA-specific manipulations to it, specifically, multiplying the * features by their label and adding a column if PCA_ADD_CONSTANT * is set. *******************************************************************/ static Vector *convert_document(char *text, long text_len, unsigned int *features, ARGPARSE_BLOCK *apb) { long next_offset; long n_features, i; int class, entry = 1; Vector *v; VectorIterator vit; crm_vector_tokenize_selector (apb, // the APB text, // input string buffer 0, // start offset text_len, // length NULL, // parser regex 0, // parser regex len NULL, // tokenizer coeff array 0, // tokenizer pipeline len 0, // tokenizer pipeline iterations features, // where to put the hashed results MAX_PCA_FEATURES - 1, // max number of hashes &n_features, // how many hashes we actually got &next_offset); // where to start again for more hashes if (apb->sflags & CRM_REFUTE) { //this is a negative example if (PCA_CLASS_MAG) { class = -1*PCA_CLASS_MAG; } } else { class = PCA_CLASS_MAG; } if (!n_features) { if (class) { v = vector_make_size(1, SPARSE_ARRAY, MATR_COMPACT, 1); vectorit_set_at_beg(&vit, v); vectorit_insert(&vit, 0, class, v); } else { v = vector_make_size(1, SPARSE_ARRAY, MATR_COMPACT, 0); } return v; } //Put the features into a vector qsort(features, n_features, sizeof(unsigned int), compare_features); v = vector_make_size(features[n_features-1]+1, SPARSE_ARRAY, MATR_COMPACT, n_features); vectorit_set_at_beg(&vit, v); if (class) { //insert the class mag vectorit_insert(&vit, 0, class, v); } //put the features into the vector, making them unique //if necessary for (i = 0; i < n_features; i++) { if (features[i] == 0) { continue; } vectorit_find(&vit, features[i], v); if (vectorit_curr_col(vit, v) == features[i]) { if (!(apb->sflags & CRM_UNIQUE)) { //if we see something twice and we don't have UNIQUE set //it's entry is 2 (or -2) instead of 1 vectorit_insert(&vit, features[i], vectorit_curr_val(vit, v) + entry, v); } } else { vectorit_insert(&vit, 
features[i], entry, v); } } //make v only take up the amount of memory it should if (v && v->type == SPARSE_ARRAY) { expanding_array_trim(v->data.sparray); } return v; } /**************************PCA FILE FUNCTIONS*********************************/ /****************************************************************************** * *The PCA file is a binary file formatted as follows: * *PCA_FIRST_NBIT bytes: A string or whatever you want defined in * PCA_FIRST_BITS. This isn't a checksum since we don't want to have to read * in the whole file every time in order to verify it - it's simply a stored * value (or even a string) that all PCA stat files have as the first few * bytes to identify them. While there is as much error checking as I can do * in this code, non-PCA binary files can create seg faults by mapping two * vector headers into the same space so that changing one changes part of * another. There is almost nothing I can do about that, so, to eliminate * that problem as far as I can, we have a little "magic string" in front. * *N_OFFSETS_IN_PCA_FILE size_t's: * * size_t size: The offset until the end of the actual data stored in the file. * We leave a large hole at the end of the file so we can append to it without * having to uncache it from memory. This is the offset to the beginning of * the hole. When reading the file in, we do not need to read past this * offset since the rest is garbage. This changes each time we append a * vector. 
* *N_CONSTANTS_NOT_IN_BLOCK ints: don't actually have any :) * *N_CONSTANTS_IN_PCA_BLOCK ints: * * int has_new: 1 if there are new vectors, 0 else * * int has_solution: 1 if there is a solution in the file, 0 else * * int n0: number of examples in class 0 * * int n1: number of examples in class 1 * * int n0f: total number of features in class 0 * * int n1f: total number of features in class 1 * * *PRINCIPLE COMPONENT: * * theta: the principle component written as a vector * * int fill: the amount of filler we leave to allow the principle component to * to grow without having to grow the file. * * void fill: a "hole" allowing the decision vector to grow in size in new * learns. * * double mudottheta: the decision point * *EXAMPLE VECTORS: * * Each new vector is formatted as a vector (ie we don't keep the matrix header * - this makes appending easy). * *The file is formatted this way to make the following actions quick both using * fread/fwrite and mmap/munmap: * * Finding if the file has a solution: requires a seek to has_solution and a * read of that value. * * Finding the principle if it exists: requires a sequential fread * of N_CONSTANTS_IN_PCA_BLOCK, a seek to DECISION BOUNDARY, reading in the * vector stored there. * * Querying if there are unlearned on vectors: requries a seek has_new and a * read of that value. * * Appending a vector: requires mapping in the file, reading in size and * has_new, updating has_new, and seeking to point size in the file. * if there is room, writes the vector there. else forcibly munmaps the * file and opens it for appending. creates a file if there isn't one. 
*****************************************************************************/ static void *pca_map_file(crm_pca_block *blck, char *filename) { struct stat statbuf; long act_size; void *addr, *last_addr, *st_addr; Vector *v; size_t size; int fill; if (stat(filename, &statbuf)) { nonfatalerror("Attempt to read from nonexistent pca file", filename); return NULL; } if (!blck) { //this really shouldn't happen fatalerror5("pca_map_file: bad crm_pca_block pointer.", "", CRM_ENGINE_HERE); return NULL; } crm_pca_block_init(blck); addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &act_size); if (addr == MAP_FAILED) { nonfatalerror("Attempt to map pca file failed. The file was", filename); return NULL; } st_addr = addr; if (act_size < sizeof(size_t) + PCA_FIRST_NBIT) { nonfatalerror ("Attempt to read from corrupted pca file. It is much too small.", ""); crm_munmap_file(st_addr); return NULL; } if (strncmp(PCA_FIRST_BITS, (char *)st_addr, strlen(PCA_FIRST_BITS))) { nonfatalerror ("Attempt to map from corrupted PCA file. The header is incorrect.", ""); crm_munmap_file(st_addr); return NULL; } addr += PCA_FIRST_NBIT; //this is where the data actually ends size = *((size_t*)addr); if (size > act_size) { //corrupted file nonfatalerror("Attempt to read from corrupted pca file. It thinks it has a larger length than it does. The file is", filename); crm_munmap_file(st_addr); return NULL; } addr += sizeof(size_t); last_addr = st_addr + size; //last address that contains good data if (size < N_CONSTANTS_IN_PCA_BLOCK*sizeof(int)) { //this is isn't a good file nonfatalerror("Attempt to read from corrupted pca file. It is somewhat too small.", filename); crm_munmap_file(st_addr); return NULL; } blck->has_new = *((int*)(addr)); //do we have unlearned-on examples? addr += sizeof(int); blck->has_solution = *((int*)(addr)); //do we have a solution? 
addr += sizeof(int); blck->n0 = *((int *)(addr)); //# learned-on examples in class 0 addr += sizeof(int); blck->n1 = *((int *)(addr)); //# learned-on examples in class 1 addr += sizeof(int); blck->n0f = *((int *)(addr)); //# features in class 0 addr += sizeof(int); blck->n1f = *((int *)(addr)); //# features in class 1 addr += sizeof(int); if (blck->has_solution) { blck->sol = (PCA_Solution *)malloc(sizeof(PCA_Solution)); if (!blck->sol) { nonfatalerror("Unable to malloc space for solution struct. Could this be a corrupted file?", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); crm_munmap_file(st_addr); } //read in the solution blck->sol->theta = vector_map(&addr, last_addr); //decision boundary if (addr + sizeof(int) > last_addr) { nonfatalerror ("Attempt to map from bad pca file. It can't fit its solution.", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); crm_munmap_file(st_addr); return NULL; } fill = *((int *)addr); //hole to grow pca addr += sizeof(int); if (!blck->sol->theta || addr + fill + sizeof(double) > last_addr) { nonfatalerror ("Attempt to map from bad pca file. It can't fit in the solution.", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); crm_munmap_file(st_addr); return NULL; } addr += fill; blck->sol->mudottheta = *((double *)addr); addr += sizeof(double); } else { fill = *((int *)addr); addr += sizeof(int); addr += fill + sizeof(double); } //example vectors! if (addr < last_addr) { v = vector_map(&addr, last_addr); if (v) { if (!blck->X) { blck->X = matr_make_size(0, v->dim, v->type, v->compact, v->size); } if (!blck->X) { nonfatalerror("Attempt to map from bad pca file. 
A very new vector had an unrecognized type.", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); crm_munmap_file(st_addr); return NULL; } matr_shallow_row_copy(blck->X, blck->X->rows, v); while (addr < last_addr) { v = vector_map(&addr, last_addr); if (v && v->dim) { matr_shallow_row_copy(blck->X, blck->X->rows, v); } else { if (v && !v->dim) { vector_free(v); } break; } } } } return st_addr; } //gets the integers (like n0, n1, etc) stored in the first few bytes //of the file without reading in the whole file. //puts them in blck static void pca_get_meta_data(char *filename, crm_pca_block *blck) { void *addr, *last_addr, *st_addr; struct stat statbuf; size_t size; long act_size; if (stat(filename, &statbuf)) { //heck, we don't even have a file! nonfatalerror ("You are trying to use a PCA to classify from the nonexistant file", filename); if (blck) { blck->has_new = 0; blck->has_solution = 0; blck->n0 = 0; blck->n1 = 0; blck->n0f = 0; blck->n1f = 0; } else { fatalerror5("pca_get_meta_data: bad crm_pca_block pointer.", "", CRM_ENGINE_HERE); } return; } if (!blck) { fatalerror5("pca_get_meta_data: bad crm_pca_block pointer.", "", CRM_ENGINE_HERE); return; } //just always do PROT_READ | PROT_WRITE so that if it's cached we get it addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &act_size); if (addr == MAP_FAILED || act_size < sizeof(size_t) + PCA_FIRST_NBIT) { fatalerror5("Could not map PCA file to get meta data. Something is very wrong and I doubt we can recover. The file is", filename, CRM_ENGINE_HERE); if (addr != MAP_FAILED) { crm_munmap_file(addr); } return; } st_addr = addr; if (strncmp(PCA_FIRST_BITS, (char *)addr, strlen(PCA_FIRST_BITS))) { nonfatalerror("This pca file is corrupted. 
The file is", filename); blck->has_new = 0; blck->has_solution = 0; blck->n0 = 0; blck->n1 = 0; blck->n0f = 0; blck->n1f = 0; crm_munmap_file(st_addr); return; } addr += PCA_FIRST_NBIT; size = *((size_t *)addr); //actual size (rest is garbage hole) last_addr = st_addr + size; if (size > act_size || addr + N_OFFSETS_IN_PCA_FILE*sizeof(size_t) + (N_CONSTANTS_IN_PCA_BLOCK + N_CONSTANTS_NOT_IN_BLOCK)*sizeof(int) > last_addr) { nonfatalerror("This pca file is corrupted. The file is", filename); blck->has_new = 0; blck->has_solution = 0; blck->n0 = 0; blck->n1 = 0; blck->n0f = 0; blck->n1f = 0; crm_munmap_file(st_addr); return; } addr += N_OFFSETS_IN_PCA_FILE*sizeof(size_t) + N_CONSTANTS_NOT_IN_BLOCK*sizeof(int); blck->has_new = *((int *)addr); //Are there un-learned on examples? addr += sizeof(int); blck->has_solution = *((int *)addr); //1 if there is a solution addr += sizeof(int); blck->n0 = *((int *)addr); //# examples in class 0 addr += sizeof(int); blck->n1 = *((int *)addr); //# examples in class 1 addr += sizeof(int); blck->n0f = *((int *)addr); //# features in class 0 addr += sizeof(int); blck->n1f = *((int *)addr); //# features in class 1 addr += sizeof(int); crm_munmap_file(st_addr); } //returns 1 if the file has vectors that have been appended but not yet //learned on //returns 0 else static int has_new_vectors(char *filename) { void *addr, *last_addr, *st_addr; size_t size; int *data, ret; struct stat statbuf; long act_size; if (stat(filename, &statbuf)) { //heck, we don't even have a file! return 0; } //this is PROT_WRITE because, if we read in a vector, we may flip //a bit telling us that the vector was mapped in - which tells us what parts //of the vector should be freed addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &act_size); if (addr == MAP_FAILED || act_size < sizeof(size_t) + PCA_FIRST_NBIT) { nonfatalerror("There was a problem mapping the pca file in while checking for new vectors. 
I am going to assume there are no new vectors. The file was", filename); if (addr != MAP_FAILED) { crm_munmap_file(addr); } return 0; } st_addr = addr; if (strncmp(PCA_FIRST_BITS, (char *)addr, strlen(PCA_FIRST_BITS))) { nonfatalerror("The PCA file is corrupted. I am going to assume it contains no new examples. The file is", filename); crm_munmap_file(st_addr); return 0; } addr += PCA_FIRST_NBIT; size = *((size_t *)addr); //actual amount of good data last_addr = st_addr + size; if (size > act_size || addr + N_OFFSETS_IN_PCA_FILE*sizeof(size_t) + (N_CONSTANTS_IN_PCA_BLOCK + N_CONSTANTS_NOT_IN_BLOCK)*sizeof(int) > last_addr) { nonfatalerror("There was a problem mapping the pca file in while checking for new vectors. I am going to assume there are no new vectors. The file was", filename); crm_munmap_file(st_addr); return 0; } addr += N_OFFSETS_IN_PCA_FILE*sizeof(size_t) + N_CONSTANTS_NOT_IN_BLOCK*sizeof(int); data = (int *)addr; ret = data[HAS_NEW_INDEX]; crm_munmap_file(st_addr); return ret; } //returns the decision boundary from an pca file //we map the decision boundary from the file so you must // FREE THE DECISION BOUNDARY returned by the function // MUNMAP THE FILE returned pass-by-reference in *addr static PCA_Solution *get_solution_from_pca_file(char *filename, void **st_addr) { PCA_Solution *sol; void *last_addr, *addr; size_t size; int *hs, fill; struct stat statbuf; long act_size; if (stat(filename, &statbuf)) { nonfatalerror("You are trying to read a decision boundary from a file that doesn't exist. The file is", filename); return NULL; } //this is PROT_WRITE because, if we read in theta, we may need to flip //a bit telling us that theta was mapped in - which tells us what parts //of theta should be freed addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &act_size); if (addr == MAP_FAILED || act_size < sizeof(size_t) + PCA_FIRST_NBIT) { nonfatalerror("Attempt to map pca file while getting decision boundary failed. 
The file was", filename); if (addr != MAP_FAILED) { crm_munmap_file(addr); } *st_addr = NULL; return NULL; } *st_addr = addr; if (strncmp(PCA_FIRST_BITS, (char *)addr, strlen(PCA_FIRST_BITS))) { nonfatalerror("Attempt to read decision boundary from a corrupt PCA file. The file was", filename); crm_munmap_file(*st_addr); *st_addr = NULL; return NULL; } addr += PCA_FIRST_NBIT; size = *((size_t *)addr); last_addr = *st_addr + size; if (size > act_size || addr + N_OFFSETS_IN_PCA_FILE*sizeof(size_t) + (N_CONSTANTS_NOT_IN_BLOCK+N_CONSTANTS_IN_PCA_BLOCK)*sizeof(int) > last_addr) { nonfatalerror("Attempt to map pca file while getting decision boundary failed. The file was", filename); crm_munmap_file(*st_addr); *st_addr = NULL; return NULL; } addr += N_OFFSETS_IN_PCA_FILE*sizeof(size_t) + N_CONSTANTS_NOT_IN_BLOCK*sizeof(int); hs = (int *)addr; addr += sizeof(int)*N_CONSTANTS_IN_PCA_BLOCK; if (addr > last_addr || !hs[HAS_SOLUTION_INDEX]) { nonfatalerror("You are trying to read a decision boundary from a file that doesn't contain a PCA solution or is corrupted. The file is", filename); crm_munmap_file(*st_addr); *st_addr = NULL; return NULL; } sol = (PCA_Solution *)malloc(sizeof(PCA_Solution)); sol->theta = vector_map(&addr, last_addr); if (addr +sizeof(int) > last_addr) { nonfatalerror("You are trying to read a decision boundary from a file that doesn't contain a PCA solution or is corrupted. The file is", filename); crm_munmap_file(*st_addr); *st_addr = NULL; pca_free_solution(sol); } fill = *((int *)addr); addr += sizeof(int); if (addr + fill +sizeof(double) > last_addr) { nonfatalerror("You are trying to read a decision boundary from a file that doesn't contain a PCA solution or is corrupted. The file is", filename); crm_munmap_file(*st_addr); *st_addr = NULL; pca_free_solution(sol); } addr += fill; sol->mudottheta = *((double *)addr); return sol; } //functions used to read in the file //when we need to do a learn in classify. 
static int pca_read_file(crm_pca_block *blck, char *filename) { FILE *fp = fopen(filename, "rb"); int ret; if (!fp) { nonfatalerror("Attempt to read from nonexistent pca file", filename); return 0; } ret = pca_read_file_fp(blck, fp); fclose(fp); return ret; } //reads a binary pca block from a file //returns 0 on failure static int pca_read_file_fp(crm_pca_block *blck, FILE *fp) { size_t amount_read, size; Vector *v; int fill; char firstbits[strlen(PCA_FIRST_BITS)]; if (!blck) { //this really shouldn't happen fatalerror5("read_pca_file_fp: bad crm_pca_block pointer.", "", CRM_ENGINE_HERE); return 0; } if (!fp) { nonfatalerror("Attempt to read pca from bad file pointer.", ""); return 0; } crm_pca_block_free_data(*blck); crm_pca_block_init(blck); amount_read = fread(firstbits, 1, PCA_FIRST_NBIT, fp); if (strncmp(PCA_FIRST_BITS, firstbits, strlen(PCA_FIRST_BITS))) { nonfatalerror("This pca file is corrupted. I cannot read it.", ""); return 0; } amount_read = fread(&size, sizeof(size_t), 1, fp); amount_read = fread(&(blck->has_new), sizeof(int), 1, fp); amount_read += fread(&(blck->has_solution), sizeof(int), 1, fp); amount_read += fread(&(blck->n0), sizeof(int), 1, fp); amount_read += fread(&(blck->n1), sizeof(int), 1, fp); amount_read += fread(&(blck->n0f), sizeof(int), 1, fp); amount_read += fread(&(blck->n1f), sizeof(int), 1, fp); if ((amount_read < N_CONSTANTS_IN_PCA_BLOCK) || ftell(fp) > size) { nonfatalerror("Attempt to read from bad pca file", ""); crm_pca_block_init(blck); return 0; } //read in solution if (blck->has_solution) { blck->sol = (PCA_Solution *)malloc(sizeof(PCA_Solution)); blck->sol->theta = vector_read_bin_fp(fp); amount_read = fread(&fill, sizeof(int), 1, fp); fseek(fp, fill, SEEK_CUR); if (!blck->sol->theta || !amount_read || feof(fp) || ftell(fp) > size) { //die! 
nonfatalerror("Attempt to read from bad pca file.", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); return 0; } amount_read = fread(&(blck->sol->mudottheta), sizeof(double), 1, fp); if (!amount_read) { //die! nonfatalerror("Attempt to read from bad pca file.", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); return 0; } } else { amount_read = fread(&fill, sizeof(int), 1, fp); fseek(fp, fill + sizeof(double), SEEK_CUR); if (!amount_read || feof(fp) || ftell(fp) >= size) { nonfatalerror("Attempt to read from bad SVM file.", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); return 0; } } //read in new vectors if (!feof(fp) && ftell(fp) < size) { v = vector_read_bin_fp(fp); if (v && v->dim) { if (!(blck->X)) { blck->X = matr_make_size(0, v->dim, v->type, v->compact, v->size); } if (!blck->X) { nonfatalerror("Attempt to map from bad pca file.", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); return 0; } matr_shallow_row_copy(blck->X, blck->X->rows, v); while (!feof(fp) && ftell(fp) < size) { v = vector_read_bin_fp(fp); if (v && v->dim) { matr_shallow_row_copy(blck->X, blck->X->rows, v); } else { if (v && !v->dim) { vector_free(v); } break; } } } else if (v) { vector_free(v); } } return 1; } static size_t pca_write_file(crm_pca_block *blck, char *filename) { //this is tricky because the file may be mmapped in //and we may want to be writing some of that back out //so we write it to a temporary file //then unmap the file //then rename the temporary file char tmpfilename[MAX_PATTERN]; FILE *fp; size_t size; int i, lim; //figure out what directory filename is in for (i = strlen(filename); i > 0; i--) { if (filename[i-1] == '/') { break; } } if (!i) { tmpfilename[0] = '.'; tmpfilename[1] = '/'; i = 2; } else { strncpy(tmpfilename, filename, i); } lim = i+6; for ( ; i < lim; i++) { tmpfilename[i] = 'X'; } tmpfilename[lim] = '\0'; //create a temporary file in that directory lim = mkstemp(tmpfilename); if (lim < 0) { if 
(pca_trace) { perror("Error opening temporary file"); } fatalerror5("Error opening a temporary file. Your directory may be too full or some other problem, but this will really mess things up.\n", "", CRM_ENGINE_HERE); return 0; } else { close(lim); } fp = fopen(tmpfilename, "wb"); if (!fp) { fatalerror5("Error opening a temporary file. Your directory may be too full or some other problem, but this will really mess things up.\n", "", CRM_ENGINE_HERE); return 0; } size = pca_write_file_fp(blck, fp); fclose(fp); //do the unmap AFTER since blck probably has memory somewhere in that mmap crm_force_munmap_filename(filename); //delete the old file if (unlink(filename)) { if (pca_trace) { perror("Error deleting out-of-date pca file"); } unlink(tmpfilename); return 0; } //now rename our temporary file to be the old file if (rename(tmpfilename, filename)) { if (pca_trace) { perror("Error renaming temporary file"); } unlink(tmpfilename); fatalerror5("Could not copy from the temporary file to the new pca file. Perhaps you don't have write permissions? Whatever is going on, we are unlikely to be able to recover from it.", "", CRM_ENGINE_HERE); return 0; } return size; } //writes an pca block to a file in binary format //returns the number of bytes written //doesn't munmap the file since it doesn't have a file name!! 
//frees blck static size_t pca_write_file_fp(crm_pca_block *blck, FILE *fp) { size_t size = MAX_INT_VAL, unused; int i; Matrix *M = matr_make(0, 0, SPARSE_ARRAY, MATR_COMPACT); void *hole; double d; if (!blck) { fatalerror5("pca_write_file: attempt to write NULL block.", "", CRM_ENGINE_HERE); return 0; } if (!fp) { nonfatalerror("Trying to write a pca file to a null file pointer.", ""); return 0; } if (blck->sol && blck->sol->theta) { blck->has_solution = 1; } else { blck->has_solution = 0; } size = sizeof(char)*fwrite(PCA_FIRST_BITS, 1, PCA_FIRST_NBIT, fp); size += sizeof(size_t)*fwrite(&size, sizeof(size_t), 1, fp); size += sizeof(int)*fwrite(&(blck->has_new), sizeof(int), 1, fp); size += sizeof(int)*fwrite(&(blck->has_solution), sizeof(int), 1, fp); size += sizeof(int)*fwrite(&(blck->n0), sizeof(int), 1, fp); size += sizeof(int)*fwrite(&(blck->n1), sizeof(int), 1, fp); size += sizeof(int)*fwrite(&(blck->n0f), sizeof(int), 1, fp); size += sizeof(int)*fwrite(&(blck->n1f), sizeof(int), 1, fp); //write the principle component and the fill //write the principle component dot the mean vector if (blck->sol) { size += pca_write_theta(blck->sol->theta, fp); size += sizeof(double)*fwrite(&(blck->sol->mudottheta), sizeof(double), 1, fp); } else { //leave room size += pca_write_theta(NULL, fp); d = 0.0; size += sizeof(double)*fwrite(&d, sizeof(double), 1, fp); } //now write out the example vectors if (blck->X) { for (i = 0; i < blck->X->rows; i++) { if (blck->X->data[i]) { size += vector_write_bin_fp(blck->X->data[i], fp); } } } //this tells you where the data in the file ends fseek(fp, PCA_FIRST_NBIT, SEEK_SET); //this tells you the offset to appended vectors //so you can check if there *are* new vectors quickly unused = fwrite(&size, sizeof(size_t), 1, fp); //now leave a nice big hole //so we can add lots of nice vectors //without changing the file size if (PCA_HOLE_FRAC > 0) { fseek(fp, 0, SEEK_END); hole = malloc((int)(PCA_HOLE_FRAC*size)); size += fwrite(hole, 1, 
(int)(PCA_HOLE_FRAC*size), fp); free(hole); } matr_free(M); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); return size; } //writes theta to a file, leaving it room to grow static size_t pca_write_theta(Vector *theta, FILE *fp) { int dec_size = MATR_DEFAULT_VECTOR_SIZE*sizeof(double); size_t size = 0, theta_written, theta_size; void *filler = NULL; if (!fp) { if (pca_trace) { fprintf(stderr, "pca_write_theta: null file pointer.\n"); } return 0; } if (theta) { theta_size = vector_size(theta); while (theta_size >= dec_size) { if (!(dec_size)) { dec_size = 1; } dec_size *= 2; } theta_written = vector_write_bin_fp(theta, fp); } else { theta_written = 0; } size += theta_written; dec_size -= theta_written; if (dec_size > 0) { filler = malloc(dec_size); } else { dec_size = 0; } size += sizeof(int)*fwrite(&dec_size, sizeof(int), 1, fp); if (filler) { size += fwrite(filler, 1, dec_size, fp); free(filler); } return size; } //appends a vector to the pca file to be learned on later without //reading in the whole file //frees the vector static size_t append_vector_to_pca_file(Vector *v, char *filename) { FILE *fp; crm_pca_block blck; int exists = 0; long size; size_t data_ends, vsize; int ret; void *addr, *last_addr, *new_addr, *st_addr; struct stat statbuf; if (!v) { nonfatalerror("Something is wrong with the new input. I think it is NULL. I am not trying to append it.", ""); return 0; } //do we have space to write this vector without forcing an unmap? if (!stat(filename, &statbuf)) { if (statbuf.st_size > 0) { exists = 1; addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &size); if (addr == MAP_FAILED || size < sizeof(size_t) + sizeof(int) + PCA_FIRST_NBIT) { vector_free(v); fatalerror5("Unable to map PCA file in order to append a vector. Something is very wrong and we are unlikely to be able to recover. 
The file is", filename, CRM_ENGINE_HERE); return 0; } st_addr = addr; last_addr = st_addr+size; if (strncmp(PCA_FIRST_BITS, (char *)addr, strlen(PCA_FIRST_BITS))) { nonfatalerror("I think this PCA file is corrupted. You may want to stop now and rerun this test with an uncorrupted file. For now, I'm not going to touch it. The file is", filename); crm_munmap_file(st_addr); vector_free(v); return 0; } addr += PCA_FIRST_NBIT; data_ends = *((size_t *)addr); vsize = vector_size(v); //no matter what, the data now ends here //it's important to mark that if (data_ends <= size) { *((size_t *)addr) = data_ends + vsize; } else { *((size_t *)addr) = size + vsize; } addr += sizeof(size_t); //now note that we have new vectors that haven't been learned on *((int *)addr) = 1; if (data_ends < size && st_addr + data_ends + vsize <= last_addr) { //we have room to write the vector //so add it new_addr = vector_memmove(st_addr + data_ends, v); vector_free(v); crm_munmap_file(st_addr); return vsize; } //we don't have room to write the vector //get rid of the hole crm_munmap_file(st_addr); if (data_ends < size) { ret = truncate(filename, data_ends); } else if (data_ends > size) { nonfatalerror("I think this PCA file is corrupted. You may want to stop now and rerun this test with an uncorrupted file. For now, I'm not going to touch it. The file is", filename); vector_free(v); return 0; } } } if (!exists) { if (pca_trace) { fprintf(stderr, "Creating new stat file.\n"); } //the file doesn't exist yet //we'll create it! //note that leaving this as open for appending instead //of writing creates problems. i'm not sure why. 
fp = fopen(filename, "wb"); crm_pca_block_init(&blck); blck.has_new = 1; blck.X = matr_make_size(1, v->dim, v->type, v->compact, v->size); if (!blck.X) { nonfatalerror("Attempt to append bad vector to PCA file.", ""); fclose(fp); return 0; } matr_shallow_row_copy(blck.X, 0, v); size = pca_write_file_fp(&blck, fp); fclose(fp); return size; } //force an unmap if it is mapped //append this vector to the file crm_force_munmap_filename(filename); fp = fopen(filename, "ab"); size = vector_write_bin_fp(v, fp); vector_free(v); if (PCA_HOLE_FRAC > 0) { if (pca_trace) { fprintf(stderr, "Appending hole of size %d to file.\n", (int)(PCA_HOLE_FRAC*statbuf.st_size)); } new_addr = malloc((int)(PCA_HOLE_FRAC*statbuf.st_size)); size += fwrite(new_addr, 1, (int)(PCA_HOLE_FRAC*statbuf.st_size), fp); free(new_addr); } fclose(fp); return size; } //this function writes the changes that have been made to blck //to disk //if addr is NULL, it will fwrite blck to filename //if blck was mapped in, it will attempt to write things back into //memory and //if this isn't possible it will force a fwrite the file //this frees all data associated with blck static size_t crm_pca_save_changes(crm_pca_block *blck, void *addr, char *filename) { size_t theta_room, theta_req, size; void *curr = addr, *prev, *last_addr; crm_pca_block old_block; struct stat statbuf; if (!addr) { nonfatalerror("Attempting to save a file to a NULL address. Probably the original file was corrupted and couldn't be read. The file is", filename); return 0; } if (stat(filename, &statbuf)) { //ok this is really wrong fatalerror5("pca save changes: the file you are trying to save to doesn't exist. This is unrecoverable. 
The file is", filename, CRM_ENGINE_HERE); return 0; } if (statbuf.st_size < sizeof(size_t) + PCA_FIRST_NBIT) { if (pca_trace) { fprintf(stderr, "Writing file because it is waaaay too small.\n"); } return pca_write_file(blck, filename); } if (strncmp(PCA_FIRST_BITS, (char *)addr, strlen(PCA_FIRST_BITS))) { nonfatalerror("The magic string of the file I am trying to save isn't what I think it should be. This probably indicates that the file is corrupted and I shouldn't touch it so I won't. The file is", filename); return 0; } curr += PCA_FIRST_NBIT; size = *((size_t *)curr); curr += sizeof(size_t); if (size + sizeof(double)*MATR_DEFAULT_VECTOR_SIZE >= statbuf.st_size) { //we have no more room to append vectors to this file //so write it out now //otherwise size won't change if (pca_trace) { fprintf(stderr, "Writing file to leave a hole at the end.\n"); } return pca_write_file(blck, filename); } last_addr = addr + size; //make all of the constants correct if (blck->sol && blck->sol->theta) { blck->has_solution = 1; } else { blck->has_solution = 0; } old_block.has_new = *((int *)curr); *((int *)curr) = blck->has_new; curr += sizeof(int); old_block.has_solution = *((int *)curr); *((int *)curr) = blck->has_solution; curr += sizeof(int); old_block.n0 = *((int *)curr); *((int *)curr) = blck->n0; curr += sizeof(int); old_block.n1 = *((int *)curr); *((int *)curr) = blck->n1; curr += sizeof(int); old_block.n0f = *((int *)curr); *((int *)curr) = blck->n0f; curr += sizeof(int); old_block.n1f = *((int *)curr); *((int *)curr) = blck->n1f; curr += sizeof(int); //keep where theta starts prev = curr; //this is how much room for theta if (old_block.has_solution) { theta_room = vector_size((Vector *)curr); } else { theta_room = 0; } curr += theta_room; theta_room += *((int *)curr); curr = prev; //how much room will theta actually take? 
if (blck->has_solution && blck->sol && blck->sol->theta) { theta_req = vector_size(blck->sol->theta); } else { theta_req = 0; } if (curr + theta_room > last_addr || theta_room < theta_req) { //we don't have enough room in the file to write //the decision boundary //so we need to use fwrite if (pca_trace) { fprintf (stderr, "Writing file to grow PC size from %lu to %lu.\n", theta_room, theta_req); } return pca_write_file(blck, filename); } //we have enough room to unmap the solution to this file //let's do it! //write the new solution boundary if (blck->has_solution && blck->sol) { //copy over the decision boundary //it is possible that curr and blck->sol->theta //overlap if we didn't actually do a learn //so use memmove NOT memcpy prev = vector_memmove(curr, blck->sol->theta); } //leave a marker to let us know how much filler space we have *((int *)prev) = theta_room-theta_req; curr += theta_room + sizeof(int); if (blck->has_solution && blck->sol) { *((double *)curr) = blck->sol->mudottheta; } curr += sizeof(double); //and that's all folks! 
crm_pca_block_free_data(*blck); crm_pca_block_init(blck); crm_munmap_file(addr); return size; } /***************************PCA BLOCK FUNCTIONS*******************************/ //initializes an pca block static void crm_pca_block_init(crm_pca_block *blck) { blck->sol = NULL; blck->X = NULL; blck->has_new = 0; blck->has_solution = 0; blck->n0 = 0; blck->n1 = 0; blck->n0f = 0; blck->n1f = 0; } //frees all data associated with a block static void crm_pca_block_free_data(crm_pca_block blck) { if (blck.sol) { pca_free_solution(blck.sol); } if (blck.X) { matr_free(blck.X); } } /***************************LEARNING FUNCTIONS********************************/ //does the actual work of learning new examples static void crm_pca_learn_new_examples(crm_pca_block *blck, int microgroom) { int i, inc, loop_it, pinc, ninc, sgn, lim; VectorIterator vit; Vector *row; double frac_inc, d, val, offset, back, front, ratio, last_offset, *dt; PreciseSparseElement *thetaval = NULL; if (!blck->X) { nonfatalerror ("There are no examples for a PCA to learn on in the file you have supplied. Note that supplying a non-empty but incorrectly formatted file can cause this warning.", ""); //reset the block crm_pca_block_free_data(*blck); crm_pca_block_init(blck); return; } //update n0, n1, n0f, n1f blck->n0 = 0; blck->n1 = 0; blck->n0f = 0; blck->n1f = 0; for (i = 0; i < blck->X->rows; i++) { row = matr_get_row(blck->X, i); if (!row) { //this would be weird continue; } vectorit_set_at_beg(&vit, row); if (!vectorit_past_end(vit, row)) { if (vectorit_curr_val(vit, row) < 0) { //a new example for class 1 blck->n1++; blck->n1f += row->nz; if (PCA_CLASS_MAG) { blck->n1f--; } } else { blck->n0++; blck->n0f += row->nz; if (PCA_CLASS_MAG) { blck->n0f--; } } } } dt = (double *)malloc(sizeof(double)*blck->X->rows); //actually learn something! 
if (pca_trace) { fprintf(stderr, "Calling PCA solve.\n"); } frac_inc = PCA_REDO_FRAC + 1; loop_it = 0; inc = 0; pinc = 0; ninc = 0; while (frac_inc >= PCA_REDO_FRAC && loop_it < PCA_MAX_REDOS) { pca_solve((blck->X), &(blck->sol)); if (!blck->sol) { nonfatalerror("Unable to solve PCA. This is likely due to a corrupted PCA statistics file.", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); free(dt); return; } inc = 0; pinc = 0; ninc = 0; if (PCA_CLASS_MAG && blck->n0 > 0 && blck->n1 > 0) { //check to see if we have class mag set high enough //it's set high enough if we correctly classify everything //(around a 0 decision point) USING the first element d = vector_get(blck->sol->theta, 0); if (d < 0) { vector_multiply(blck->sol->theta, -1, blck->sol->theta); blck->sol->mudottheta *= -1; } for (i = 0; i < blck->X->rows; i++) { dt[i] = dot(matr_get_row(blck->X, i), blck->sol->theta); val = matr_get(blck->X, i, 0); if (dt[i]*val <= 0) { inc++; } //get rid of the influence of the first element //now we can use this dt[i] to find the correct //decision point later dt[i] -= vector_get(blck->sol->theta, 0)*val; if (dt[i] <= 0 && val > 0) { pinc++; } else if (dt[i] >= 0 && val < 0) { ninc++; } } } frac_inc = inc/(double)blck->X->rows; if (frac_inc >= PCA_REDO_FRAC) { for (i = 0; i < blck->X->rows; i++) { matr_set(blck->X, i, 0, 2*matr_get(blck->X, i, 0)); } pca_free_solution(blck->sol); blck->sol = NULL; if (pca_trace) { fprintf(stderr, "The fraction of wrong classifications was %lf. Repeating with class mag = %lf\n", frac_inc, matr_get(blck->X, 0, 0)); } loop_it++; } } if (!blck->sol) { nonfatalerror("Unable to solve PCA. 
This is likely due to a corrupted PCA statistics file.", ""); crm_pca_block_free_data(*blck); crm_pca_block_init(blck); free(dt); return; } offset = 0; if (PCA_CLASS_MAG) { if (loop_it) { //we increased the class mags - set them back to the initial value for (i = 0; i < blck->X->rows; i++) { d = matr_get(blck->X, i, 0); if (d > 0) { matr_set(blck->X, i, 0, PCA_CLASS_MAG); } else { matr_set(blck->X, i, 0, -PCA_CLASS_MAG); } } } //calculate decision point //if number of negative examples = number of positive examples, //this point is xbardotp. however, it turns out that if there //is a skewed distribution, the point moves. i feel like there //should be a theoretical way of knowing where this point is since //we know it for a non-skewed distribution, but i can't seem to find //it. so... we do this as a binary search - we are trying to make the //number of positive and negative mistakes the same //figure out initial direction (n n1 > 0) { ratio = blck->n0/(double)blck->n1;//ratio of positive examples to negative } else { ratio = 0; } inc = ninc+pinc; if ((int)(ratio*ninc + 0.5) < pinc) { //we are getting more positive examples wrong than negative //ones - we should decrease the offset sgn = -1; } else { sgn = 1; } offset = 0; //one point of the binary search is zero - we need the other //far point. 
just go out in big jumps until we find it while ((sgn < 0 && (int)(ratio*ninc + 0.5) < pinc) || (sgn > 0 && (int)(ratio*ninc + 0.5) > pinc)) { offset += sgn; ninc = 0; pinc = 0; for (i = 0; i < blck->X->rows; i++) { val = matr_get(blck->X, i, 0); if ((dt[i] - offset)*val <= 0) { if (val < 0) { ninc++; } else { pinc++; } } } } //now do the search //our boundaries on the binary search are 0 and offset if (offset > 0) { front = 0; back = offset; } else { front = offset; back = 0; } last_offset = offset + 1; while ((int)(ratio*ninc + 0.5) != pinc && front < back && last_offset != offset) { last_offset = offset; offset = (front + back)/2.0; ninc = 0; pinc = 0; for (i = 0; i < blck->X->rows; i++) { val = matr_get(blck->X, i, 0); if ((dt[i] - offset)*val <= 0) { if (val < 0) { ninc++; } else { pinc++; } } } if ((int)(ratio*ninc + 0.5) < pinc) { //offset should get smaller //ie back should move closer to front back = offset; } else if ((int)(ratio*ninc + 0.5) > pinc) { front = offset; } if (pca_trace) { fprintf(stderr, "searching for decision point: current point = %lf pinc = %d ninc = %d ratio = %lf\n", offset, pinc, ninc, ratio); } } inc = pinc+ninc; //offset is now the decision point blck->sol->mudottheta = offset; if (pca_trace) { fprintf(stderr, "found decision point: %lf pinc = %d ninc = %d ratio = %lf\n", offset, pinc, ninc, ratio); } } if (pca_trace) { fprintf(stderr, "Of %d examples, we classified %d incorrectly.\n", blck->X->rows, inc); } //microgroom if (microgroom && blck->X->rows >= PCA_GROOM_OLD) { if (pca_trace) { fprintf(stderr, "Microgrooming...\n"); } thetaval = (PreciseSparseElement *) malloc(sizeof(PreciseSparseElement)*blck->X->rows); for (i = 0; i < blck->X->rows; i++) { thetaval[i].data = dt[i] - offset; if (matr_get(blck->X, i, 0) < 0) { thetaval[i].data = -thetaval[i].data; } thetaval[i].col = i; } //sort based on the value qsort(thetaval, blck->X->rows, sizeof(PreciseSparseElement), precise_sparse_element_val_compare); //get rid of the top 
PCA_GROOM_FRAC qsort(&(thetaval[(int)(blck->X->rows*PCA_GROOM_FRAC)]), blck->X->rows - (int)(blck->X->rows*PCA_GROOM_FRAC), sizeof(PreciseSparseElement), precise_sparse_element_col_compare); lim = blck->X->rows; for (i = (int)(blck->X->rows*PCA_GROOM_FRAC); i < lim; i++) { matr_remove_row(blck->X, thetaval[i].col); } free(thetaval); } free(dt); //we've learned all new examples blck->has_new = 0; //we've solved it! so we have a solution blck->has_solution = 1; } /****************************************************************************** *Use a PCA to learn a classification task. *This expects two classes: a class with a +1 label and a class with *a -1 label. These are denoted by the presence or absense of the *CRM_REFUTE label (see the FLAGS section of the comment). *For an overview of how the algorithm works, look at the comments in *crm_pca_lib_fncts.c. * *INPUT: This function is for use with CRM 114 so it takes the * canonical arguments: * csl: The control block. Never actually used. * apb: The argparse block. This is passed to vector_tokenize_selector * and I use the flags (see the FLAG section). * txtptr: A pointer to the text to classify. * txtstart: The text to classify starts at txtptr+txtstart * txtlen: number of characters to classify * *OUTPUT: 0 on success * *FLAGS: The PCA calls crm_vector_tokenize_selector so uses any flags * that that function uses. For learning, it interprets flags as * follows: * * CRM_REFUTE: If present, this indicates that this text has a -1 * label and should be classified as such. If absent, indicates * that this text has a +1 label. * * CRM_UNIQUE: If present, CRM_UNIQUE indicates that we should ignore * the number of times we see a feature. With CRM_UNIQUE, feature * vectors are binary - a 1 in a column indicates that a feature * with that column number was seen once or more. Without it, features * are integer valued - a number in a column indicates the number of * times that feature was seen in the document. 
* * CRM_MICROGROOM: If there are more than PCA_GROOM_OLD (defined in * (crm114_config.h) examples, CRM_MICROGROOM will remove the PCA_GROOM_FRAC * (defined in crm11_config.h) of them furthest from the decision * boundary. CRM_MICROGROOM ONLY runs AFTER an actual learn - ie * we will never microgroom during an APPEND. In fact, PASSING IN * MICROGROOM WITH APPEND DOES NOTHING. Also note that the effects * of microgrooming are not obvious until the next time the file is * written using fwrite. This will actually happen the next time enough * vectors are added * * CRM_APPEND: The example will be added to the set of examples but * not yet learned on. We will learn on this example the next time * a learn without APPEND or ERASE is called or if classify is called. * If you call learn with CRM_APPEND and actual learn will NEVER happen. * All calls to learn with CRM_APPEND will execute very quickly. * * CRM_FROMSTART: Relearn on every seen (and not microgroomed away) example * instead of using an incremental method. If CRM_FROMSTART and * CRM_APPEND are both flagged, the FROMSTART learn will be done the * next time there is a learn without APPEND or ERASE or a classify. If * examples are passed in using CRM_APPEND after CRM_FROMSTART, we will * also learn those examples whenever we do the FROMSTART learn. * * CRM_ERASE: Erases the example from the example set. If this * example was just appended and never learned on or if it is not * in the support vector set of the last solution, this simply erases * the example from the set of examples. If the example is a support * vector, we relearn everything from the start including any new examples * that were passed in using CRM_APPEND and haven't been learned on. If * CRM_ERASE and CRM_APPEND are passed in together and a relearn is required, * the relearn is done the next time learn is called without APPEND or ERASE * or a classify is called. * * ALL FLAGS NOT LISTED HERE OR USED IN THE VECTOR_TOKENIZER ARE IGNORED. 
* *WHEN WE LEARN: * * The various flags can seem to interact bizarrely to govern whether a * learn actually happens, but, in fact, everything follows three basic rules: * * 1) WE NEVER LEARN ON CRM_APPEND. * 2) IF WE LEARN, WE LEARN ON ALL EXAMPLES PRESENT. * 3) WHEN ERASING, WE DO EXACTLY AS MUCH WORK IS REQUIRED TO ERASE THE * EXAMPLE AND NO MORE EXCEPT WHERE THIS CONFLICTS WITH THE FIRST 2 RULES. * * Therefore, rule 2 says that a FROMSTART, for example, will learn on both * old and new examples. Likewise rule 2 states that an ERASE that requires * a relearn, will learn on both old and new examples. An ERASE that DOESN'T * require a relearn, however, is governed by rule 3 and therefore * will NOT run a learn on new examples because that is NOT necessary to * erase the example. Rule 1 ensures that passing in CRM_MICROGROOM with * CRM_APPEND does nothing because we only MICROGROOM after a learn and we * NEVER learn on CRM_APPEND. Etc. *****************************************************************************/ int crm_pca_learn(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) { char htext[MAX_PATTERN], filename[MAX_PATTERN]; long i, j; unsigned int features[MAX_PCA_FEATURES]; crm_pca_block blck; size_t unused; Vector *nex, *row; int read_file = 0, do_learn = 1, lim = 0; void *addr = NULL; if (user_trace) { pca_trace = 1; } if (internal_trace) { //this is a "mediumly verbose" setting pca_trace = PCA_INTERNAL_TRACE_LEVEL + 1; } PCA_DEBUG_MODE = pca_trace - 1; if (PCA_DEBUG_MODE < 0) { PCA_DEBUG_MODE = 0; } if (pca_trace) { fprintf(stderr, "Doing a PCA learn.\n"); } //Get the filename //crm_stmt_parser.c crm_get_pgm_arg(htext, MAX_PATTERN, apb->p1start, apb->p1len); crm_nexpandvar(htext, apb->p1len, MAX_PATTERN); i = 0; while (htext[i] < 0x021) i++; j = i; while (htext[j] >= 0x021) j++; htext[j] = '\000'; strcpy (filename, &htext[i]); //set things to NULL that should be null crm_pca_block_init(&blck); if (txtptr && txtlen > 0) { //get 
the new example nex = convert_document(txtptr+txtstart, txtlen, features, apb); if (apb->sflags & CRM_ERASE) { //find this example and remove all instances of it //then do a FROMSTART unless we haven't learned on this //example yet //requires reading in the whole file //load our stat file in if (!(addr = pca_map_file(&blck, filename))) { nonfatalerror("An error occurred trying to map in the file. Likely it is corrupted. Your vector will not be erased. The file is", filename); } else { read_file = 1; } do_learn = 0; //we are erasing, not learning j = 0; lim = blck.X->rows; for (i = 0; i < lim; i++) { row = matr_get_row(blck.X, i-j); if (!row) { continue; } if (vector_equals(nex, row)) { //have to start over do_learn = 1; if (!(apb->sflags & CRM_FROMSTART)) { apb->sflags = apb->sflags | CRM_FROMSTART; } matr_remove_row(blck.X, i-j); j++; if (vector_get(nex, 0) < 0) { blck.n1--; blck.n1f -= nex->nz; if (PCA_CLASS_MAG) { blck.n1f++; } } else { blck.n0--; blck.n0f -= nex->nz; if (PCA_CLASS_MAG) { blck.n0f++; } } } } vector_free(nex); } else { //add the vector to the new matrix append_vector_to_pca_file(nex, filename); } } if (apb->sflags & CRM_FROMSTART) { do_learn = 1; if (!read_file) { if (!(addr = pca_map_file(&blck, filename))) { nonfatalerror("An error occurred trying to map in the file. Likely it is corrupted. The fromstart learn will have no effect. The file is", filename); } else { read_file = 1; } } //get rid of the old solution pca_free_solution(blck.sol); blck.sol = NULL; //reset the constants blck.n0 = 0; blck.n1 = 0; blck.n0f = 0; blck.n1f = 0; } if (!(apb->sflags & CRM_APPEND) && do_learn) { if (!read_file) { if (!(addr = pca_map_file(&blck, filename))) { nonfatalerror("An error occurred trying to map in the file. Likely it is corrupted. Your learn will have no effect. 
The file is", filename); do_learn = 0; } else { read_file = 1; } } if (do_learn) { crm_pca_learn_new_examples(&blck, apb->sflags & CRM_MICROGROOM); } } if (read_file) { //we did something to it! //save it unused = crm_pca_save_changes(&blck, addr, filename); } //free everything crm_pca_block_free_data(blck); return 0; } /****************************CLASSIFICATION FUNCTIONS*************************/ /****************************************************************************** *Use a PCA for a classification task. *This expects two classes: a class with a +1 label and a class with *a -1 label. The class with the +1 label is class 0 and the class *with the -1 label is class 1. When learning, class 1 is denoted by *passing in the CRM_REFUTE flag. The classify is considered to FAIL *if the example classifies as class 1 (-1 label). The PCA requires *at least one example to do any classification, although really you should *give it at least one from each class. If classify is called without any *examples to learn on at all, it will classify the example as class 0, but *it will also print out an error. * *If classify is called and there are new examples that haven't been learned *on or a FROMSTART learn that hasn't been done, this function will do that *BEFORE classifying. in other words: * *CLASSIFY WILL DO A LEARN BEFORE CLASSIFYING IF NECESSARY. IT WILL NOT STORE *THAT LEARN BECAUSE IT HAS NO WRITE PRIVILEGES TO THE FILE. * *INPUT: This function is for use with CRM 114 so it takes the * canonical arguments: * csl: The control block. Used to skip if classify fails. * apb: The argparse block. This is passed to vector_tokenize_selector * and I use the flags (see the FLAG section). * txtptr: A pointer to the text to classify. 
* txtstart: The text to classify starts at txtptr+txtstart * txtlen: number of characters to classify * *OUTPUT: return is 0 on success * The text output (stored in out_var) is formatted as follows: * * LINE 1: CLASSIFY succeeds/fails success probability: # pR: # * (note that success probability is high for success and low for failure) * LINE 2: Best match to class #0/1 probability: # pR: # * (probability >= 0.5 since this is the best matching class.) * LINE 3: Total features in input file: # * LINE 4: #0 (label +1): documents: #, features: #, prob: #, pR # * LINE 5: #1 (label -1): documents: #, features: #, prob: #, pR # * (prob is high for match class, low else. pR is positive for match class.) * * I've picked a random method for calculating probability and pR. Thinking * about it, there may be literature for figuring out the probability at * least. Anyone who wants to do that, be my guest. For now, I've found * a function that stays between 0 and 1 and called it good. Specifically, * if theta is the decision boundary and x is the example to classify: * * prob(class = 0) = 0.5 + 0.5*tanh(theta dot x - mudottheta) * pR = (theta dot x - mudottheta)*10 * *FLAGS: The PCA calls crm_vector_tokenize_selector so uses any flags * that that function uses. For classifying, it interprets flags as * follows: * * CRM_UNIQUE: If present, CRM_UNIQUE indicates that we should ignore * the number of times we see a feature. With CRM_UNIQUE, feature * vectors are binary - a 1 in a column indicates that a feature * with that column number was seen once or more. Without it, features * are integer valued - a number in a column indicates the number of * times that feature was seen in the document. If you used CRM_UNIQUE * to learn, use CRM_UNIQUE to classify! (duh) * * CRM_MICROGROOM: If classify does a learn, it will MICROGROOM. See the * comment to learn for how microgroom works. * * ALL FLAGS NOT LISTED HERE OR USED IN THE VECTOR_TOKENIZER ARE IGNORED. 
* INCLUDING FLAGS USED FOR LEARN! *****************************************************************************/ int crm_pca_classify(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) { char htext[MAX_PATTERN], filename[MAX_PATTERN], out_var[MAX_PATTERN]; long i, j, out_var_len = 0; unsigned int features[MAX_PCA_FEATURES], out_pos = 0; Vector *nex; double dottheta = 0; int class, sgn, nz; crm_pca_block blck; void *addr = NULL; if (user_trace) { pca_trace = 1; } if (internal_trace) { //this is a "mediumly verbose" setting pca_trace = PCA_INTERNAL_TRACE_LEVEL + 1; } PCA_DEBUG_MODE = pca_trace - 1; if (PCA_DEBUG_MODE < 0) { PCA_DEBUG_MODE = 0; } if (pca_trace) { fprintf(stderr, "Doing a PCA classify.\n"); } crm_pca_block_init(&blck); //Get the filename (we only have one) //crm_stmt_parser.c crm_get_pgm_arg(htext, MAX_PATTERN, apb->p1start, apb->p1len); crm_nexpandvar(htext, apb->p1len, MAX_PATTERN); i = 0; while (htext[i] < 0x021) i++; j = i; while (htext[j] >= 0x021) j++; htext[j] = '\000'; strcpy (filename, &htext[i]); //Get the output variable name if (apb->p2start) { crm_get_pgm_arg(out_var, MAX_PATTERN, apb->p2start, apb->p2len); out_var_len = crm_nexpandvar(out_var, apb->p2len, MAX_PATTERN); } //do we have new vectors to learn on? if (has_new_vectors(filename)) { //we use read so that we don't make changes to the file if (pca_read_file(&blck, filename)) { crm_pca_learn_new_examples(&blck, 0); } } else { pca_get_meta_data(filename, &blck); blck.sol = get_solution_from_pca_file(filename, &addr); } //get the new example nex = convert_document(txtptr+txtstart, txtlen, features, apb); if (PCA_CLASS_MAG) { //we're classifying. we don't want a class mag running around. 
vector_set(nex, 0, 0); } //classify it if (blck.sol && blck.sol->theta) { //this is biased towards the negative not sure why dottheta = dot(nex, blck.sol->theta) - blck.sol->mudottheta; crm_pca_block_free_data(blck); } else { nonfatalerror ("Nothing was learned before asking PCA for a classification. I am trying to classify from the file", filename); dottheta = 0; } if (addr) { crm_munmap_file(addr); } if (dottheta < 0) { class = 1; sgn = -1; } else { class = 0; sgn = 1; } if (fabs(dottheta) > 100000) { nonfatalerror("The pR values here are HUGE. One fix for this is to redo things with the unique flag set. This is especially true if you are also using the string flag.", ""); dottheta = sgn*9999.9; } if (pca_trace) { fprintf (stderr, "The dot product of the decision boundary and the example is %lf\n", dottheta); } if (apb->p2start) { //annnnnnd... write it all back out if (!class) { //these are very arbitrary units of measurement //i picked tanh because... it's a function with a middle at 0 //and nice asymptotic properties near 1 //yay! 
out_pos += sprintf (outbuf + out_pos, "CLASSIFY succeeds"); } else { out_pos += sprintf(outbuf + out_pos, "CLASSIFY fails"); } out_pos += sprintf(outbuf + out_pos, " success probability: %f pR: %6.4f\n", 0.5 + 0.5*tanh(dottheta), dottheta*1000.0); //note: this next pR is always positive (or zero) out_pos += sprintf(outbuf + out_pos, "Best match to class #%d prob: %6.4f pR: %6.4f \n", class, 0.5 + 0.5*tanh(fabs(dottheta)), sgn*dottheta*1000.0); nz = nex->nz; if (PCA_CLASS_MAG) { nz--; } out_pos += sprintf(outbuf + out_pos, "Total features in input file: %d\n", nz); //these following pR's always have opposite signs from each other out_pos += sprintf (outbuf + out_pos, "#0 (label +1): documents: %d, features: %d, prob: %3.2e, pR: %6.2f\n", blck.n0, blck.n0f, 0.5 + 0.5*tanh(dottheta), dottheta*1000.0); out_pos += sprintf (outbuf + out_pos, "#1 (label -1): documents: %d, features: %d, prob: %3.2e, pR: %6.2f\n", blck.n1, blck.n1f, 0.5 - 0.5*tanh(dottheta), -1*dottheta*1000.0); if (out_var_len) { crm_destructive_alter_nvariable(out_var, out_var_len, outbuf, out_pos); } } vector_free(nex); if (class) { //classifies out-of-class csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk[csl->mct[csl->cstmt]->nest_level] = -1; return 0; } return 0; } crm114-20100106-BlameMichelson.src/indirecttest.crm0000755000000000017500000000061711321154266020210 0ustar rootwsy#! /usr/bin/crm # # indirecttest.crm - test :+: indirection # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # Program to test :+: indirection window { isolate (:c:) /and that's good/ isolate (:abe:) /:b:/ isolate (:b:) /you hit the jackpot/ isolate (:d:) /to see it/ output /result -->:+:abe:<--->:+:c:<--->:+:frobotz:<--- \n/ } crm114-20100106-BlameMichelson.src/alternating_example_pca.crm0000755000000000017500000000400011321154266022343 0ustar rootwsy#! 
/usr/bin/crm window { output /**** Alternating Example PCA Network Classifier TRAINING\n/ # load the files in isolate (:Macbeth: :Alice:) input (:Macbeth:) [ Macbeth_Act_IV.txt 0 16000] input (:Alice:) [ Alice_In_Wonderland_Chap_1_And_2.txt 0 16000] # Now loop. isolate (:loopcontrol:) // isolate (:loopcounter:) /0/ { eval (:loopcounter:) / :@: :*:loopcounter: + 1 : / # output /Top of loop at :*:loopcounter: \n/ match [:loopcontrol:] /./ { { # Grab a good chunk of Macbeth... match (:onep:) /(....){255}.*?\n/ [:Macbeth:] match [:onep:] /.../ learn [:onep:] < PCA unigram append> (m_test.css) learn [:onep:] < PCA unigram refute append> (a_test.css) } alius # Set done mark { alter (:loopcontrol:) /X/ } } { { # Grab a good chunk of Alice... match (:twop:) /(....){255}.*?\n/ [:Alice:] match [:twop:] /.../ learn [:twop:] < PCA unigram append> (a_test.css) learn [:twop:] < PCA unigram refute append> (m_test.css) } alius # reset to start of Macbeth file. { alter (:loopcontrol:) /X/ } } liaf } # Now run one fromstart loop on each of the files learn [:_nl:] (m_test.css ) # learn [:_nl:] (a_test.css) } output /\n**** Alternating Example PCA Network Classifier RUNNING TEST\n/ isolate (:s:) isolate (:filetxt:) // { input (:filetxt:) [ Alice_In_Wonderland_Chap_1_And_2.txt 16000 4096 ] match (:t1:) [:filetxt:] /(....){255}.*?\n\n/ { classify < PCA unigram > ( m_test.css ) (:s:)/[[:graph:]]+/ [:t1:] output / type M \n:*:s:\n/ } alius { output / type A \n:*:s:\n/ } } { isolate (:t2:) // input (:filetxt:) [ Macbeth_Act_IV.txt 16000 4096 ] match (:t2:) [:filetxt:] /(....){255}.*?\n/ { classify < PCA unigram > ( m_test.css ) (:s:) /[[:graph:]]+/ [:t2:] output / type M \n:*:s:\n/ } alius { output / type A \n:*:s:\n/ } } crm114-20100106-BlameMichelson.src/crm_expr_alter.c0000644000000000017500000001505611321154266020156 0ustar rootwsy// crm_expr_alter.c - expression alter or eval tools // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
// include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *tempbuf; int crm_expr_eval (CSL_CELL *csl, ARGPARSE_BLOCK *apb) { // Here we evaluate the slash-string _repeatedly_, not just // once as in ALTER. // // To prevent infinite loops (or at least many of them) we: // 1) strictly limit the total number of loop iterations to // the compile-time parameter MAX_EVAL_ITERATIONS // 2) we also keep an array of the hashes of the values, // if we see a repeat, we assume that it's a loop and we stop // right there. char varname[MAX_VARNAME]; long varnamelen = 0; long newvallen; unsigned long long ihash; unsigned long long ahash [MAX_EVAL_ITERATIONS]; long ahindex; long itercount; long loop_abort; long qex_stat; long has_output_var; // should use tempbuf for this instead. // char newstr [MAX_PATTERN]; if (user_trace) fprintf (stderr, "Executing an EVALuation\n"); qex_stat = 0; has_output_var = 1; // get the variable name crm_get_pgm_arg (varname, MAX_VARNAME, apb->p1start, apb->p1len); if (apb->p1len < 3) { has_output_var = 0; if (user_trace) fprintf (stderr, "There's no output var for this EVAL, so we won't " "be assigning the result anywhere.\n It better have a " "relational test, or you're just wasting CPU.\n"); }; if (has_output_var) { // do variable substitution on the variable name varnamelen = crm_nexpandvar (varname, apb->p1len, MAX_VARNAME); if (varnamelen < 3) { nonfatalerror5 ( "The variable you're asking me to alter has an utterly bogus name\n", "so I'll pretend it has no output variable.", CRM_ENGINE_HERE); has_output_var = 0; }; }; // get the new pattern, and expand it. 
crm_get_pgm_arg (tempbuf, data_window_size, apb->s1start, apb->s1len); ihash = 0; itercount = 0; for (ahindex = 0; ahindex < MAX_EVAL_ITERATIONS; ahindex++) ahash[ahindex] = -1; ahindex = 0; loop_abort = 0; // // Now, a loop - while it continues to change, keep looping. // But to try and detect infinite loops, we keep track of the // previous values (actually, their hashes) and if one of those // values recur, we stop evaluating and throw an error. // newvallen = apb->s1len; while (itercount < MAX_EVAL_ITERATIONS && ! (loop_abort)) { int i; itercount++; ihash = strnhash (tempbuf, newvallen); // // build a 64-bit hash by changing the initial conditions and // by using all but two of the characters and by overlapping // the results by two bits. This is intentionally evil and // tangled. Hopefully it will work. // if (newvallen > 3) ihash = (ihash << 30) + strnhash (&tempbuf[1], newvallen - 2); if (internal_trace) fprintf (stderr, "Eval ihash = %lld\n", ihash); for (i = 0; i < itercount; i++) if (ahash[i] == ihash) { loop_abort = 1; if ( i != itercount - 1) loop_abort = 2; }; ahash[i] = ihash; newvallen = crm_qexpandvar (tempbuf, newvallen, data_window_size, &qex_stat ); }; if (itercount == MAX_EVAL_ITERATIONS ) { nonfatalerror5 ("The variable you're attempting to EVAL seems to eval " "infinitely, and hence I cannot compute it. I did try " "a lot, though. I got this far before I gave up: ", tempbuf, CRM_ENGINE_HERE); return (0); } if (loop_abort == 2) { nonfatalerror5 ("The variable you're attempting to EVAL seemes to return " "to the same value after a number of iterations, " "so it is probably an " "infinite loop. I think I should give up. I got this " "far: ", tempbuf, CRM_ENGINE_HERE); return (0); }; // and shove it out to wherever it needs to be shoved. 
// if (has_output_var) crm_destructive_alter_nvariable (varname, varnamelen, tempbuf, newvallen); if (internal_trace) fprintf (stderr, "Final qex_stat was %ld\n", qex_stat); // for now, use the qex_stat that came back from qexpandvar. if (qex_stat > 0) { if (user_trace) fprintf (stderr, "Mathematical expression at line was not satisfied, doing a FAIL at line %ld\n", csl->cstmt); csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = -1; } return (0); } int crm_expr_alter (CSL_CELL *csl, ARGPARSE_BLOCK *apb) { // here's where we surgiclly alter a variable. We have to // watch out in case a variable is not in the cdw (it might // be in tdw; that's legal as well. // syntax is to replace the contents of the variable in the // varlist with the evaluated string. // Syntax is "alter (var) /newvalue/ char varname[MAX_VARNAME]; long varnamestart; long varnamelen; long newvallen; // should use tempbuf for this instead. // char newstr [MAX_PATTERN]; if (user_trace) fprintf (stderr, "Executing an ALTERation\n"); // get the variable name crm_get_pgm_arg (varname, MAX_VARNAME, apb->p1start, apb->p1len); if (apb->p1len < 3) { nonfatalerror5 ( "This statement is missing the variable to alter,\n", "so I'll ignore the whole statement.", CRM_ENGINE_HERE); return (0); }; // do variable substitution on the variable name varnamelen = crm_nexpandvar (varname, apb->p1len, MAX_VARNAME); // this next part goes away for LAZY variables crm_nextword (varname, varnamelen, 0, &varnamestart, &varnamelen); if (varnamelen - varnamestart < 3) { nonfatalerror5 ( "The variable you're asking me to alter has an utterly bogus name\n", "so I'll ignore the whole statement.", CRM_ENGINE_HERE); return (0); }; // get the new pattern, and expand it. 
crm_get_pgm_arg (tempbuf, data_window_size, apb->s1start, apb->s1len); newvallen = crm_nexpandvar (tempbuf, apb->s1len, data_window_size); crm_destructive_alter_nvariable (&varname[varnamestart], varnamelen, tempbuf, newvallen); return (0); }; crm114-20100106-BlameMichelson.src/match_isolate_reclaim.crm0000755000000000017500000000764211321154266022024 0ustar rootwsy#! /usr/bin/crm # # match_isolate_reclaim.crm - test match/isolate memory reclamation # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. { window output /\nMatch-Isolate reclamation test. If this program doesn't \n/ output /error out, the test is passed. (we allocate about 80 megs, \n/ output /well past the window size, but in small (400K) chunks that\n/ output /alternately are allocated and become reclaimable) \n/ alter (:_dw:) /foo/ isolate (:bytes: :s:) syscall () (:bytes:) /cat \/usr\/share\/dict\/words/ eval (:s:) /:#:bytes:/ isolate (:startbytes:) // isolate (:endbytes:) // isolate (:passes:) /0/ isolate (:flux:) /0/ isolate (:dflux:) /0/ isolate (:b2: :b3:) isolate (:ihead:) /X/ eval (:startbytes:) /:#:_iso:/ output /OK_IF_SIZE_CHANGES: Size of isolation at start: :*:startbytes:\n/ { isolate (:b2:) /:*:bytes:/ match (:b3:) /.*/ [:b2:] eval (:flux:) /:@: :*:flux: + :#:b2::/ match (:bytes:) /./ isolate (:bytes:) /:*:b2:/ eval (:flux:) /:@: :*:flux: + :#:bytes::/ match (:b2:) /./ eval (:passes:) /:@: :*:passes: + 1 :/ # how many passes should we run? 
eval /:@: :*:passes: < 10 :/ liaf } match (:b3:) /./ eval (:endbytes:) /:#:_iso:/ output /Bytes used per pass: :*:s:\n/ output /Passes done: :*:passes: \n/ output /Total flux through isolation: :*:flux:\n/ output /OK_IF_SIZE_CHANGES: Final isolation size: :*:endbytes: \n/ eval (:dflux:) /:@: :*:endbytes: - :*:startbytes::/ output /OK_IF_LESS_THAN_10: Total growth: :*:dflux:\n/ match [:_iso: 0 1000] (:ihead:) /.*/ # output /HEAD: :*:ihead:\n/ match (:ihead:) /./ { isolate (:b2:) /:*:bytes:/ match (:b3:) /.*/ [:b2:] eval (:flux:) /:@: :*:flux: + :#:b2::/ match (:bytes:) /./ isolate (:bytes:) /:*:b2:/ eval (:flux:) /:@: :*:flux: + :#:bytes::/ match (:b2:) /./ eval (:passes:) /:@: :*:passes: + 1 :/ # how many passes should we run? eval /:@: :*:passes: < 20 :/ liaf } match (:b3:) /./ eval (:endbytes:) /:#:_iso:/ output /Bytes used per pass: :*:s:\n/ output /Passes done: :*:passes: \n/ output /Total flux through isolation: :*:flux:\n/ output /OK_IF_SIZE_CHANGES: Final isolation size: :*:endbytes: \n/ eval (:dflux:) /:@: :*:endbytes: - :*:startbytes::/ output /OK_IF_LESS_THAN_20: Total growth: :*:dflux:\n/ match [:_iso: 0 1000] (:ihead:) /.*/ # output /HEAD: :*:ihead:\n/ match (:ihead:) /./ { isolate (:b2:) /:*:bytes:/ match (:b3:) /.*/ [:b2:] eval (:flux:) /:@: :*:flux: + :#:b2::/ match (:bytes:) /./ isolate (:bytes:) /:*:b2:/ eval (:flux:) /:@: :*:flux: + :#:bytes::/ match (:b2:) /./ eval (:passes:) /:@: :*:passes: + 1 :/ # how many passes should we run? 
eval /:@: :*:passes: < 30 :/ liaf } match (:b3:) /./ eval (:endbytes:) /:#:_iso:/ output /Bytes used per pass: :*:s:\n/ output /Passes done: :*:passes: \n/ output /Total flux through isolation: :*:flux:\n/ output /OK_IF_SIZE_CHANGES: Final isolation size: :*:endbytes: \n/ eval (:dflux:) /:@: :*:endbytes: - :*:startbytes::/ output /OK_IF_LESS_THAN_30: Total growth: :*:dflux:\n/ match [:_iso: 0 1000] (:ihead:) /.*/ # output /HEAD: :*:ihead:\n/ match (:ihead:) /./ { isolate (:b2:) /:*:bytes:/ match (:b3:) /.*/ [:b2:] eval (:flux:) /:@: :*:flux: + :#:b2::/ match (:bytes:) /./ isolate (:bytes:) /:*:b2:/ eval (:flux:) /:@: :*:flux: + :#:bytes::/ match (:b2:) /./ eval (:passes:) /:@: :*:passes: + 1 :/ # how many passes should we run? eval /:@: :*:passes: < 40 :/ liaf } match (:b3:) /./ eval (:endbytes:) /:#:_iso:/ output /Bytes used per pass: :*:s:\n/ output /Passes done: :*:passes: \n/ output /Total flux through isolation: :*:flux:\n/ output /OK_IF_SIZE_CHANGES: Final isolation size: :*:endbytes: \n/ eval (:flux:) /:@: :*:endbytes: - :*:startbytes::/ output /OK_IF_LESS_THAN_40: Total growth: :*:flux:\n/ match [:_iso: 0 1000] (:ihead:) /.*/ # output /HEAD: :*:ihead:\n/ match (:ihead:) /./ } crm114-20100106-BlameMichelson.src/backwardstest.crm0000755000000000017500000000065311321154266020350 0ustar rootwsy#! /usr/bin/crm # # backwardstest.crm - search backwards for foo, then bar.ooo # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. { { output / searching for a foo.../ match /foo/ output /found a foo, searching for bar.ooo... / match (:b:) /bar.ooo/ output /found a bar.ooo: -:*:b:- :*:_nl: / } alius { output /no bar.ooo in front of the foo. :*:_nl:/ } } crm114-20100106-BlameMichelson.src/quine.crm0000755000000000017500000000031611321154266016624 0ustar rootwsy#! /usr/bin/crm # # quine.crm - produce program's own source code as output # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. 
{ window output /:*:_pgm_text:/ } crm114-20100106-BlameMichelson.src/overalterisolatedtest.crm0000755000000000017500000000406411321154266022137 0ustar rootwsy#! /usr/bin/crm # # overalterisolatedtest.crm - CRM114 testing ALTERation and copying # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window { output /:*:_nl: CRM114 testing ALTERation and copying :*:_nl::*:_nl:/ alter (:_dw:) /01abcdefghijkl89/ isolate (:z:) // match <> (:big:) /.*/ output /----- Whole file -----:*:_nl:/ output /:*:big:/ output /----------------------/ match <> (:1:) /abcde/ match <> (:2:) /cde+fg/ match <> (:3:) /fghij/ alter <> (:z:) /:*:2:/ output /:*:_nl: 1: :*:1:, 2: :*:2:, 3: :*:3: z: :*:z: :*:_nl:/ output / ---altering the z copy ----- :*:_nl:/ alter (:z:) /CDEEEFG/ output /:*:_nl: 1: :*:1:, 2: :*:2:, 3: :*:3: z: :*:z: :*:_nl:/ output /----- Whole file -----:*:_nl:/ output /:*:big:/ output / ---altering the original ----- :*:_nl:/ alter (:2:) /CDEEEFG/ output /:*:_nl: 1: :*:1:, 2: :*:2:, 3: :*:3: z: :*:z: :*:_nl:/ output /----- Whole file -----:*:_nl:/ output /:*:big:/ output / ---altering the original again----- :*:_nl:/ alter (:2:) /CDFG/ output /:*:_nl: 1: :*:1:, 2: :*:2:, 3: :*:3: z: :*:z: :*:_nl:/ output /----- Whole file -----:*:_nl:/ output /:*:big:/ output /----------------------:*:_nl:/ match <> (:big:) /.*/ output /----- Rematched Whole file -----:*:_nl:/ output /:*:big:/ output /:*:_nl:----------------------:*:_nl:/ output /------putting things back-----:*:_nl:/ alter (:_dw:) /01abcdefghijkl89/ match <> (:big:) /.*/ output /----- Whole file -----:*:_nl:/ output /:*:big:/ output /:*:_nl:----------------------:*:_nl:/ match (:q:) /cdefg/ output /:*:_nl:----------------------:*:_nl:/ output / :q: = :*:q: :*:_nl:/ output / ISOLATEing :q: - this should copy :*:_nl:/ isolate (:q:) output / :q: = :*:q: :*:_nl:/ output / ALTERing :q: to ZZZZZ:*:_nl:/ alter (:q:) /ZZZZZ/ output / :q: = :*:q: :*:_nl:/ match <> (:big:) /.*/ output /----- 
Whole file -----:*:_nl:/ output /:*:big:/ output /:*:_nl:----------------------:*:_nl:/ output /:*:_nl: Checking initialization of isolation :*:_nl:/ isolate (:r:) / AAAAA/ output /:*:r: :*:_nl:/ } crm114-20100106-BlameMichelson.src/crm_expr_isolate.c0000644000000000017500000003007711321154266020507 0ustar rootwsy// crm_expr_isolate.c - isolate a variable (includes mem management) // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *tempbuf; // Allow creation of a temporary isolated variable; // this lives in the same big buffer as the environ // args, the arg0 args, and the basic formatting args // (like :_nl:, :_ht:, :_bs:, etc). int crm_expr_isolate (CSL_CELL *csl, ARGPARSE_BLOCK *apb) { char temp_vars [MAX_VARNAME]; long tvlen; long vn_start_here; long vstart; long vlen; long mc; long done; long vallen; int iso_status; if (user_trace) fprintf (stderr, "executing an ISOLATE statement\n"); // get the list of variable names // crm_get_pgm_arg (temp_vars, MAX_VARNAME, apb->p1start, apb->p1len); tvlen = crm_nexpandvar (temp_vars, apb->p1len, MAX_VARNAME); if (tvlen == 0) { nonfatalerror5( "This statement is missing the variable to isolate" " so I'll just ignore the whole statement.", "", CRM_ENGINE_HERE); } if (internal_trace) fprintf (stderr, " creating isolated vars: ***%s***\n", temp_vars); done = 0; mc = 0; // Now, find the vars (space-delimited, doncha know) and make them // isolated. 
// vstart = 0; vlen = 0; vn_start_here = 0; while (!done) { crm_nextword (temp_vars, tvlen, vn_start_here, &vstart, &vlen); vn_start_here = vstart + vlen + 1; if (vlen == 0) { done = 1; } else // not done yet. { // must make a copy of the varname. char vname[MAX_VARNAME]; long vmidx; memmove (vname, &(temp_vars[vstart]), vlen); vname [vlen] = '\000'; if (vlen < 3) { nonfatalerror5 ("The variable you're asking me to ISOLATE " " has an utterly bogus name. I'll ignore" " the rest of the statement", " ", CRM_ENGINE_HERE); break; }; if (strcmp (vname, ":_dw:") == 0) { nonfatalerror5 ("You can't ISOLATE the :_dw: data window! ", "We'll just ignore that for now", CRM_ENGINE_HERE); } else // OK- isolate this variable { vmidx = crm_vht_lookup (vht, vname, vlen); // // get initial value - that's the slashed value. // GROT GROT GROT // Although the initial design for CRM114 used slashed // values, this is really problematic in two senses; it's // not a pattern string (it gets expanded!) and second // from a practical point of view: writing a pathname // with all slashes escaped is a pain. So we'll allow // [boxed strings] as well as /slashes/. // GROT GROT GROT vallen = 0; if (apb->s1len > 0) { crm_get_pgm_arg (tempbuf, data_window_size, apb->s1start, apb->s1len); vallen = apb->s1len; }; if (apb->b1len > 0) { crm_get_pgm_arg (tempbuf, data_window_size, apb->b1start, apb->b1len); vallen = apb->b1len; }; // // Now, check these cases in order: // // not preexisting, no /value/ - isolate, set to "" // not preexisting, with /value/ -isolate, set /val/ // preexisting _dw, with /value/ - isolate, set to /val/ // preexisting _dw, no /value/ - isolate, set to dwval. // if preexisting AND default turned on - do nothing! // preexisting isolated, no /value/ - copy value. // preexisting isolated, with /value/ - alter /value/ // // not preexisting. 
if (vht[vmidx] == NULL) { // not preexisting, no slash value if (internal_trace) fprintf (stderr, "Not a preexisting var.\n"); if (!vallen) { // no slash value- set to "" if (internal_trace) fprintf (stderr, "No initialization value given, using" " a zero-length string.\n"); tempbuf[0] = '\0'; } else { // not preexisting, has a /value/, use it. if (internal_trace) fprintf (stderr, "using the slash-value given.\n"); vallen = crm_nexpandvar(tempbuf, apb->s1len, data_window_size - tdw->nchars); }; } else // it IS preexisting { // It is preexisting, maybe in isolation, maybe // not but we're isolating again. So we need to // copy again. // if (internal_trace) fprintf (stderr, "Preexisting var.\n"); if (apb->sflags & CRM_DEFAULT) { if (user_trace) fprintf (stderr, " The var already exists, default flag on, " "so no action taken.\n"); goto no_isolate_action; // return (0); }; if (vallen > 0) { // yes, statement has a /value/ // get the /value/ if (internal_trace) fprintf (stderr, "Using the provided value.\n"); vallen = crm_nexpandvar(tempbuf, apb->s1len, data_window_size - tdw->nchars); } else { // no /value/, so we need to use the old value. // if (internal_trace) fprintf (stderr, "No given value, using old value.\n"); strcpy (tempbuf, ":*"); strncat (tempbuf, vname, vlen); vallen = crm_nexpandvar (tempbuf, vlen+2, data_window_size - tdw->nchars); }; }; // // Now we have the name of the variable in vname/vlen, // and the value string in tempbuf/vallen. We can then // isolate it with crm_isolate_this, which always does // the right thing (pretty much). :-) // iso_status = crm_isolate_this (&vmidx, vname, 0, vlen, tempbuf, 0, vallen); if ( iso_status > 0) return (iso_status); }; // the semicolon is to keep the C compiler happy. no_isolate_action: ; }; vstart = vstart + vlen; if (temp_vars[vstart] == '\000' || vstart >= tvlen ) done = 1; }; return (0); }; // // General-purpose routine to "do the right thing" to isolate a // string value. 
This routine takes care of all of the possible // extenuating circumstances, like is/is not already isolated, // is/is not requiring a reclaim, etc. (how it knows? If the // value is in the TDW, it may need to be reclaimed, otherwise it // doesn't reclaim.) // // Note - this routine will malloc and then free a spare buffer if // it gets handed data that's already in the TDW. Best not to do // that if you can avoid it; that is an efficiency speedup // int crm_isolate_this (long *vptr, char *nametext, long namestart, long namelen, char *valuetext, long valuestart, long valuelen) { long is_old; long is_old_isolated; long vmidx; long oldvstart = 0; long oldvlen = 0; long neededlen; if (internal_trace) { fprintf (stderr, "using crm_isolate_this, vptr = %lX\n", (long) vptr); } // keep track of the amount of storage needed for this variable // to be inserted into the isolated space. neededlen = 10; // // gather information // In particular - does the name already exits? Is it // already isolated? if (vptr) { vmidx = *vptr; } else { vmidx = crm_vht_lookup ( vht, &nametext[namestart], namelen); } // check the vht - if it's not here, we need to add it. // if (vht[vmidx] == NULL) { // optimization - if it's not old, we don't need to run a reclaim // phase later on. is_old = 0; // must allow space for the name. neededlen += namelen; } else { is_old = 1; }; if (is_old && vht[vmidx]->valtxt == tdw->filetext) { is_old_isolated = 1; // and save old start and length for the incremental reclaimer oldvstart = vht[vmidx]->vstart; oldvlen = vht[vmidx]->vlen; // how much space will the new value take up? neededlen += valuelen - oldvlen; } else { is_old_isolated = 0; // and how much space will we need? neededlen += valuelen; }; // Do a check - is there enough space in the tdw to hold // the new variable (both name and value)? if ( tdw->nchars + neededlen > data_window_size ) { char vname[130]; strncpy (vname, &nametext[namestart], ( (128 < namelen) ? 
128 : namelen)); vname[129] = 0; fatalerror5 ("You have blown the memory-storage gaskets while trying" "to store the ISOLATEd variable ", vname, CRM_ENGINE_HERE); return (1); } // If we get to here, there's more than enough space; so we're good to // go with no further checks. // If there wasn't a vht slot existing, make one. // if (!is_old) { // nope, never seen this var before, add it into the VHT // namestart is where we are now. long nstart, vstart; if (internal_trace) fprintf (stderr, "... this is a new isolated var\n"); // // Put the name into the tdw memory area, add a & after it. // // do the name first. Start on a newline. tdw->filetext[tdw->nchars] = '\n'; tdw->nchars++; nstart = tdw->nchars; memmove (&tdw->filetext[nstart], &nametext[namestart], namelen); tdw->nchars = tdw->nchars + namelen; tdw->filetext[tdw->nchars] = '='; tdw->nchars++; // and put in the value in now vstart = tdw->nchars; memmove (&tdw->filetext[vstart], &valuetext[valuestart], valuelen); tdw->nchars = tdw->nchars + valuelen; tdw->filetext[tdw->nchars] = '\n'; tdw->nchars++; // now, we whack the actual VHT. crm_setvar (NULL, 0, tdw->filetext, nstart, namelen, tdw->filetext, vstart, valuelen, csl->cstmt, 0); // that's it. It's now in the TDW and in the VHT return (0); } // No, it's a preexisting variable. We need to do the shuffle. // // Note that this code is almost but not quite a mirror // of the code that lives in crm_set_temp_nvar. // // if (internal_trace) fprintf (stderr, "Resetting valtxt to point at tdw.\n"); vht[vmidx]->valtxt = tdw->filetext; if (internal_trace) fprintf (stderr, "Fresh start: offset %ld length %ld.\n", tdw->nchars, valuelen); // // // If we have a zero-length string, followed by a // non-zero-lenth string, next to each other, with // no intervening allocations, both strings will // have the _same_ start point. This messes things // up badly on subsequent alters. Thus, we // _must_ put a spacer in. 
// // This code must also be echoed in crm_set_temp_nvar // if (valuelen == 0) { tdw->filetext[tdw->nchars] = '\n'; tdw->nchars++; }; // (end of danger zone) // vht[vmidx]->vstart = tdw->nchars; vht[vmidx]->vlen = valuelen; if (internal_trace) fprintf (stderr, "Memmoving the value in.\n"); memmove (&(tdw->filetext[tdw->nchars]), tempbuf, valuelen); tdw->nchars = tdw->nchars + valuelen; // // trailing separator tdw->filetext[tdw->nchars] = '\n'; tdw->nchars++; // // and reset 'previous match' start and length if (internal_trace) fprintf (stderr, "reset the previous-match to start.\n"); vht[vmidx]->mstart = vht[vmidx]->vstart; vht[vmidx]->mlen = 0; // Step 2 - if this was isolated, reclaim the // old storage if nobody else is using it. // if (is_old_isolated) { if (internal_trace) fprintf (stderr, "This was already an isolated var, so " "do a reclamation on the old space.\n"); // vstart==0 means "ignore this value" to reclamation // crm_compress_tdw_section (vht[vmidx]->valtxt, oldvstart , oldvstart+oldvlen); }; return (0); }; crm114-20100106-BlameMichelson.src/crm_pca_lib_fncts.h0000644000000000017500000000326511321154266020603 0ustar rootwsy// crm_pca_lib_fncts.h - Principal Component Analysis //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
#ifndef __PCA__H #define __PCA__H #include "crm_svm_matrix_util.h" #include "crm_svm_matrix.h" #include "crm114_config.h" #include #define PCA_DEBUG 1 //Debug mode defines - prints out minimal information #define PCA_LOOP 8 //Prints out matrices and vectors - only use for small problems //The intermediate DEBUG modes may enable debug printing for the //matrix operations. See crm_svm_matrix_util.h for details. int PCA_DEBUG_MODE; //The debug mode for the PCA extern int MATR_DEBUG_MODE; //Debug mode for matrices. MATR_DEBUG_MODE = PCA_DEBUG_MODE //Defined in crm_svm_matrix_util.h typedef struct { Vector *theta; //first principal component double mudottheta; //decision point (unlike SVM this isn't necessarily 0) } PCA_Solution; PCA_Solution *run_pca(Matrix *M, Vector *init_pca); ExpandingArray *pca_preprocess(Matrix *X, Vector *init_pca); void pca_solve(Matrix *X, PCA_Solution **init_pca); void pca_free_solution(PCA_Solution *sol); #endif //crm_pca_lib_fncts.h crm114-20100106-BlameMichelson.src/crm_pca.h0000644000000000017500000000412611321154266016555 0ustar rootwsy// crm_pca.h - Principal Component Analysis //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. #ifndef __CRM_PCA__H #define __CRM_PCA__H #include "crm_pca_lib_fncts.h" #include "crm114_sysincludes.h" #include "crm114_config.h" #include "crm114_structs.h" #include "crm114.h" #include extern int PCA_DEBUG_MODE; //Debug mode - //see crm_pca_lib_fncts.h //and crm_svm_matrix_util.h //for details. 
extern char *outbuf; #define PCA_FIRST_BITS "CRM114 PCA FILE FOLLOWS:" //PCA file magic string #define PCA_FIRST_NBIT strlen(PCA_FIRST_BITS)*sizeof(char) //PCA magic string //length (in bytes) #define N_OFFSETS_IN_PCA_FILE 1 //Number of size_t's before block //starts in file #define N_CONSTANTS_NOT_IN_BLOCK 0 //Number of ints before block //starts in file #define N_CONSTANTS_IN_PCA_BLOCK 6 //Number ints in block #define HAS_NEW_INDEX 0 //Position of has_new in block #define HAS_SOLUTION_INDEX 1 //Position of has_solution in block typedef struct { int has_new, has_solution, n0, n1, n0f, n1f; PCA_Solution *sol; Matrix *X; } crm_pca_block; #endif //crm_pca.h crm114-20100106-BlameMichelson.src/The_Wind_in_the_Willows_Chap_1.txt0000644000000000017500000005641711321154266023475 0ustar rootwsyI. THE RIVER BANK The Mole had been working very hard all the morning, spring-cleaning his little home. First with brooms, then with dusters; then on ladders and steps and chairs, with a brush and a pail of whitewash; till he had dust in his throat and eyes, and splashes of whitewash all over his black fur, and an aching back and weary arms. Spring was moving in the air above and in the earth below and around him, penetrating even his dark and lowly little house with its spirit of divine discontent and longing. It was small wonder, then, that he suddenly flung down his brush on the floor, said 'Bother!' and 'O blow!' and also 'Hang spring-cleaning!' and bolted out of the house without even waiting to put on his coat. Something up above was calling him imperiously, and he made for the steep little tunnel which answered in his case to the graveled carriage-drive owned by animals whose residences are nearer to the sun and air. So he scraped and scratched and scrabbled and scrooged and then he scrooged again and scrabbled and scratched and scraped, working busily with his little paws and muttering to himself, 'Up we go! Up we go!' till at last, pop! 
his snout came out into the sunlight, and he found himself rolling in the warm grass of a great meadow. 'This is fine!' he said to himself. 'This is better than whitewashing!' The sunshine struck hot on his fur, soft breezes caressed his heated brow, and after the seclusion of the cellarage he had lived in so long the carol of happy birds fell on his dulled hearing almost like a shout. Jumping off all his four legs at once, in the joy of living and the delight of spring without its cleaning, he pursued his way across the meadow till he reached the hedge on the further side. 'Hold up!' said an elderly rabbit at the gap. 'Sixpence for the privilege of passing by the private road!' He was bowled over in an instant by the impatient and contemptuous Mole, who trotted along the side of the hedge chaffing the other rabbits as they peeped hurriedly from their holes to see what the row was about. 'Onion-sauce! Onion-sauce!' he remarked jeeringly, and was gone before they could think of a thoroughly satisfactory reply. Then they all started grumbling at each other. 'How STUPID you are! Why didn't you tell him----' 'Well, why didn't YOU say----' 'You might have reminded him----' and so on, in the usual way; but, of course, it was then much too late, as is always the case. It all seemed too good to be true. Hither and thither through the meadows he rambled busily, along the hedgerows, across the copses, finding everywhere birds building, flowers budding, leaves thrusting--everything happy, and progressive, and occupied. And instead of having an uneasy conscience pricking him and whispering 'whitewash!' he somehow could only feel how jolly it was to be the only idle dog among all these busy citizens. After all, the best part of a holiday is perhaps not so much to be resting yourself, as to see all the other fellows busy working. He thought his happiness was complete when, as he meandered aimlessly along, suddenly he stood by the edge of a full-fed river. 
Never in his life had he seen a river before--this sleek, sinuous, full-bodied animal, chasing and chuckling, gripping things with a gurgle and leaving them with a laugh, to fling itself on fresh playmates that shook themselves free, and were caught and held again. All was a-shake and a-shiver--glints and gleams and sparkles, rustle and swirl, chatter and bubble. The Mole was bewitched, entranced, fascinated. By the side of the river he trotted as one trots, when very small, by the side of a man who holds one spell-bound by exciting stories; and when tired at last, he sat on the bank, while the river still chattered on to him, a babbling procession of the best stories in the world, sent from the heart of the earth to be told at last to the insatiable sea. As he sat on the grass and looked across the river, a dark hole in the bank opposite, just above the water's edge, caught his eye, and dreamily he fell to considering what a nice snug dwelling-place it would make for an animal with few wants and fond of a bijou riverside residence, above flood level and remote from noise and dust. As he gazed, something bright and small seemed to twinkle down in the heart of it, vanished, then twinkled once more like a tiny star. But it could hardly be a star in such an unlikely situation; and it was too glittering and small for a glow-worm. Then, as he looked, it winked at him, and so declared itself to be an eye; and a small face began gradually to grow up round it, like a frame round a picture. A brown little face, with whiskers. A grave round face, with the same twinkle in its eye that had first attracted his notice. Small neat ears and thick silky hair. It was the Water Rat! Then the two animals stood and regarded each other cautiously. 'Hullo, Mole!' said the Water Rat. 'Hullo, Rat!' said the Mole. 'Would you like to come over?' enquired the Rat presently. 
'Oh, its all very well to TALK,' said the Mole, rather pettishly, he being new to a river and riverside life and its ways. The Rat said nothing, but stooped and unfastened a rope and hauled on it; then lightly stepped into a little boat which the Mole had not observed. It was painted blue outside and white within, and was just the size for two animals; and the Mole's whole heart went out to it at once, even though he did not yet fully understand its uses. The Rat sculled smartly across and made fast. Then he held up his forepaw as the Mole stepped gingerly down. 'Lean on that!' he said. 'Now then, step lively!' and the Mole to his surprise and rapture found himself actually seated in the stern of a real boat. 'This has been a wonderful day!' said he, as the Rat shoved off and took to the sculls again. 'Do you know, I've never been in a boat before in all my life.' 'What?' cried the Rat, open-mouthed: 'Never been in a--you never--well I--what have you been doing, then?' 'Is it so nice as all that?' asked the Mole shyly, though he was quite prepared to believe it as he leant back in his seat and surveyed the cushions, the oars, the rowlocks, and all the fascinating fittings, and felt the boat sway lightly under him. 'Nice? It's the ONLY thing,' said the Water Rat solemnly, as he leant forward for his stroke. 'Believe me, my young friend, there is NOTHING--absolute nothing--half so much worth doing as simply messing about in boats. Simply messing,' he went on dreamily: 'messing--about--in--boats; messing----' 'Look ahead, Rat!' cried the Mole suddenly. It was too late. The boat struck the bank full tilt. The dreamer, the joyous oarsman, lay on his back at the bottom of the boat, his heels in the air. '--about in boats--or WITH boats,' the Rat went on composedly, picking himself up with a pleasant laugh. 'In or out of 'em, it doesn't matter. Nothing seems really to matter, that's the charm of it. 
Whether you get away, or whether you don't; whether you arrive at your destination or whether you reach somewhere else, or whether you never get anywhere at all, you're always busy, and you never do anything in particular; and when you've done it there's always something else to do, and you can do it if you like, but you'd much better not. Look here! If you've really nothing else on hand this morning, supposing we drop down the river together, and have a long day of it?' The Mole waggled his toes from sheer happiness, spread his chest with a sigh of full contentment, and leaned back blissfully into the soft cushions. 'WHAT a day I'm having!' he said. 'Let us start at once!' 'Hold hard a minute, then!' said the Rat. He looped the painter through a ring in his landing-stage, climbed up into his hole above, and after a short interval reappeared staggering under a fat, wicker luncheon-basket. 'Shove that under your feet,' he observed to the Mole, as he passed it down into the boat. Then he untied the painter and took the sculls again. 'What's inside it?' asked the Mole, wriggling with curiosity. 'There's cold chicken inside it,' replied the Rat briefly; 'coldtonguecoldhamcoldbeefpickledgherkinssaladfrenchrollscresssan dwichespottedmeatgingerbeerlemonadesodawater----' 'O stop, stop,' cried the Mole in ecstacies: 'This is too much!' 'Do you really think so?' enquired the Rat seriously. 'It's only what I always take on these little excursions; and the other animals are always telling me that I'm a mean beast and cut it VERY fine!' The Mole never heard a word he was saying. Absorbed in the new life he was entering upon, intoxicated with the sparkle, the ripple, the scents and the sounds and the sunlight, he trailed a paw in the water and dreamed long waking dreams. The Water Rat, like the good little fellow he was, sculled steadily on and forebore to disturb him. 'I like your clothes awfully, old chap,' he remarked after some half an hour or so had passed. 
'I'm going to get a black velvet smoking-suit myself some day, as soon as I can afford it.' 'I beg your pardon,' said the Mole, pulling himself together with an effort. 'You must think me very rude; but all this is so new to me. So--this--is--a--River!' 'THE River,' corrected the Rat. 'And you really live by the river? What a jolly life!' 'By it and with it and on it and in it,' said the Rat. 'It's brother and sister to me, and aunts, and company, and food and drink, and (naturally) washing. It's my world, and I don't want any other. What it hasn't got is not worth having, and what it doesn't know is not worth knowing. Lord! the times we've had together! Whether in winter or summer, spring or autumn, it's always got its fun and its excitements. When the floods are on in February, and my cellars and basement are brimming with drink that's no good to me, and the brown water runs by my best bedroom window; or again when it all drops away and, shows patches of mud that smells like plum-cake, and the rushes and weed clog the channels, and I can potter about dry shod over most of the bed of it and find fresh food to eat, and things careless people have dropped out of boats!' 'But isn't it a bit dull at times?' the Mole ventured to ask. 'Just you and the river, and no one else to pass a word with?' 'No one else to--well, I mustn't be hard on you,' said the Rat with forbearance. 'You're new to it, and of course you don't know. The bank is so crowded nowadays that many people are moving away altogether: O no, it isn't what it used to be, at all. Otters, kingfishers, dabchicks, moorhens, all of them about all day long and always wanting you to DO something--as if a fellow had no business of his own to attend to!' 'What lies over THERE' asked the Mole, waving a paw towards a background of woodland that darkly framed the water-meadows on one side of the river. 'That? O, that's just the Wild Wood,' said the Rat shortly. 'We don't go there very much, we river-bankers.' 
'Aren't they--aren't they very NICE people in there?' said the Mole, a trifle nervously. 'W-e-ll,' replied the Rat, 'let me see. The squirrels are all right. AND the rabbits--some of 'em, but rabbits are a mixed lot. And then there's Badger, of course. He lives right in the heart of it; wouldn't live anywhere else, either, if you paid him to do it. Dear old Badger! Nobody interferes with HIM. They'd better not,' he added significantly. 'Why, who SHOULD interfere with him?' asked the Mole. 'Well, of course--there--are others,' explained the Rat in a hesitating sort of way. 'Weasels--and stoats--and foxes--and so on. They're all right in a way--I'm very good friends with them--pass the time of day when we meet, and all that--but they break out sometimes, there's no denying it, and then--well, you can't really trust them, and that's the fact.' The Mole knew well that it is quite against animal-etiquette to dwell on possible trouble ahead, or even to allude to it; so he dropped the subject. 'And beyond the Wild Wood again?' he asked: 'Where it's all blue and dim, and one sees what may be hills or perhaps they mayn't, and something like the smoke of towns, or is it only cloud-drift?' 'Beyond the Wild Wood comes the Wide World,' said the Rat. 'And that's something that doesn't matter, either to you or me. I've never been there, and I'm never going, nor you either, if you've got any sense at all. Don't ever refer to it again, please. Now then! Here's our backwater at last, where we're going to lunch.' Leaving the main stream, they now passed into what seemed at first sight like a little land-locked lake. 
Green turf sloped down to either edge, brown snaky tree-roots gleamed below the surface of the quiet water, while ahead of them the silvery shoulder and foamy tumble of a weir, arm-in-arm with a restless dripping mill-wheel, that held up in its turn a grey-gabled mill-house, filled the air with a soothing murmur of sound, dull and smothery, yet with little clear voices speaking up cheerfully out of it at intervals. It was so very beautiful that the Mole could only hold up both forepaws and gasp, 'O my! O my! O my!' The Rat brought the boat alongside the bank, made her fast, helped the still awkward Mole safely ashore, and swung out the luncheon-basket. The Mole begged as a favour to be allowed to unpack it all by himself; and the Rat was very pleased to indulge him, and to sprawl at full length on the grass and rest, while his excited friend shook out the table-cloth and spread it, took out all the mysterious packets one by one and arranged their contents in due order, still gasping, 'O my! O my!' at each fresh revelation. When all was ready, the Rat said, 'Now, pitch in, old fellow!' and the Mole was indeed very glad to obey, for he had started his spring-cleaning at a very early hour that morning, as people WILL do, and had not paused for bite or sup; and he had been through a very great deal since that distant time which now seemed so many days ago. 'What are you looking at?' said the Rat presently, when the edge of their hunger was somewhat dulled, and the Mole's eyes were able to wander off the table-cloth a little. 'I am looking,' said the Mole, 'at a streak of bubbles that I see travelling along the surface of the water. That is a thing that strikes me as funny.' 'Bubbles? Oho!' said the Rat, and chirruped cheerily in an inviting sort of way. A broad glistening muzzle showed itself above the edge of the bank, and the Otter hauled himself out and shook the water from his coat. 'Greedy beggars!' he observed, making for the provender. 
'Why didn't you invite me, Ratty?' 'This was an impromptu affair,' explained the Rat. 'By the way--my friend Mr. Mole.' 'Proud, I'm sure,' said the Otter, and the two animals were friends forthwith. 'Such a rumpus everywhere!' continued the Otter. 'All the world seems out on the river to-day. I came up this backwater to try and get a moment's peace, and then stumble upon you fellows!--At least--I beg pardon--I don't exactly mean that, you know.' There was a rustle behind them, proceeding from a hedge wherein last year's leaves still clung thick, and a stripy head, with high shoulders behind it, peered forth on them. 'Come on, old Badger!' shouted the Rat. The Badger trotted forward a pace or two; then grunted, 'H'm! Company,' and turned his back and disappeared from view. 'That's JUST the sort of fellow he is!' observed the disappointed Rat. 'Simply hates Society! Now we shan't see any more of him to-day. Well, tell us, WHO'S out on the river?' 'Toad's out, for one,' replied the Otter. 'In his brand-new wager-boat; new togs, new everything!' The two animals looked at each other and laughed. 'Once, it was nothing but sailing,' said the Rat, 'Then he tired of that and took to punting. Nothing would please him but to punt all day and every day, and a nice mess he made of it. Last year it was house-boating, and we all had to go and stay with him in his house-boat, and pretend we liked it. He was going to spend the rest of his life in a house-boat. It's all the same, whatever he takes up; he gets tired of it, and starts on something fresh.' 'Such a good fellow, too,' remarked the Otter reflectively: 'But no stability--especially in a boat!' From where they sat they could get a glimpse of the main stream across the island that separated them; and just then a wager-boat flashed into view, the rower--a short, stout figure--splashing badly and rolling a good deal, but working his hardest. 
The Rat stood up and hailed him, but Toad--for it was he--shook his head and settled sternly to his work. 'He'll be out of the boat in a minute if he rolls like that,' said the Rat, sitting down again. 'Of course he will,' chuckled the Otter. 'Did I ever tell you that good story about Toad and the lock-keeper? It happened this way. Toad....' An errant May-fly swerved unsteadily athwart the current in the intoxicated fashion affected by young bloods of May-flies seeing life. A swirl of water and a 'cloop!' and the May-fly was visible no more. Neither was the Otter. The Mole looked down. The voice was still in his ears, but the turf whereon he had sprawled was clearly vacant. Not an Otter to be seen, as far as the distant horizon. But again there was a streak of bubbles on the surface of the river. The Rat hummed a tune, and the Mole recollected that animal-etiquette forbade any sort of comment on the sudden disappearance of one's friends at any moment, for any reason or no reason whatever. 'Well, well,' said the Rat, 'I suppose we ought to be moving. I wonder which of us had better pack the luncheon-basket?' He did not speak as if he was frightfully eager for the treat. 'O, please let me,' said the Mole. So, of course, the Rat let him. Packing the basket was not quite such pleasant work as unpacking' the basket. It never is. But the Mole was bent on enjoying everything, and although just when he had got the basket packed and strapped up tightly he saw a plate staring up at him from the grass, and when the job had been done again the Rat pointed out a fork which anybody ought to have seen, and last of all, behold! the mustard pot, which he had been sitting on without knowing it--still, somehow, the thing got finished at last, without much loss of temper. The afternoon sun was getting low as the Rat sculled gently homewards in a dreamy mood, murmuring poetry-things over to himself, and not paying much attention to Mole. 
But the Mole was very full of lunch, and self-satisfaction, and pride, and already quite at home in a boat (so he thought) and was getting a bit restless besides: and presently he said, 'Ratty! Please, _I_ want to row, now!' The Rat shook his head with a smile. 'Not yet, my young friend,' he said--'wait till you've had a few lessons. It's not so easy as it looks.' The Mole was quiet for a minute or two. But he began to feel more and more jealous of Rat, sculling so strongly and so easily along, and his pride began to whisper that he could do it every bit as well. He jumped up and seized the sculls, so suddenly, that the Rat, who was gazing out over the water and saying more poetry-things to himself, was taken by surprise and fell backwards off his seat with his legs in the air for the second time, while the triumphant Mole took his place and grabbed the sculls with entire confidence. 'Stop it, you SILLY ass!' cried the Rat, from the bottom of the boat. 'You can't do it! You'll have us over!' The Mole flung his sculls back with a flourish, and made a great dig at the water. He missed the surface altogether, his legs flew up above his head, and he found himself lying on the top of the prostrate Rat. Greatly alarmed, he made a grab at the side of the boat, and the next moment--Sploosh! Over went the boat, and he found himself struggling in the river. O my, how cold the water was, and O, how VERY wet it felt. How it sang in his ears as he went down, down, down! How bright and welcome the sun looked as he rose to the surface coughing and spluttering! How black was his despair when he felt himself sinking again! Then a firm paw gripped him by the back of his neck. It was the Rat, and he was evidently laughing--the Mole could FEEL him laughing, right down his arm and through his paw, and so into his--the Mole's--neck. 
The Rat got hold of a scull and shoved it under the Mole's arm; then he did the same by the other side of him and, swimming behind, propelled the helpless animal to shore, hauled him out, and set him down on the bank, a squashy, pulpy lump of misery. When the Rat had rubbed him down a bit, and wrung some of the wet out of him, he said, 'Now, then, old fellow! Trot up and down the towing-path as hard as you can, till you're warm and dry again, while I dive for the luncheon-basket.' So the dismal Mole, wet without and ashamed within, trotted about till he was fairly dry, while the Rat plunged into the water again, recovered the boat, righted her and made her fast, fetched his floating property to shore by degrees, and finally dived successfully for the luncheon-basket and struggled to land with it. When all was ready for a start once more, the Mole, limp and dejected, took his seat in the stern of the boat; and as they set off, he said in a low voice, broken with emotion, 'Ratty, my generous friend! I am very sorry indeed for my foolish and ungrateful conduct. My heart quite fails me when I think how I might have lost that beautiful luncheon-basket. Indeed, I have been a complete ass, and I know it. Will you overlook it this once and forgive me, and let things go on as before?' 'That's all right, bless you!' responded the Rat cheerily. 'What's a little wet to a Water Rat? I'm more in the water than out of it most days. Don't you think any more about it; and, look here! I really think you had better come and stop with me for a little time. It's very plain and rough, you know--not like Toad's house at all--but you haven't seen that yet; still, I can make you comfortable. And I'll teach you to row, and to swim, and you'll soon be as handy on the water as any of us.' The Mole was so touched by his kind manner of speaking that he could find no voice to answer him; and he had to brush away a tear or two with the back of his paw. 
But the Rat kindly looked in another direction, and presently the Mole's spirits revived again, and he was even able to give some straight back-talk to a couple of moorhens who were sniggering to each other about his bedraggled appearance. When they got home, the Rat made a bright fire in the parlour, and planted the Mole in an arm-chair in front of it, having fetched down a dressing-gown and slippers for him, and told him river stories till supper-time. Very thrilling stories they were, too, to an earth-dwelling animal like Mole. Stories about weirs, and sudden floods, and leaping pike, and steamers that flung hard bottles--at least bottles were certainly flung, and FROM steamers, so presumably BY them; and about herons, and how particular they were whom they spoke to; and about adventures down drains, and night-fishings with Otter, or excursions far a-field with Badger. Supper was a most cheerful meal; but very shortly afterwards a terribly sleepy Mole had to be escorted upstairs by his considerate host, to the best bedroom, where he soon laid his head on his pillow in great peace and contentment, knowing that his new-found friend the River was lapping the sill of his window. This day was only the first of many similar ones for the emancipated Mole, each of them longer and full of interest as the ripening summer moved onward. He learnt to swim and to row, and entered into the joy of running water; and with his ear to the reed-stems he caught, at intervals, something of what the wind went whispering so constantly among them. crm114-20100106-BlameMichelson.src/crm_osbf_maintenance.c0000644000000000017500000004446611321154266021313 0ustar rootwsy// crm_osbf_maintenance.c - OSBF microgrooming utilities // Copyright 2004 Fidelis Assis // Copyright 2004-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // OBS: CSS header structure and pruning method modified for OSBF classifier. 
// See functions crm_osbf_microgroom and crm_osbf_create_cssfile, below, // for details. -- Fidelis Assis - 2004/10/20 // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" #include "crm114_osbf.h" /* Version names */ char *CSS_version_name[] = { "SBPH-Markovian", "OSB-Bayes", "Correlate", "Neural", "OSB-Winnow", "OSBF-Bayes", "Unknown" }; // microgroom flag for osbf static int osbf_microgroom; // turn microgroom on (1) or off (0) void crm_osbf_set_microgroom(int value) { osbf_microgroom = value; } // // How to microgroom a .css file that's getting full // // NOTA BENE NOTA BENE NOTA BENE NOTA BENE // // This whole section of code is under intense develoment; right now // it "works" but not any better than nothing at all. Be warned // that any patches issued on it may well never see the light of // day, as intense testing and comparison may show that the current // algorithms are, well, suckful. // // // There are two steps to microgrooming - first, since we know we're // already too full, we execute a 'zero unity bins'. 
// void crm_osbf_microgroom (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long hindex) { long i, j, k; static long microgroom_count = 0; long packstart; long packlen; long zeroed_countdown, max_zeroed_buckets; long min_value, min_value_any, distance, max_distance; int groom_any = 0; OSBF_FEATUREBUCKET_STRUCT *h; // if not set by command line, use default if (microgroom_chain_length == 0) microgroom_chain_length = OSBF_MICROGROOM_CHAIN_LENGTH; if (microgroom_stop_after == 0) microgroom_stop_after = OSBF_MICROGROOM_STOP_AFTER; // make h point to the first feature bucket h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start; zeroed_countdown = microgroom_stop_after; i = j = k = 0; microgroom_count++; if (user_trace) { if (microgroom_count == 1) fprintf (stderr, "CSS file too full: microgrooming this css chain: "); fprintf (stderr, " %ld ", microgroom_count); }; // micropack - start at initial chain start, move to back of // chain that overflowed, then scale just that chain. i = j = hindex % header->buckets; min_value = OSBF_FEATUREBUCKET_VALUE_MAX; min_value_any = GET_BUCKET_VALUE (h[i]); while (BUCKET_IN_CHAIN (h[i])) { if (GET_BUCKET_VALUE (h[i]) < min_value && !BUCKET_IS_LOCKED (h[i])) min_value = GET_BUCKET_VALUE (h[i]); if (GET_BUCKET_VALUE (h[i]) < min_value_any) min_value_any = GET_BUCKET_VALUE (h[i]); if (i == 0) i = header->buckets - 1; else i--; if (i == j) break; // don't hang if we have a 100% full .css file // fprintf (stderr, "-"); } if (min_value == OSBF_FEATUREBUCKET_VALUE_MAX) { /* no unlocked bucket avaiable so groom any */ groom_any = 1; min_value = min_value_any; } // now, move our index to the first bucket in this chain. i++; if (i >= header->buckets) i = 0; packstart = i; /* i = j = hindex % header->buckets; */ while (BUCKET_IN_CHAIN (h[i])) { i++; if (i == header->buckets) i = 0; if (i == packstart) break; // don't hang if we have a 100% full .cfc file } // now, our index is right after the last bucket in this chain. 
/* if there was a wraparound, full .cfc file, */ /* i == packstart and packlen == header->buckets */ if (i > packstart) packlen = i - packstart; else packlen = header->buckets + i - packstart; // This pruning method zeroes buckets with minimum count in the chain. // It tries first buckets with minimum distance to their right position, // to increase the chance of zeroing older buckets first. If none with // distance 0 is found, the distance is increased until at least one // bucket is zeroed. // // We keep track of how many buckets we've zeroed and we stop // zeroing additional buckets after that point. NO! BUG! That // messes up the tail length, and if we don't repack the tail, then // features in the tail can become permanently inaccessible! Therefore, // we really can't stop in the middle of the tail (well, we could // stop zeroing, but we need to pass the full length of the tail in. // // Note that we can't do this "adaptively" in packcss, because zeroes // there aren't necessarily overflow chain terminators (because -we- // might have inserted them here. // // GROT GROT GROT Note that the following algorithm does multiple // passes to find the lowest-valued features. In fact, that's // actually rather slow; a better algorithm would keep track of // the N least-valued features in the chain in ONE pass and zero // those. // // -- // I'm not sure if it's worth working on a better algoritm for this: // // This is a statistics report of microgroomings for 4147 messages // of the SpamAssassin corpus. It shows that 77% is done in a single // pass, 95.2% in 1 or 2 passes and 99% in at most 3 passes. // // # microgrommings passes % accum. % // 232584 1 76.6 76.6 // 56396 2 18.6 95.2 // 11172 3 3.7 98.9 // 2502 4 0.8 99.7 // 726 5 0.2 99.9 // ... 
// ----------- // 303773 // // If we consider only the last 100 microgroomings, when the css // file is full, we'll have the following numbers showing that most // microgroomings (61%) are still done in a single pass, almost 90% // is done in 1 or 2 passes and 97% are done in at most 3 passes: // // # microgrommings passes % accum. % // 61 1 61 61 // 27 2 27 88 // 9 3 9 97 // 3 4 3 100 // --- // 100 // // So, it's not so slow. Anyway, a better algorithm could be // implemented using 2 additional arrays, with MICROGROOM_STOP_AFTER // positions each, to store the indexes of the candidate buckets // found with distance equal to 1 or 2 while we scan for distance 0. // Those with distance 0 are zeroed immediatelly. If none with // distance 0 is found, we'll zero the indexes stored in the first // array. Again, if none is found in the first array, we'll try the // second one. Finally, if none is found in both arrays, the loop // will continue until one bucket is zeroed. // // But now comes the question: do the numbers above justify the // additional code/work? I'll try to find out the answer // implementing it :), but this has low priority for now. // // -- Fidelis Assis // // try features in their right place first max_distance = 1; /* zero up to 50% of packlen */ /* max_zeroed_buckets = (long) (0.5 * packlen + 0.5); */ max_zeroed_buckets = microgroom_stop_after; zeroed_countdown = max_zeroed_buckets; /*fprintf(stderr, "packstart: %ld, packlen: %ld, max_zeroed_buckets: %ld\n", packstart, packlen, max_zeroed_buckets); */ // while no bucket is zeroed... 
while (zeroed_countdown == max_zeroed_buckets) { /* fprintf(stderr, "Start: %ld, stop_after: %ld, max_distance: %ld\n", packstart, microgroom_stop_after, max_distance); */ i = packstart; while (BUCKET_IN_CHAIN (h[i]) && zeroed_countdown > 0) { // check if it's a candidate if (GET_BUCKET_VALUE (h[i]) == min_value && (!BUCKET_IS_LOCKED (h[i]) || (groom_any != 0))) { // if it is, check the distance distance = i - BUCKET_HASH (h[i]) % header->buckets; if (distance < 0) distance += header->buckets; if (distance < max_distance) { BUCKET_RAW_VALUE (h[i]) = 0; zeroed_countdown--; } } i++; if (i >= header->buckets) i = 0; } // if none was zeroed, increase the allowed distance between the // candidade's position and its right place. if (zeroed_countdown == max_zeroed_buckets) max_distance++; } /* fprintf (stderr, "Leaving microgroom: %ld buckets zeroed at distance %ld\n", microgroom_stop_after - zeroed_countdown, max_distance - 1); */ // now we pack the buckets crm_osbf_packcss (header, packstart, packlen); } void crm_osbf_packcss (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long packstart, unsigned long packlen) { // How we pack... // // We look at each bucket, and attempt to reinsert it at the "best" // place. We know at worst it will end up where it already is, and // at best it will end up lower (at a lower index) in the file, except // if it's in wraparound mode, in which case we know it will not get // back up past us (since the file must contain at least one empty) // and so it's still below us in the file. 
OSBF_FEATUREBUCKET_STRUCT *h; // make h point to the first feature bucket h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start; if (packstart + packlen <= header->buckets) // no wraparound in this case { crm_osbf_packseg (header, packstart, packlen); } else // wraparound mode - do it as two separate repacks { crm_osbf_packseg (header, packstart, (header->buckets - packstart)); crm_osbf_packseg (header, 0, (packlen - (header->buckets - packstart))); }; } void crm_osbf_packseg (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long packstart, unsigned long packlen) { unsigned long ifrom, ito; unsigned long thash, tkey; OSBF_FEATUREBUCKET_STRUCT *h; // make h point to the first feature bucket h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start; if (internal_trace) fprintf (stderr, " < %ld %ld >", packstart, packlen); // Our slot values are now somewhat in disorder because empty // buckets may now have been inserted into a chain where there used // to be placeholder buckets. We need to re-insert slot data in a // bucket where it will be found. for (ifrom = packstart; ifrom < packstart + packlen; ifrom++) { // Now find the next bucket to place somewhere thash = BUCKET_HASH (h[ifrom]); tkey = BUCKET_KEY (h[ifrom]); if (GET_BUCKET_VALUE (h[ifrom]) == 0) { if (internal_trace) fprintf (stderr, "X"); } else { ito = thash % header->buckets; // fprintf (stderr, "a %ld", ito); while (BUCKET_IN_CHAIN (h[ito]) && !BUCKET_HASH_COMPARE (h[ito], thash, tkey)) { ito++; if (ito >= header->buckets) ito = 0; } // found an empty slot, put this value there, and zero the // original one. Sometimes this is a noop. We don't care. 
if (ito != ifrom) { BUCKET_HASH (h[ito]) = thash; BUCKET_KEY (h[ito]) = tkey; // move value and lock together BUCKET_RAW_VALUE (h[ito]) = BUCKET_RAW_VALUE (h[ifrom]); // clean "from" bucket BUCKET_HASH (h[ifrom]) = 0; BUCKET_KEY (h[ifrom]) = 0; BUCKET_RAW_VALUE (h[ifrom]) = 0; } if (internal_trace) { if (ifrom == ito) fprintf (stderr, "="); if (ito < ifrom) fprintf (stderr, "<"); if (ito > ifrom) fprintf (stderr, ">"); }; }; }; } /* get next bucket index */ unsigned long crm_osbf_next_bindex (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long hindex) { hindex++; if (hindex >= header->buckets) hindex = 0; return hindex; } /* get index of the last bucket in a chain */ unsigned long crm_osbf_last_in_chain (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long hindex) { unsigned long wraparound; OSBF_FEATUREBUCKET_STRUCT *hashes; hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start; /* if the bucket is not in a chain, return an index */ /* out of the buckets space, equal to the number of */ /* buckets in the file to indicate an empty chain */ if (!BUCKET_IN_CHAIN (hashes[hindex])) return header->buckets; wraparound = hindex; while (BUCKET_IN_CHAIN (hashes[hindex])) { hindex++; if (hindex >= header->buckets) hindex = 0; /* if .cfc file is full return an index out of */ /* the buckets space, equal to number of buckets */ /* in the file, plus one */ if (hindex == wraparound) return header->buckets + 1; } hindex = crm_osbf_prev_bindex (header, hindex); return hindex; } /* get previous bucket index */ unsigned long crm_osbf_prev_bindex (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long hindex) { if (hindex == 0) hindex = header->buckets - 1; else hindex--; return hindex; } /* get index of the first bucket in a chain */ unsigned long crm_osbf_first_in_chain (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long hindex) { unsigned long wraparound; OSBF_FEATUREBUCKET_STRUCT *hashes; hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start; /* if the 
bucket is not in a chain, return an index */ /* out of the buckets space, equal to the number of */ /* buckets in the file to indicate an empty chain */ if (!BUCKET_IN_CHAIN (hashes[hindex])) return header->buckets; wraparound = hindex; while (BUCKET_IN_CHAIN (hashes[hindex])) { if (hindex == 0) hindex = header->buckets - 1; else hindex--; /* if .cfc file is full return an index out of */ /* the buckets space, equal to number of buckets */ /* in the file, plus one */ if (hindex == wraparound) return header->buckets + 1; } return crm_osbf_next_bindex (header, hindex); } unsigned long crm_osbf_find_bucket (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long hash, unsigned long key) { OSBF_FEATUREBUCKET_STRUCT *hashes; unsigned long hindex, start; hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start; hindex = start = hash % header->buckets; while (!BUCKET_HASH_COMPARE (hashes[hindex], hash, key) && !EMPTY_BUCKET (hashes[hindex])) { hindex = crm_osbf_next_bindex (header, hindex); /* if .cfc file is completely full return an index */ /* out of the buckets space, equal to number of buckets */ /* in the file, plus one */ if (hindex == start) return header->buckets + 1; } /* return the index of the found bucket or, if not found, * the index of a free bucket where it could be put */ return hindex; } void crm_osbf_update_bucket (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long bindex, int delta) { OSBF_FEATUREBUCKET_STRUCT *hashes; hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start; /* fprintf (stderr, "Bucket updated at %lu, hash: %lu, key: %lu, value: %d\n", bindex, hashes[bindex].hash, hashes[bindex].key, delta); */ if (delta > 0 && GET_BUCKET_VALUE (hashes[bindex]) + delta >= OSBF_FEATUREBUCKET_VALUE_MAX - 1) { SETL_BUCKET_VALUE (hashes[bindex], OSBF_FEATUREBUCKET_VALUE_MAX - 1); } else if (delta < 0 && GET_BUCKET_VALUE (hashes[bindex]) <= -delta) { BUCKET_RAW_VALUE (hashes[bindex]) = 0; BUCKET_HASH (hashes[bindex]) = 0; BUCKET_KEY 
(hashes[bindex]) = 0; /* pack chain */ { long i, j, packlen; i = crm_osbf_next_bindex (header, bindex); j = crm_osbf_last_in_chain (header, i); /* if there's a valid chain tail starting at i, pack it */ if (j < header->buckets) { if (j >= i) packlen = j - i + 1; else packlen = header->buckets + 1 - (i - j); crm_osbf_packcss (header, i, packlen); } } } else { SETL_BUCKET_VALUE (hashes[bindex], GET_BUCKET_VALUE (hashes[bindex]) + delta); } } void crm_osbf_insert_bucket (OSBF_FEATURE_HEADER_STRUCT * header, unsigned long bindex, unsigned long hash, unsigned long key, int value) { unsigned long hindex, distance; OSBF_FEATUREBUCKET_STRUCT *hashes; if (microgroom_chain_length == 0) microgroom_chain_length = OSBF_MICROGROOM_CHAIN_LENGTH; hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start; /* "right" bucket position */ hindex = hash % header->buckets; /* distance from right position to free position */ distance = (bindex >= hindex) ? bindex - hindex : header->buckets - (hindex - bindex); if ((osbf_microgroom != 0) && (value > 0)) while (distance > microgroom_chain_length) { /* fprintf (stderr, "hindex: %lu, bindex: %lu, distance: %lu\n", hindex, bindex, distance); */ crm_osbf_microgroom (header, crm_osbf_prev_bindex (header, bindex)); /* get new free bucket index */ bindex = crm_osbf_find_bucket (header, hash, key); distance = (bindex >= hindex) ? 
bindex - hindex : header->buckets - (hindex - bindex); } /* fprintf (stderr, "new bucket at %lu, hash: %lu, key: %lu, distance: %lu\n", bindex, hash, key, distance); */ SETL_BUCKET_VALUE (hashes[bindex], value); BUCKET_HASH (hashes[bindex]) = hash; BUCKET_KEY (hashes[bindex]) = key; } static OSBF_HEADER_UNION hu; int crm_osbf_create_cssfile (char *cssfile, unsigned long buckets, unsigned long major, unsigned long minor, unsigned long spectrum_start) { FILE *f; long i; OSBF_FEATUREBUCKET_STRUCT feature = { 0, 0, 0 }; if (user_trace) fprintf (stderr, "Opening file %s for read/write\n", cssfile); f = fopen (cssfile, "wb"); if (!f) { fatalerror ("Couldn't open the new .cfc file for writing; file = ", cssfile); }; // Set the header. *((unsigned long *) hu.header.version) = major; // quick hack for now... hu.header.flags = minor; hu.header.learnings = 0; hu.header.buckets = buckets; hu.header.buckets_start = OSBF_CSS_SPECTRA_START; // Write header if (fwrite (&hu, sizeof (hu), 1, f) != 1) { fatalerror (" Couldn't initialize the .cfc file header; file = ", cssfile); } // Initialize CSS hashes - zero all buckets for (i = 0; i < buckets; i++) { // Write buckets if (fwrite (&feature, sizeof (feature), 1, f) != 1) { fatalerror (" Couldn't initialize the .cfc buckets; file = ", cssfile); } } fclose (f); return (EXIT_SUCCESS); } crm114-20100106-BlameMichelson.src/classifymail.crm0000755000000000017500000000315611321154266020170 0ustar rootwsy#! /usr/bin/crm # # classifymail.crm - a quick classify only # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # this ISOLATE will guarantee that :fileprefix: exists, and keep it's # prior (commandline) value if it does, and an empty string if it doesnt isolate (:fileprefix:) # This is the code to read the per-user configuration. Note # that because this happens during the run, it will _override_ # any comand line arguments that get set. 
{ isolate (:option_txt:) isolate (:ev:) isolate (:verbose_startup:) # read in the mail filter configuration file. input [:*:fileprefix:mailfilter.cf] (:option_txt:) # output /options == :*:option_txt:\n/ # reset loop for matching to start of :option_txt: match [:option_txt:] // # and loop till there are no more options. { # find a line that looks like a parameter setting... match < fromend nomultiline > (:line: :name: :value:) \ [:option_txt:] /^([[:graph:]]+).*\/(.*)\/.*$/ { # don't execute the assign if there's a # at the start of the line. match [:name:] /^\x23/ { # debugging print match [:verbose_startup:] /SET/ output / :*:name:\n :*:value:\n/ } isolate (:*:name:) /:*:value:/ } liaf } } # now do the actual work { # match (:data:) /.*/ match (:data:) [:_dw: 0 :*:decision_length:] /.*/ isolate (:stats:) { classify [:data:] <:*:clf:> ( :*:fileprefix:nonspam.css | :*:fileprefix:spam.css ) (:stats:) /:*:lcr:/ output / file nonspam.css matches better \n:*:stats::*:_nl:/ exit /0/ } output / file spam.css matches better \n:*:stats::*:_nl:/ exit /1/ } crm114-20100106-BlameMichelson.src/blacklist.mfp.example0000644000000000017500000000000011321154266021071 0ustar rootwsycrm114-20100106-BlameMichelson.src/mailfilter.crm0000755000000000017500000012612111321154266017636 0ustar rootwsy#! /usr/bin/crm # -(learnspam learnnonspam learnfile stats_only config spamcss nonspamcss fileprefix force unlearn cache) # # mailfilter.crm - Statistical mail sorter # # Note to SunOS and FreeBSD users - do not place command arguments of # "-([arguments])" format on the first line of this program # or you will not get what you expect. This is due to a kernel # difference in how a bangline should be dealt with. # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. 
# A statistical mail sorter with mail-to-yourself commanding # # --->>> Design Philosophy ( do these IN ORDER ) # # * if --fileprefix is specified, all filenames EXCEPT --config # are prefixed with that (You need a trailing slash on the prefix # if it is a directory name.) # * if --config , grab the config file from the specified place. # * Load the mailfilter.cf config file from wherever config or # fileprefix points (--config overrides --fileprefix). # * If --spamcss is specified, use that as the spam.css file # * If --nonspamcss is specified, use that as the nonspam.css file # * If --learnspam, learn as spam and exit. # * If --learnnonspam, learn as nonspam and exit # * If --force, force-feed the learning # * If --learnfile, use :learnfile:.css and :learnfile:text.txt # * If --stats_only, do normal classification but don't do any # forwarding, only output the status on stdout and return # the exit code. # # * check for the "command word", if present, execute the command # # * check to see if any of the whitelist patterns apply. If so, # accept the mail to /var/spool/the_user (which is actually to # be found at /var/spool/mail/:*:_env_USER: # # * check to see if any of the blacklist patterns apply. If so, # flush the mail to the "blacklisted" file. # # * check to see if it's commanded to be learned as a spam or a # nonspam model. If so, learn it to the appropriate .css (Crm # Sparse Spectra) file # # * run the email through the classifier. If the classifier thinks # it's good, send it to the /var/spool/mail/the_user file, else # send it to the "doubtful" file. 
# ############################################################## # # --- uncomment this if you want to include a "forced" # configuration file --- # insert mailfilterconfig.crm # # # --- These vars must have a value, or else we'll get errors ---- # isolate (:classifier_reason:) /no reason yet/ # isolate (:classify_status:) // # isolate (:our_exit_code:) /0/ # isolate (:stats:) / pR: 0.000000 / # isolate (:pr:) / pR: 0.00000/ # isolate (:subj_text:) / (None) / # isolate (:add_extra_stuff:) // # isolate (:decision_length:) /4096/ # isolate (:cache:) // isolate (:cachedir:) // isolate (:cacheid:) // isolate (:msg_hash:) // isolate (:learnspam:) isolate (:learnnonspam:) isolate (:learnfile:) isolate (:stats_only:) isolate (:automatic_training:) # # Isolate these email addresses, and give them values, # in case the user doesn't. isolate (:reject_address:) // isolate (:fail_priority_mail_to:) // isolate (:fail_blacklist_mail_to:) // isolate (:fail_SSM_mail_to:) // isolate (:log_rejections:) // # # this ISOLATE will guarantee that :fileprefix: exists, and keep it's # prior (commandline) value if it does, and an empty string if it doesnt isolate (:fileprefix:) # # This ISOLATE will guarantee that :force: will exist, and keep the # commandline value ("SET") , or the null string if the user doesn't # use --force on the command line. isolate (:force:) # # This ISOLATE will guarantee that :unlearn: will exist, and will keep # the commandline value ("SET") or the null string if the user doesn't # use --unlearn on the command line. isolate (:unlearn:) # # now, :clf: is the classify & learn flags; note that we have two # separate flags here in a bizarre chain. The reason is that :unlearn: # can have the value "SET", whereas :rft: needs "refute" isolate (:clf:) // # # and someplace to catch mailtrainer if we need it. isolate (:mailtrainer_output:) // # ##################################################################### # # This is the code to read the per-user configuration. 
Note # that because this happens during the run, it will _override_ # any comand line arguments that get set. { isolate (:option_txt:) isolate (:ev:) isolate (:verbose_startup:) isolate (:config:) # # Part 1 - read in the options/configuration file # { { match [:config:] /.+/ input [:*:config:] (:option_txt:) } alius { # read in the standard mail filter configuration file. input [:*:fileprefix:mailfilter.cf] (:option_txt:) } } # # # reset loop for matching to start of :option_txt: match [:option_txt:] // # and loop till there are no more options. { # find a line that looks like a parameter setting... match < fromend nomultiline > (:line: :name: :value:) \ [:option_txt:] /^[ ]*(:[[:graph:]]+:)[ \t]+\/(.*)\// { # don't execute the assign if there's # a # at the start of the line. match [:name:] /^\x23/ { # Verbose startup? match [:verbose_startup:] /SET/ output / :*:name:\n :*:value:\n/ } isolate (:*:name:) /:*:value:/ } liaf } } # # Now, a tricky bit - we need to add "unlearn" to the :clf: # if it was in the line params - but we have to append, not replace, # because :clf: (CLassfier Flags) also contains the classifier we use. # { match [:unlearn:] /SET/ alter (:clf:) /:*:clf: refute/ } # # # Do a quick check- has the password been changed or not? If it's # still the default, put in something that will be well-nigh unguessable # (esp. since it will contain recieved headers that the sender cannot # see nor control.) { match [:spw:] /DEFAULT_PASSWORD/ # yes, it's the same as default. 
So we scramble it just so # nobody can hack in hash (:spw:) /:*:_env_string::*:_dw:/ } ############################################################# # # Set up the addresses that we might need to mail to # isolate (:reject_address:) /:*:general_fails_to:/ { match [:fail_priority_mail_to:] /[[:graph:]]/ alter (:fail_priority_mail_to:) /:*:general_fails_to:/ } { match [:fail_blacklist_mail_to:] /[[:graph:]]/ alter (:fail_blacklist_mail_to:) /:*:general_fails_to:/ } { match [:fail_SSM_mail_to:] /[[:graph:]]/ alter (:fail_SSM_mail_to:) /:*:general_fails_to:/ } ############################################################### # Does the user want us to log all incoming mail? This is handy for # testing and auditing purposes. { match [:log_to_allmail.txt:] /yes/ output [:*:fileprefix:allmail.txt] /:*:_dw:/ } ############################################################### # # Is text cacheing turned on? { match [:text_cache:] /./ { ### If the text_cache dir isn't there, create it # and it's subdirectories. # isolate (:tmp:) // syscall () (:tmp:) /ls :*:text_cache: 2>&1 / match [:tmp:] /texts/ syscall () () /mkdir -p :*:text_cache: / syscall () () /mkdir -p :*:text_cache:\/texts / syscall () () /mkdir :*:text_cache:\/prob_good / syscall () () /mkdir :*:text_cache:\/prob_spam / syscall () () /mkdir :*:text_cache:\/known_good / syscall () () /mkdir :*:text_cache:\/known_spam / syscall () () /mkdir :*:text_cache:\/empty / } # # Yes, text cacheing is on. Save the input as is in the cache. # isolate (:system_time:) // syscall () (:system_time:) /date +%Y%m%d_%H%M%S_%N / match [:system_time:] (:: :cacheid:) /([[:graph:]]+)..../ hash (:msg_hash:) /:*:_dw:/ alter (:cacheid:) /sfid-:*:cacheid:_:*:msg_hash:/ # As long as this isn't a "learn" run, nor a "stats-only" run, # we should save the text of this message in the text cache directory. 
# Note to self: eventually this should also work with the # command spam nonspam stuff { match [:stats_only:] /SET/ match [:learnspam:] /SET/ match [:learnnonspam:] /SET/ output [:*:text_cache:\/texts\/:*:cacheid:] /:*:_dw:/ } } ############################################################## # # Grab the text that we're going to actually work with. # # We copy this into m_text - the "mutilated text". It # will become an annotated _copy_ of the incoming text, # with whatever changes we think will help us classify better. # # We clip m_text to be the first :decision_length: characters of # the incoming mail. # match (:m_text:) [:_dw: 0 :*:decision_length:] /.*/ isolate (:m_text:) # # :b_text: is the text with base64's expanded. isolate (:b_text:) /:*:m_text:/ # # :i_text: is the text with Hypertextus Interruptus removed. isolate (:i_text:) /:*:m_text:/ # # # To start with, the commanded text is assumed to be the entire input. # THEN # If there's a command followed by text, we save the text so we can # put that, and _only_ that, into the .txt corpi. { isolate (:cmd_txt:) /:*:_dw:/ match (:: :cmd_txt:) [:_dw:] /command :*:spw: [^\n]*\n(.*)/ } # # # do we do any expansions? { # expansion 1: - do we perform base64 expansions? { { match [:do_base64:] /yes/ { # yes, expand base64's if there are any # # Note: some spams don't even bother to use # a 'Content-Transfer-Encoding' marker, # and even fewer use Content-Type: text/whatever # so we have to sort of wing it, when to expand # what _might_ be base64 and when to ignore it. # For now, if it says it's a base64, it gets # expanded, no matter what the type. Maybe # someday someone will put in a lockout for # things like .jpg files, .doc files, etc. 
# isolate (:exp_text:) match [:b_text:] (:a: :h: :b:) \ /(Content-Transfer-Encoding): base64(.*)/ match (:c:) [:b:] \ /([a-zA-Z0-9+=!\/]+:*:_nl:){2,200}/ # syscall (:*:c:) (:exp_text:) /:*:mime_decoder: / # and stuff the result back into b_text for # classification right in context. alter (:c:) /:*:exp_text:/ # and mark this piece of mime as "prior". alter (:h:) /Content-Transfer-Prior-Encoding/ # repeat till no more Mime base64 encodings liaf } } alius { # if no base64 expansions enabled, empty out :b_text: # alter (:b_text:) // } } # # If we had expansions, bust the html contents out of them, otherwise # ignore b_text as it's redundant { { match [:b_text:] /Content-Transfer-Prior-Encoding/ alter (:i_text:) /:*:b_text:/ } alius { # if :b_text: _didn't_ have a base64, it's useless alter (:b_text:) // } } # expansion 2 : do we bust HTML comments ( a.k.a. # hypertextus interruptus) out? { match [:undo_interruptus:] /yes/ isolate (:commentbin:) // { match [:i_text:] (:comment:) // alter (:commentbin:) /:*:commentbin: :*:comment:/ alter (:comment:) // liaf } # if we had at least 80 characters worth of comments, then # it's worth using the decommented text, else not. # (this my personal judgement call) { { match [:commentbin:] /(.){80,}/ } alius { alter (:i_text:) // } } } } # and reassemble the mucked-over text into the :m_text: var, always # with the base64's expanded, then a second decommented copy # { isolate (:m_text:) \ /:*:m_text: :*:_nl: :*:b_text: :*:_nl: :*:i_text: :*:_nl:/ } ######################################################### # # Do we want to do any rewrites before running? # { match [:rewrites_enabled:] /yes/ isolate (:rewrites:) input (:rewrites:) [:*:fileprefix:rewrites.mfp] # reset matching on rewrites to start of string - if no string, no more # processing of rewrites !! match [:rewrites:] // # # { # Grab the next regex; turn the one-per-line patterns into a # regex and a replacement string. # First, do the line-spanning regexes. 
match (:ch: :fr: :to:) [:rewrites:] /(.+)>-->(.*)/ # see if the "fr" regex matches anywhere { match [:m_text:] (:place:) /:*:fr:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ liaf } # Nope, didn't match... grab the next regex and try again, liaf } # # reset back to the start of the rewrites. # match [:rewrites:] // # # and do it again for non-line-spanners { # Go through and do it again, except this time do it for # the non-line-spanning regexes. match (:ch: :fr: :to:) [:rewrites:] /(.+)>->(.*)/ # see if the "fr" regex matches anywhere { match [:m_text:] (:place:) /:*:fr:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ liaf } # Nope, didn't match... grab the next regex and try again, liaf } } # done with rewrites. ################################################################### # # Command Dispatch processing starts here # # ---------do we have a --learnspam or --learnnonspam command line key? # match (:text:) [:m_text:] /.*/ isolate (:c:) // isolate ( :spamcss: :nonspamcss: ) { match [:spamcss:] /./ alter (:spamcss:) /spam.css/ } { match [:nonspamcss:] /./ alter (:nonspamcss:) /nonspam.css/ } { match [:learnspam:] /SET/ goto /:learnspamhere:/ } { match [:learnnonspam:] /SET/ goto /:learnnonspamhere:/ } { match (:trash: :file:) [:learnfile:] /(.+)/ goto /:learntofilehere:/ } # # #------------ Are we enabled for "inoculations" via email? # { match [:inoculations_enabled:] /yes/ # # see if we have an inoculation header. # match [:m_text:] \ /Inoculation-Sender: ([[:graph:]]+)/ (:x: :sender:) match [:m_text:] \ /Inoculation-Type: ([[:graph:]]+)/ (:x: :type:) match [:m_text:] \ /Inoculation-Authentication: (.*)$/ (:x: :auth:) # # See if the sender is in our list, and if so, what is their secret? 
# isolate (:inoculation_passwd:) // input (:inoculation_passwd:) [:*:fileprefix:inoc_passwd.txt] match [:inoculation_passwd:] \ /:*:sender: :*:type: (.*)/ (:x: :secret:) # # We now have the shared secret, calculate the checksum we should have # # grab the body... match /\n\n(.*)/ (:x: :body:) # # and calculate the hash. isolate (:md5out:) syscall (:*:secret::*:_nl::*:body:) (:md5out:) /md5sum/ match [:md5out:] /([[:graph:]]+)/ (:x: :md5out:) # # does this hash match with the given hash? match [:auth:] /:*:md5out:/ # # Yes, it matched. It's a real innoculation. # # grab the text we want to actually learn (this is the payload) match [:m_text:] (:x: :text:) /\n\n(.*)/ # # and learn it appropriately { match [:type:] /nonspam/ goto /:learnnonspamhere:/ } alius { match [:type:] /spam/ goto /:learnspamhere:/ } } # # -------------check for the COMMAND WORD ---------- # { # # grab the password as :pw:, and any arg(s) as :c: # match (:z: :pw: :c: ) [:_dw:] /^command ([[:graph:]]+) (.*)/ # # check the password. If it's invalid, FAIL out of this bracket set # and just treat this as ordinary (non-command) mail. match [:pw:] /:*:spw:/ { # was it a command to add something to the whitelist? match (:q: :a:) [:c:] /whitelist (.*)/ output [:*:fileprefix:whitelist.mfp] /:*:a::*:_nl:/ alter (:z:) /*** :*:z: *** :*:_nl:Whitelist command executed! :*:_nl:/ accept exit /:*:accepted_mail_exit_code:/ } { # was it a command to add something to the blacklist? match (:q: :a:) [:c:] /blacklist (.*)/ output [:*:fileprefix:blacklist.mfp] /:*:a::*:_nl:/ alter (:z:) /*** :*:z: *** :*:_nl:Blacklist command executed! :*:_nl:/ accept exit /:*:accepted_mail_exit_code:/ } # # Did the user specify command "force"? # { match [:c:] /force/ # # yep, so we set the "force" on. alter (:force:) /SET/ } # # Did the user specify command "unlearn"? # { match [:c:] /unlearn/ # # yep, so we set the "force" on. 
alter (:unlearn:) /SET/ } # # Now, if :unlearn: is set, by either bashline or command, we # set the :clf: flag to be "refute". Otherwise, we set it to # be what it was before. # { match [:unlearn:] /SET/ alter (:clf:) /:*:clf: refute/ } # # Now, the big mahonka. Learn as nonspam, or as spam # (note the three subpaths - one each for non-forced, forced, and # non-forced error messages) # { # was it a command to learn something as nonspam? match [:c:] /nonspam/ match (:z: :text:) [:m_text:] /:*:_nl:command [[:graph:]]+ nonspam(.*)/ # and learn it as nonspam :learnnonspamhere: { # Are we supposed to use the cached version? { match [:c:] /cache/ alter (:cache:) /SET/ } match [:cache:] /SET/ # can also be set on command line # yes - so we use mailtrainer.crm to do the training { match (:: :fileid:) /X-CRM114-CacheID: ([[:graph:]]+)/ # check- does the cached file exist? syscall () (:tmp:) /ls :*:text_cache:\/texts\/:*:fileid:/ match [:tmp:] /:*:fileid:/ # yes, it exists - go on with the learning method # and remember this file on a permanent basis syscall /ln :*:text_cache:\/texts\/:*:fileid: :*:text_cache:\/known_good\/:*:fileid: / # output / \n DOING: crm mailtrainer.crm --good=:*:text_cache:\/texts\/:*:fileid: --spam=:*:text_cache:\/empty\/ \n / () (:mailtrainer_output:) syscall /crm mailtrainer.crm --fileprefix :*:fileprefix: --good=:*:text_cache:\/texts\/:*:fileid: --spam=:*:text_cache:\/empty\/ / () (:mailtrainer_output:) # output /mailtrainer output: ':*:mailtrainer_output:'\n/ # and remove it from the prob_* directories, as # now it's known syscall /rm -rf :*:text_cache:\/prob_spam\/:*:fileid:/ syscall /rm -rf :*:text_cache:\/prob_good\/:*:fileid:/ # # now it's trained; put in a marker in the headers call /:mungmail_delete:/ [X-CRM114-Status: ] call /:mungmail_delete:/ [X-CRM114-Unsure: ] call /:mungmail_add:/ [X-CRM114-Action: LEARNED AND CACHED NONSPAM ] # Insert the training report into the msgbody if desired { match [:add_mailtrainer_report:] /yes/ match (:: 
:firstline:) /.*(.)/ match (:: :firstline:) /\n\n()/ alter (:firstline:) / :*:mailtrainer_output:\n-----\n/ } accept exit /:*:accepted_mail_exit_code:/ } { # No, it didn't exist. Add an error message header. call /:mungmail_add:/ [X-CRM114-ERROR: No cached text with that cacheID, so nothing done!] accept exit /:*:accepted_mail_exit_code:/ } } { # No cacheing, so we learn "natively" # # Verify that we need to learn this first (TOE strategy) classify <:*:clf:> [:text:] /:*:lcr:/ \ (:*:fileprefix::*:nonspamcss: :*:fileprefix::*:spamcss: ) \ (:classify_status:) match [:classify_status:] \ /^#0.* pR: ([-. 0-9]+)/ (:: :pr:) eval /:@: :*:pr: < :*:thick_threshold: :/ output [:*:fileprefix:nonspamtext.txt] \ /\n\n:*:cmd_txt:\n/ # # write out the pre-mutilation text, with newlines # learn <:*:clf:> (:*:fileprefix::*:nonspamcss:) [:text:] /:*:lcr:/ call /:mungmail_add:/ [X-CRM114-Action: LEARNED NONSPAM] call /:mungmail_unique:/ [X-CRM114-Status: Good (Learn)] accept exit /:*:accepted_mail_exit_code:/ accept exit /:*:accepted_mail_exit_code:/ } alius { # # Did the user specify "--force" on the command line? match [:force:] /SET/ output [:*:fileprefix:nonspamtext.txt] /\n\n:*:cmd_txt:\n/ # # write out the pre-mutilation text, with newlines # learn < :*:clf: > (:*:fileprefix::*:nonspamcss:) [:text:] /:*:lcr:/ # syscall (:*:_dw:) (:_dw:) \ # /formail -A "X-CRM114-Action: LEARNED NONSPAM (FORCED) :*:clf:"/ call /:mungmail_add:/ [X-CRM114-Action: LEARNED NONSPAM (FORCED)] call /:mungmail_unique:/ [X-CRM114-Status: Good (Learn)] accept exit /:*:accepted_mail_exit_code:/ } alius { call /:mungmail_add:/ [X-CRM114-Action: LEARN AS NONSPAM UNNECESSARY- ALREADY CLASSIFIED CORRECTLY - NO ACTION TAKEN] accept exit /:*:accepted_mail_exit_code:/ } } { # was it a command to learn something as spam? match [:c:] /spam/ match (:z: :text:) [:m_text:] /:*:_nl:command [[:graph:]]+ spam(.*)/ # and learn it as spam :learnspamhere: { # Are we supposed to use the cached version? 
{ match [:c:] /cache/ alter (:cache:) /SET/ } match [:cache:] /SET/ # can also be set on command line # check- does the cached file exist? { # yes - so we use mailtrainer.crm to do the training match (:: :fileid:) /X-CRM114-CacheID: ([[:graph:]]+)/ syscall () (:tmp:) /ls :*:text_cache:\/texts\/:*:fileid:/ match [:tmp:] /:*:fileid:/ # remember this file on a permanent basis syscall /ln :*:text_cache:\/texts\/:*:fileid: :*:text_cache:\/known_spam\/:*:fileid: / #output /DOING: crm mailtrainer.crm --spam=:*:text_cache:\/known_spam\/:*:fileid: --good=:*:text_cache:\/empty\/ / () (:mailtrainer_output:) syscall /crm mailtrainer.crm --fileprefix :*:fileprefix: --spam=:*:text_cache:\/known_spam\/:*:fileid: --good=:*:text_cache:\/empty\/ / () (:mailtrainer_output:) #output /mailtrainer output: ':*:mailtrainer_output:'\n/ # and remove it from the prob_* directories, as # now it's known syscall /rm -rf :*:text_cache:\/prob_spam\/:*:fileid:/ syscall /rm -rf :*:text_cache:\/prob_good\/:*:fileid:/ # # now it's trained; put in a marker in the headers call /:mungmail_delete:/ [X-CRM114-Status: ] call /:mungmail_delete:/ [X-CRM114-Unsure: ] call /:mungmail_add:/ [X-CRM114-Action: LEARNED AND CACHED SPAM] # Insert the training report in the msgbody, if desired { match [:add_mailtrainer_report:] /yes/ match (:: :firstline:) /.*(.)/ match (:: :firstline:) /\n\n()/ alter (:firstline:) / :*:mailtrainer_output:\n ------ \n/ } accept exit /:*:accepted_mail_exit_code:/ } alius { call /:mungmail_add:/ [X-CRM114-ERROR: No cached text with that cacheID, so nothing done! ] accept exit /:*:accepted_mail_exit_code:/ } } { # Not cached... # # Verify that we need to learn this first (TOE strategy) classify <:*:clf:> [:text:] /:*:lcr:/ \ (:*:fileprefix::*:spamcss: :*:fileprefix::*:nonspamcss: )\ (:classify_status:) match [:classify_status:] \ /^#0.* pR: ([-. 
0-9]+)/ (:: :pr:) eval /:@: :*:pr: < :*:thick_threshold: : / # # write out the pre-mutilation text, with newlines # output [:*:fileprefix:spamtext.txt] /\n\n:*:cmd_txt: \n/ learn < :*:clf:> (:*:fileprefix::*:spamcss:) [:text:] /:*:lcr:/ call /:mungmail_add:/ [X-CRM114-Action: LEARNED SPAM] call /:mungmail_unique:/ [X-CRM114-Status: Good (Spam Learn)] accept exit /:*:accepted_mail_exit_code:/ } alius { # Did the user specify "--force" on the command line? match [:force:] /SET/ # # write out the pre-mutilation text, with newlines # output [:*:fileprefix:spamtext.txt] /\n\n:*:cmd_txt: \n/ learn < :*:clf:> (:*:fileprefix::*:spamcss:) [:text:] /:*:lcr:/ call /:mungmail_add:/ [X-CRM114-Action: LEARNED SPAM (FORCED)] call /:mungmail_unique:/ [X-CRM114-Status: Good (Spam Learn)] accept exit /:*:accepted_mail_exit_code:/ } alius { call /:mungmail_add:/ [X-CRM114-Action: LEARN AS SPAM UNNECESSARY- ALREADY CLASSIFIED CORRECTLY - NO ACTION TAKEN] call /:mungmail_unique:/ [X-CRM114-Status: Good (Spam Learn)] accept exit /:*:accepted_mail_exit_code:/ } } { # was it a command to learn something as an arbitrary type? # Note: the files this generates don't get used for anything unless # you use --spamcss and --nonspamcss in your own scripts. # # Note: these "learns" are a-priori "force", since we don't know # what other .css files we should compare this text to. 
# match [:c:] /learn/ match (:z: :learnfile: :text:) [:m_text:] /:*:_nl:command [[:graph:]]+ learn ([[:graph:]]+)(.*)/ # and learn it :learntofilehere: output [:*:fileprefix::*:learnfile:text.txt] /:*:text:/ learn < :*:clf:> (:*:fileprefix::*:learnfile:.css) [:text:] /:*:lcr:/ # syscall (:*:_dw:) (:_dw:) /formail -A "X-CRM114-Action: LEARNED :*:file: :*:clf:"/ call /:mungmail_add:/ [X-CRM114-Action: LEARNED :*:file:] call /:mungmail_unique:/ [X-CRM114-Status: Good (Learn)] accept exit /:*:accepted_mail_exit_code:/ } } # # # George's Circuit Breaker - if the mail has already been processed # by CRM114, then send it directly to output, without further # processing. # # WE DON'T RISK THIS ANY MORE - WITH ~ A MILLION USERS, WE'RE NOW # A TARGET FOR SPAMMERS TO USE THIS HACK. #{ # match /X-CRM114/ # alter (:classifier_reason:) \ # / This mail seems to have already been processed by CRM114. / # alter (:stats:) / pR: 999.99 / # goto /:looks_good:/ #} # none of the above - classify this incoming mail instead. # first according to priority action list, # then according to whitelist, # then according to blacklist, # then according to the CRM sparse spectral classifier. # # check it against the priority action list- this list is # of the form of a + or -, then a pattern. + means accept, # - means reject. These are executed in order (which is # different from whitelist or blacklist in that they occur # in order given, not whitelist-then-blacklist. The priority # action list is tried before whitelist or blacklist. # isolate (:priolist:) input (:priolist:) [:*:fileprefix:priolist.mfp] # reset matching on :priolist: to the start of the string match [:priolist:] // # # { #... Grab the next regexturn the one-per-line patterns into a regex match (:w: :pm: :pat:) [:priolist:] /(.)(.+)/ #... see if this regex matches the incoming mail { match (:reason:) /:*:pat:/ # Yep, it matched... 
branch based on pm # { match [:pm:] /[+]/ # put in a little tag saying why prio-listed alter (:classifier_reason:) /** ACCEPT: CRM114 Priority Whitelisted by: :*:reason: **:*:_nl:/ alter (:stats:) / pR: 999.99 / goto /:looks_good:/ } # No, we didn't have a +, was it a '-'? { match [:pm:] /[-]/ alter (:classifier_reason:) /** REJECT: CRM114 Priority Blacklisted by: :*:reason: **:*:_nl:/ alter (:reject_address:) /:*:fail_priority_mail_to:/ { match [:log_rejections:] /yes/ output [:*:fileprefix:rejected_by_blacklist.txt] /:*:_dw:/ } alter (:stats:) / pR: -999.99 / goto /:looks_bad:/ } } # Nope, didn't match as a priority... grab the next regex liaf } # # # check it against the whitelist... load the whitelist... { isolate (:whitelist:) input (:whitelist:) [:*:fileprefix:whitelist.mfp] # reset matching on whitelist to start of string match [:whitelist:] // } # # { #... Grab the next regexturn the one-per-line patterns into a regex match (:waste: :whregex:) [:whitelist:] /(.+)/ #... see if this regex matches the incoming mail { match (:reason:) /:*:whregex:/ # Yep, it matched... whitelist this email # # put in a little tag saying why whitelisted: alter (:classifier_reason:) /** ACCEPT: CRM114 Whitelisted by: :*:reason: **:*:_nl:/ alter (:_dw:) /:*:_dw:\n\n ** CRM114 Whitelisted by: :*:reason: **:*:_nl:/ alter (:stats:) / pR: 999.99 / goto /:looks_good:/ } # Nope, didn't match... grab the next regex and try again, liaf } # # No joy, maybe we should blacklist it. # # check it against the blacklist { isolate (:blacklist:) input (:blacklist:) [:*:fileprefix:blacklist.mfp] # reset matching on blacklist to start of string match [:blacklist:] // } # { #... Grab the next regexturn the one-per-line patterns into a regex match (:waste: :blregex:) [:blacklist:] /(.+)/ #... see if this regex matches the incoming mail { match (:reason:) /:*:blregex:/ # Yep, it matched... 
blacklist this email # # put in a little tag saying why blacklisted alter (:classifier_reason:) /** REJECT: CRM114 Blacklisted by: :*:reason: ** :*:_nl:/ alter (:reject_address:) /:*:fail_blacklist_mail_to:/ { match [:log_rejections:] /yes/ output [:*:fileprefix:rejected_by_blacklist.txt] /:*:_dw:/ } alter (:stats:) / pR: -999.99 / goto /:looks_bad:/ } # Nope, didn't match... grab the next regex and try again liaf } # # # # End of blacklist processing. # # # All else has failed- we now run our CLASSIFY algorithm # to make our best guess. # # { # Run the CSS classifier against the "expanded" text - # if it classifies as SPAM # then reject it as SPAM. # { classify <:*:clf:> ( :*:fileprefix::*:nonspamcss: | :*:fileprefix::*:spamcss: ) ( :stats: ) [:m_text:] /:*:lcr:/ } # Now we grab the pR and if it's greater than the minus # threshold, we send it to "good". Otherwise, it goes to bad. { match [:stats:] (:d: :pval:) /pR: (.*)/ eval /:@: :*:pval: > ( 0.0 - :*:thick_threshold: ) : / alter (:classifier_reason:) /** ACCEPT: CRM114 PASS :*:clf: Matcher ** :*:_nl::*:stats:/ goto /:looks_good:/ } alter (:classifier_reason:) /** REJECT: CRM114 FAIL :*:clf: Matcher ** :*:_nl::*:stats:/ alter (:reject_address:) /:*:fail_SSM_mail_to:/ { match [:log_rejections:] /yes/ output [:*:fileprefix:rejected_by_css.txt] /:*:_dw:/ } goto /:looks_bad:/ } # # # Final wrap-up routines - dispose of the mail as appropriate. # { :looks_bad: # is this a :stats_only: run (i.e. for CAMRAM) { match [:stats_only:] /SET/ match [:stats:] (:d: :pval:) /pR: (.*)/ output /:*:pval: :*:_nl:/ alter (:our_exit_code:) /:*:rejected_mail_exit_code:/ goto /:finish_up:/ } # not stats_only.... we're doing major output. # save unprocessed text by symlink in the text cache if needed. { match [:text_cache:] /./ syscall () () /ln :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/prob_spam\/:*:cacheid:/ } # and write out the long-form message too. 
{ { match [:add_headers:] /yes/ { match [:stats:] (:pr:) /pR: .*$/ } call /:mungmail_add:/ [X-CRM114-Version: :*:_crm_version: MF-:*:_pgm_hash: ] call /:mungmail_unique:/ [X-CRM114-CacheID: :*:cacheid: ] call /:mungmail_unique:/ [X-CRM114-Status: SPAM ( :*:pr: )] } # # Now, get the Subject: line. If none, make one. { { match (:subject_line: :subj_text:) \ /^Subject: (.*)/ } alius { match (:end_of_headers:) /\n\n/ alter (:end_of_headers:) /\nSubject: (none)\n\n/ match (:subject_line: :subj_text:) /^Subject: (.*)/ } } { # # If we are re-sending this, we want to de-fang the # subject, otherwise we don't. match [:reject_address:] /[a-zA-Z0-9]/ # Paolo P. suggests this alteration to avoid subversion # by enclosing an alternate target in "marks". We always # have to do this. { match (:dq:) [:subj_text:] /\$/ alter (:dq:) /USD/ liaf } { match (:dq:) [:subj_text:] /[^-a-zA-Z0-9!., ]/ alter (:dq:) // liaf } # # We isolate subj_text here, so if later syscalls move # things, the subject text used in "mail" is still OK. isolate (:subj_text:) } # # If the user asked for a spam-flagging string, put the flagging # string into the subject. # { match [:spam_flag_subject_string:] /./ alter (:subj_text:) \ /:*:spam_flag_subject_string: :*:subj_text:/ } { match [:add_extra_stuff:] /text/ # get rid of any first-column 'From's as they are message breaks! # this isn't necessary if we're mailing to someplace else... { match (:f:) [:m_text:] /^From/ alter (:f:) />:*:f:/ liaf } alter (:_dw:) /:*:_dw:-=-Extra Stuff-=-\n\n:*:m_text: -0-0-0- :*:_nl:/ } { match [:add_extra_stuff:] /attachment/ # get rid of any first-column 'From's as they are message breaks! # this isn't necessary if we're mailing to someplace else... 
{ match (:f:) [:m_text:] /^From/ alter (:f:) / :*:f:/ liaf } isolate (:content_type:) // # syscall (:*:_dw:) (:content_type:) /formail -X "Content-Type"/ call /:mungmail_extract:/ [Content-type] (:content_type:) isolate (:content_transfer_encoding:) // # syscall (:*:_dw:) (:content_transfer_encoding:) /formail -X "Content-Transfer-Encoding"/ call /:mungmail_extract:/ [Content-Transfer-Encoding] (:content_transfer_encoding:) # syscall (:*:_dw:) (:_dw:) /formail -A "MIME-Version: 1.0"/ # syscall (:*:_dw:) (:_dw:) /formail -A "Content-Type: multipart\/mixed\; boundary=Attachment_Quote_Boundary_1234567890\n--Attachment_Quote_Boundary_1234567890\n:*:content_type::*:content_transfer_encoding:"/ call /:mungmail_add:/ ["Content-Type: multipart\/mixed\; boundary=Attachment_Quote_Boundary_1234567890\n--Attachment_Quote_Boundary_1234567890\n:*:content_type::*:content_transfer_encoding:] alter (:_dw:) /:*:_dw::*:_nl:\ --Attachment_Quote_Boundary_1234567890 :*:_nl:\ Content-Type: text\/plain :*:_nl:\ Content-Transfer-Encoding: quoted-printable \n\n\n:*:m_text:\ \n--Attachment_Quote_Boundary_1234567890--\n/ } # # # Decide if we forward or if we just output it. { { # if this match succeeds, we should forward-to-an-address? # Yes, but only if we _have_ a forward-to address. match [:reject_address:] /[a-zA-Z0-9]/ { # -- put the classifier reason in as the first thing! match [:add_verbose_stats:] /yes/ alter (:_dw:) /:*:_nl: :*:classifier_reason::*:_nl: :*:_dw: / } syscall (:*:_dw:) /mail :*:reject_address: -s ':*:subj_text:'/ } alius { { # -- put the classifier reason in at the end of the headers match [:add_verbose_stats:] /yes/ match (:start_of_data:) /\n\n/ alter (:start_of_data:) /\n\n :*:classifier_reason: \n / } accept } } } alter (:our_exit_code:) /:*:rejected_mail_exit_code:/ goto /:finish_up:/ } # # and here's where we accept something as good email. { :looks_good: # is this a :stats_only: run (i.e. 
for CAMRAM) { match [:stats_only:] /SET/ match [:stats:] (:d: :pval:) /pR: (.*)/ output /:*:pval: :*:_nl:/ alter (:our_exit_code:) /:*:accepted_mail_exit_code:/ goto /:finish_up:/ } # Not stats-only; do the full output thing. # save unprocessed txt by symlink in the text cache if needed. { match [:text_cache:] /./ syscall () () /ln :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/prob_good\/:*:cacheid:/ } # and generate up a pretty mail-out report. { match [:add_verbose_stats:] /yes/ alter (:_dw:) /:*:_dw: :*:_nl: :*:classifier_reason: :*:_nl:/ } { match [:add_headers:] /yes/ { match [:stats:] (:pr:) /pR: .*$/ } # syscall (:*:_dw:) (:_dw:) /formail -A "X-CRM114-Version: :*:_crm_version: MF-:*:_pgm_hash: " -A "X-CRM114-Status: Good ( :*:pr: \)"/ call /:mungmail_add:/ [X-CRM114-Version: :*:_crm_version: MF-:*:_pgm_hash: [:*:pr:]] call /:mungmail_unique:/ [X-CRM114-CacheID: :*:cacheid: ] call /:mungmail_unique:/ [X-CRM114-Status: Good ( :*:pr: )] { # Maybe we need to tag it as unsure? Note that since mail # that scores > -thresh (but still < 0)goes out the "good" pipe, # some "spammy" email might come through here. match [:stats:] (:d: :pval:) /pR: (.*)/ eval /:@: :*:pval: < :*:thick_threshold: :/ call /:mungmail_unique:/ [X-CRM114-Status: UNSURE (:*:pval:) This message is 'unsure'; please train it! ] } } { match [:add_extra_stuff:] /text/ # get rid of any first-column 'From's as they are message breaks! # this isn't necessary if we're mailing to someplace else... { match (:f:) [:m_text:] /^From/ alter (:f:) / :*:f:/ liaf } alter (:_dw:) /:*:_dw:-=-Extra Stuff-=-\n\n :*:m_text: -0-0-0- \n/ } { match [:add_extra_stuff:] /attachment/ # get rid of any first-column 'From's as they are message breaks! # this isn't necessary if we're mailing to someplace else... 
{ match (:f:) [:m_text:] /^From/ alter (:f:) / :*:f:/ liaf } isolate (:content_type:) // # syscall (:*:_dw:) (:content_type:) /formail -X "Content-Type"/ call /:mungmail_extract:/ (:content_type:) [Content-Type] isolate (:content_transfer_encoding:) // # syscall (:*:_dw:) (:content_transfer_encoding:) /formail -X "Content-Transfer-Encoding"/ call /:mungmail_extract:/ (:content_transfer_encoding:) [Content-Transfer-Encoding:] # syscall (:*:_dw:) (:_dw:) /formail -A "MIME-Version: 1.0"/ call /:mungmail_add:/ [MIME-Version: 1.0] # syscall (:*:_dw:) (:_dw:) /formail -A "Content-Type: multipart\/mixed\; boundary=Attachment_Quote_Boundary_1234567890\n--Attachment_Quote_Boundary_1234567890\n:*:content_type::*:content_transfer_encoding:"/ # syscall (:*:_dw:) (:_dw:) /formail -U "Content-Type"/ call /:mungmail_unique:/ [Content-Type:] alter (:_dw:) /:*:_dw::*:_nl:\ --Attachment_Quote_Boundary_1234567890 :*:_nl:\ Content-Type: text\/plain :*:_nl:\ Content-Transfer-Encoding: quoted-printable \n\n\n:*:m_text:\ \n--Attachment_Quote_Boundary_1234567890--\n/ } accept alter (:our_exit_code:) /:*:accepted_mail_exit_code:/ goto /:finish_up:/ } # # Here's where we finish up processing in all the paths. # :finish_up: { # ---- should we consider automatic training? match [:automatic_training:] /yes/ # bounce out if we've already auto-trained this email match /AUTO-TRAINED/ isolate (:msghash:) hash (:msghash:) /:*:_dw:/ # pick one in 16- here, if the second-to-last digit is a 0 match [:msghash:] /......0./ # # out put autotraining... # Yep... we should use this for autotraining # do we auto-train on acceptance? { match [:classifier_reason:] /ACCEPT/ # it wasn't spam... autotrain it "nonspam" output [:*:fileprefix:nonspamtext.txt] /:*:text:/ learn (:*:fileprefix::*:nonspamcss:) [:m_text:] /:*:lcr:/ goto /:autotrain_finish:/ } # or do we autotran on rejection { match [:classifier_reason:] /REJECT/ # it was spam... 
autotrain it "spam" output [:*:fileprefix:spamtext.txt] /:*:text:/ learn (:*:fileprefix::*:spamcss:) [:m_text:] /:*:lcr:/ goto /:autotrain_finish:/ } :autotrain_finish: { { match [:autotrain_address:] /../ # syscall (:*:classifier_reason: :*:_nl: :*:_dw:) /mail -s "AUTO-TRAINED email - please check" :*:autotrain_address:/ } alius { # there was no autotrain address, so we just accept it. match (:subj:) /Subject:/ alter (:subj:) /Subject: AUTO_TRAINED- please check! .../ accept } } } :exit_here: exit /:*:our_exit_code:/ ################################################### # # # This is Mungmail - these are the replacement routines for # formail(), to remove dependency on formail() being in every # distribution # # # Add a new header :mungmail_add: (:new_header:) { # Grab the current headers call /:mungmail_grab_current_headers:/ alter (:current_headers:) /:*:current_headers::*:new_header:\n/ return } # # extract a header (first of them found) # :mungmail_extract: (:header_name:) { # Extract the header with the given field type, and # return that. Note that we add the colon here; don't # put it into the desired_header string. # call /:mungmail_grab_current_headers:/ { match [:current_headers:] (:: :desired_header:) \ /(?:^|\n)(:*:header_name: *: ([^\n]|\n[[:space:]])*)/ return /:*:desired_header:/ } return // } # # delete all current headers of this type, insert ours instead. # :mungmail_delete: (:new_header:) { call /:mungmail_grab_current_headers:/ { match (:new_header_type:) [:new_header:] /[[:graph:]]+/ } # # a LIAF-loop to delete any header (including continuations) that # has a type that matches the new_header_type. { match [:current_headers:] (:kill_this_line:) \ /:*:new_header_type: ([^\n]|\n[[:space:]])*\n/ alter (:kill_this_line:) // liaf } return } # delete all current headers of this type, insert ours instead. 
# :mungmail_unique: (:new_header:) { call /:mungmail_grab_current_headers:/ { match (:new_header_type:) [:new_header:] /[[:graph:]]+/ } call /:mungmail_delete:/ [:*:new_header:] call /:mungmail_add:/ [:*:new_header:] return } # # Helper routine to get the current mail headers of :_dw: # :mungmail_grab_current_headers: { { # Grab everything before the first \n\n match (:: :current_headers:) /(([^\n]+\n)+)\n/ # output /-A-->:*:current_headers:<---\n/ return } # if we got here, it wasn't a real message (void body, and/or no # doubled newline) but it might still have useful text anyway. # Is there a final newline? { match (:current_headers:) /^.*\n$/ # output /-B-->:*:current_headers:<---\n/ return } # if we got to here, then there wasn't even a final newline. # That's a violation of RFC, we'll add it. { alter (:_dw:) /:*:_dw:\n/ match (:current_headers:) /.+/ # output /-C-->:*:current_headers:<---\n/ return } fault / Couldn't manage to find the headers, though I tried really hard\n/ } trap (:broken_program_message:) /.*/ { accept output /:*:_nl: Aw, crud. mailfilter.crm broke. Here's the error: :*:_nl:/ output /:*:broken_program_message:/ output [stderr] /:*:_nl: ERROR: mailfilter.crm broke. Here's the error: :*:_nl:/ output [stderr] /ERROR: :*:broken_program_message:/ } exit /:*:program_fault_exit_code:/ crm114-20100106-BlameMichelson.src/reto_procmailrc.recipe0000644000000000017500000000535511321154266021362 0ustar rootwsy# # reto_procmail.recipe - Check message with crm114 # # Copyright 2004 Reto Lichtensteiger # Copyright 2004-2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # # NOTE: This recipe invokes crm114 on a message. It does NOT filter # the message to a spam folder. How you choose to manage that task # up to you. 
# CRM="/usr/local/bin/crm" CRMDIR="${HOME}/.crm114" UTRACE="" # -t turns on crm script tracing CRMVERSION="`${CRM} '-{ output /:*:_crm_version:/}' < /dev/null` [stats_only]" # # It is possible for crm114 to DoS if it receives a very large message. # To prevent this, the author has coded an upper bound to the size # message that mailfilter.crm will accept. # # We can prevent messages from hitting that limit by only feeding a # portion of the message to crm114, but this has a slightly more # subtle problem in that messages are often fed to crm114 as filters, # which means procmail expects the message to be modified. Using # head(1) to limit the message size going into a filter recipe will # truncate the actual message, not just a copy. "Oops" # # This recipe set uses crm114 as a direct "f"ilter if the message is # smaller than $BIGGEST, otherwise the message is truncated with head # and crm114 is set (via "--stats_only" switch) to report only. The # output from crm114 when run in this mode is captured and formail is # used to put the same headers in place that crm114 would normally # add. # BIGGEST="20000" :0 * < ${BIGGEST} { :0 fw | ${CRM} ${UTRACE} -u ${CRMDIR} mailfilter.crm # if crm114 failed, return the mail to the queue where the MTA will # retry delivery at a later date. (EX_TEMPFAIL in /usr/include/sysexits.h) :0 e { EXITCODE=75 } } # # If the first recipe doesn't match the size test, then the message is # larger than ${BIGGEST} # :0 E { # For programs invoked via backtick operator, procmail puts the mail # message on stdin. The line below feeds the mail message through # head to truncate it and then into crm114. CRMOUT="`head -c ${BIGGEST} | ${CRM} -u ${CRMDIR} mailfilter.crm --stats_only\`" # If CRMOUT is undefined (or empty) then crm114 failed. Set the exit # code to 75, so that procmail uses that as an exitcode. THe MTA # that invoked procmail will see that as "TEMPFAIL" and requeue the # message. 
THe script then unsets the HOST variable to tell procmail # to exit. :0 * CRMOUT ?? ^^^^ { EXITCODE=75 HOST } :0 * CRMOUT ?? ^- { VALUE="SPAM" } :0 E { VALUE="Good" } :0 f | formail -A "X-CRM114-Version: ${CRMVERSION}" \ -A "X-CRM114-Status: ${VALUE} ( pR: ${CRMOUT})" CRMOUT VALUE } :0 * ^X-CRM114-Status: SPAM { LOG="SPAM: CRM114 (Sure)${NL}" } # Unset variables CRM CRMDIR CRMVERSION UTRACE crm114-20100106-BlameMichelson.src/crm_math_exec.c0000644000000000017500000006075211321154266017751 0ustar rootwsy// crm_math_exec.c - CRM114 language math processing // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" static int math_formatter ( double value, char *format, char *buf, long buflen); // // strmath - evaluate a string for the mathematical result, // returning the length of the valid string. // long strmath (char *buf, long inlen, long maxlen, long *retstat) { long status; long old_internal_trace; old_internal_trace = internal_trace; if (inlen < 0) { fatalerror5 ("Bug in caller to strmath() - it makes no sense to", " have a negative length string! \n", CRM_ENGINE_HERE); internal_trace = old_internal_trace; return (0); }; // Check for first-character control of Algebraic v. RPN if (buf[0] == 'A') { // internal_trace = 1; memmove (buf, &buf[1], inlen-1); buf[inlen-1] = '\0'; status = stralmath (buf, inlen-1, maxlen, retstat); internal_trace = old_internal_trace; return (status); } if (buf[0] == 'R') { // Do we want to do selective tracing? 
// internal_trace = 1; memmove (buf, &buf[1], inlen-1); buf[inlen-1] = '\0'; status = strpnmath (buf, inlen-1, maxlen, retstat); internal_trace = old_internal_trace; return (status); } // No first-character control, so use q_expansion_mode to control. if (q_expansion_mode == 0 || q_expansion_mode == 2) { return (stralmath (buf, inlen, maxlen, retstat)); } else { return (strpnmath (buf, inlen, maxlen, retstat)); } } // strpnmath - do a basic math evaluation of very simple expressions. // // This does math, in RPN, on a string, and returns a string value. // long strpnmath (char *buf, long inlen, long maxlen, long *retstat) { double stack [DEFAULT_MATHSTK_LIMIT]; // the evaluation stack double sd; // how many 10^n's we've seen since a decimal long od; // output decimal flag long ip, op; // in string pointer, out string pointer long sp; // stack pointer - points to next (vacant) space long sinc; // stack incrment enable - do we start a new number long errstat; // error status char outformat[64]; // output format long outstringlen; // start off by initializing things ip = 0; // in pointer is zero op = 0; // output pointer is zero sp = 0; // still at the top of the stack od = 0; // no decimals seen yet, so no flag to output in decimal sinc = 0; // no autopush. outformat[0] = '\0'; // now our number-inputting hacks stack[sp] = 0.0 ; sd = 1.0; // all initialized... let's begin. 
if (internal_trace) fprintf (stderr, "Math on '%s' len %ld retstat %lx \n", buf, inlen, (long) retstat); for (ip = 0; ip < inlen; ip++) { if (internal_trace) fprintf (stderr, "ip = %ld, sp = %ld, stack[sp] = %f, ch='%c'\n", ip, sp, stack[sp], buf[ip]); if (sp < 0) { errstat = nonfatalerror5 ("Stack Underflow in math evaluation", "", CRM_ENGINE_HERE); return (0); }; if (sp >= DEFAULT_MATHSTK_LIMIT) { errstat=nonfatalerror5 ("Stack Overflow in math evaluation.\n " "CRM114 Barbie says 'This math is too hard'.", buf, CRM_ENGINE_HERE); return (0); }; switch (buf[ip]) { // // a digit,or maybe a number - big change - we now use strtod // case '.': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '-': case '+': { char *frejected; // handle the case of a minus sign that isn't a unary -. if (buf[ip] == '-' && !( isdigit (buf[ip+1]))) { if (sp > 0) { sp--; stack[sp] = stack[sp] - stack[sp+1]; sinc = 1; } break; }; if (buf[ip] == '+' && !( isdigit (buf[ip+1]))) { if (sp > 0) { sp--; stack[sp] = stack[sp] + stack[sp+1]; sinc = 1; } break; }; // Neither unary +/- so we use strtod to convert // the string we're looking at to floating point. sp++; stack[sp] = strtod ( &buf[ip], &frejected); if (user_trace) fprintf (stderr, "got number: %e\n", stack[sp]); // // Now, move [ip] over to accomodate characters used. // (the -1 is because there's an auto-increment in the big // FOR-loop) ip = ((unsigned long) frejected) - ((unsigned long) buf ) - 1; } break; // // and now the standard math operators (except for - and + above) // case '*': { if (sp > 0) { sp--; stack[sp] = stack[sp] * stack[sp+1]; sinc = 1; } }; break; case '/': { if (sp > 0) { sp--; // don't worry about divide-by-zero, we get INF in IEEE. 
stack[sp] = stack[sp] / stack[sp+1]; sinc = 1; } }; break; case '%': { if (sp > 0) { sp--; stack[sp] = ((long long) stack[sp]) % ((long long)stack[sp+1]); sinc = 1; } }; break; case '^': // exponentiation - for positive bases, neg base + int exp. if (sp > 0) { sp--; if (stack[sp] < 0.0 && ((long long)(stack[sp+1]))/1 != stack[sp+1]) { stack[sp] = stack[sp] / 0.0; } else stack[sp] = pow (stack[sp], stack[sp+1]); if (internal_trace) fprintf (stderr, "exp out: %lf\n", stack[sp]); sinc = 1; } break; case 'v': // logs as BASE v ARG; (NaN on BASE <= 0) if (sp > 0) { sp--; if (stack[sp] <= 0.0 ) { stack[sp] = stack[sp] / 0.0 ; } else stack[sp] = log (stack[sp+1]) / log (stack[sp]); sinc = 1; } break; case '=': { if (sp > 0) { sp--; if (stack[sp] == stack[sp+1]) { if (retstat) *retstat = 0; stack[sp] = 1; } else { if (retstat) *retstat = 1; stack[sp] = 0; }; sinc = 1; } }; break; case '!': { if (sp > 0 && buf[ip+1] == '=') { ip++; // gobble up the equals sign sp--; if (stack[sp] != stack[sp+1]) { if (retstat) *retstat = 0; stack[sp] = 1; } else { if (retstat) *retstat = 1; stack[sp] = 0; }; sinc = 1; } }; break; case '>': { if (buf[ip+1] == '=') { ip++; // gobble up the equals sign too... 
if (sp > 0) { sp--; if (stack[sp] >= stack[sp+1]) { if (retstat) *retstat = 0; stack[sp] = 1; } else { if (retstat) *retstat = 1; stack[sp] = 0; }; sinc = 1; } } else { if (sp > 0) { sp--; if (stack[sp] > stack[sp+1]) { if (retstat) *retstat = 0; stack[sp] = 1; } else { if (retstat) *retstat = 1; stack[sp] = 0; }; sinc = 1; } }; } break; case '<': { if (buf[ip+1] == '=') { ip++; // gobble up the equals sign if (sp > 0) { sp--; if (stack[sp] <= stack[sp+1]) { if (retstat) *retstat = 0; stack[sp] = 1; } else { if (retstat) *retstat = 1; stack[sp] = 0; }; sinc = 1; } } else { if (sp > 0) { sp--; if (stack[sp] < stack[sp+1]) { if (retstat) *retstat = 0; stack[sp] = 1; } else { if (retstat) *retstat = 1; stack[sp] = 0; }; sinc = 1; } }; }; break; case 'e': case 'E': case 'f': case 'F': case 'g': case 'G': case 'x': case 'X': // User-specified formatting; use the user's // top-of-stack value as a format. // { if (sp > 0) { char tempstring [2048]; tempstring[0] = '\0'; sp--; // Special case - if the format is an integer, add a ".0" // to the format string so we get integer output. if ( buf[ip] == 'x' || buf[ip] == 'X') { snprintf (outformat, 63, "%%%g.0ll%c", stack[sp+1], (short) buf[ip] ); } else { if ( ((long)stack[sp+1]) / 1 == stack[sp+1]) { snprintf(outformat, 63, "%%%g.0%c", stack[sp+1], buf[ip]); } else { snprintf(outformat, 63, "%%%g%c", stack[sp+1], buf[ip]); }; }; if (internal_trace) fprintf (stderr, "Format string -->%s<-- \n", outformat); stack[sp+1] = 0; if (buf[ip] != 'x' && buf[ip] != 'X') { snprintf (tempstring, 2047, outformat, stack[sp]); if (internal_trace) fprintf (stderr, "Intermediate result string -->%s<-- \n", tempstring); } else { long long intpart ; intpart = ((long long) stack[sp]) / 1; snprintf (tempstring, 2047, outformat, intpart); if (internal_trace) fprintf (stderr, "Intermediate hex result string -->%s<-- \n", tempstring); }; // And now do the back conversion of the result. 
// Note that X formatting (hexadecimal) does NOT do the // back conversion; the only effect is to store the // format string for later. if (buf[ip] != 'x' && buf[ip] != 'X') stack[sp] = strtod (tempstring, NULL); } }; break; case ' ': case '\n': case '\t': // // a space is just an end-of-number - push the number we're // seeing. { sinc = 1; }; break; case '(': case ')': // why are you using parenthesis in RPN code?? { nonfatalerror5 ("It's just silly to use parenthesis in RPN!", " Perhaps you should check your setups?", CRM_ENGINE_HERE); sinc = 1; }; break; default: { char bogus[4]; bogus[0] = buf[ip]; bogus[1] = '\000'; nonfatalerror5 (" Sorry, but I can't do RPN math on this un-mathy " "character: ", bogus, CRM_ENGINE_HERE); sinc = 1; }; break; }; }; if (internal_trace) { fprintf (stderr, "Final qexpand state: ip = %ld, sp = %ld, stack[sp] = %f, ch='%c'\n", ip, sp, stack[sp], buf[ip]); if (retstat) fprintf (stderr, "retstat = %ld\n", *retstat); }; // now the top of stack contains the result of the calculation. // fprintf it into the output buffer, and we're done. outstringlen = math_formatter ( stack[sp], outformat, buf, maxlen) ; return (outstringlen); } ///////////////////////////////////////////////////////////////// // // Here's where we format a floating point number so it's "purty". // Note that if "format" is NULL, or a null string, we do smart // formatting on the number itself. // // int math_formatter ( double value, char *format, char *buf, long buflen) { long outlen; // If the user supplied a format, use that. // if (format && format[0] != '\0') { // // special case - if the user supplied an x or X-format, that's // preconversion to integer; use that strlen() does not count // the null termination. 
if (format[strlen(format)-1] == 'x' || format[strlen(format)-1] == 'X') { long long equiv ; if (internal_trace) fprintf (stderr, "Final hex format: %s\n", format ); equiv = value * 1; outlen = snprintf (buf, buflen, format, equiv); return (outlen); }; // // Nothing so special; use the user format as it is. if (internal_trace) fprintf (stderr, "Final format: %s\n", format ); outlen = snprintf (buf, buflen, format, value); return (outlen); }; // Nope - we didn't get a preferred formatting, so here's the // adaptive smart code. // // print out 0 as 0 // if (value == 0.0 ) { outlen = snprintf (buf, buflen, "0"); goto formatdone; } // // use E notation if bigger than 1 trillion // if (value > 1000000000000.0 || value < -1000000000000.0 ) { outlen = snprintf (buf, buflen, "%.5E", value); goto formatdone; } // // use E notation if smaller than .01 // if ( value < 0.01 && value > -0.01 ) { outlen = snprintf (buf, buflen, "%.5E", value); goto formatdone; } // // if an integer value, print with 0 precision // if (((long)(value*2.0) / 2) == value) { outlen = snprintf (buf, buflen, "%.0f", value); goto formatdone; } // // if none of the above, print with five digits precision // outlen = snprintf (buf, buflen, "%.5f", value); // // // one way or another, once we're here, we've sprinted it. formatdone: if (internal_trace) fprintf (stderr, "math_formatter outlen = %ld\n", outlen); return (outlen); } //////////////////////////////////////////////////////////////////// // // Alternative implementation of the uglyness that is string math. // // This version uses two stacks (left arg, op) and a single scalar // rightarg. Partial computations are kept on the leftarg and op // stack. The current stack status is held in validstack, and is // the OR of LEFTVALID, OPVALID, and RIGHTVALID. 
// #define LEFTVALID 0x1 #define OPVALID 0x2 #define RIGHTVALID 0x4 long stralmath (char *buf, long inlen, long maxlen, long *retstat) { double leftarg [DEFAULT_MATHSTK_LIMIT] ; // left float arg long opstack [DEFAULT_MATHSTK_LIMIT]; // operand double rightarg; // right float arg long validstack [DEFAULT_MATHSTK_LIMIT]; // validity markers long sp; // stack pointer long ip, op; // input and output pointer long errstat; // error status char *frejected; // done loc. for a strtod. char outformat[256]; // how to format our result long state; // Local copy of state, in case // retstat is NULL (not used) // Start off by initializing things ip = 0; op = 0; sp = 0; outformat[0] = '\0'; state = 0; // Set up the stacks // leftarg [0] = 0.0; rightarg = 0.0; opstack [0] = '\0'; validstack [0] = 0; // initialization done... begin the work. if (internal_trace) fprintf (stderr, "Starting Algebraic Math on '%s' (len %ld)\n", buf, inlen); for (ip = 0; ip < inlen; ip++) { // Debugging trace if (internal_trace) fprintf (stderr, "ip = %ld, sp = %ld, L=%f, Op=%c, R=%f, V=%x next='%c'\n", ip, sp, leftarg[sp], (short) opstack[sp], rightarg, (short) validstack[sp], buf[ip]); // Top of the loop- we're a state machine driven by the top of // the stack's validity. if (sp >= DEFAULT_MATHSTK_LIMIT) { errstat = nonfatalerror5 ("Stack Overflow in math evaluation. ", "CRM114 Barbie says 'This math is too hard'.", CRM_ENGINE_HERE); if (retstat) *retstat = 0; return (0); }; switch (validstack[sp]) { case (0): // empty top of stack; can accept either number or monadic operator if (internal_trace) fprintf (stderr, "stacktop empty\n"); switch (buf[ip]) { // Monadic operators and numbers case '-': case '+': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '.': case ',': // for those locales that use , not . 
as decimal { if (internal_trace) fprintf (stderr, "found left numeric\n"); leftarg[sp] = strtod (&buf[ip], &frejected); if (user_trace) fprintf (stderr, " Got left arg %e\n", leftarg[sp]); ip = ((unsigned long) frejected) - ((unsigned long) buf) - 1; validstack[sp] = LEFTVALID; }; break; case '(': { if (internal_trace) fprintf (stderr, "Open Paren - start new math stack level\n"); sp++; leftarg[sp] = 0.0; rightarg = 0.0; opstack[sp] = 0; validstack[sp] = 0; } break; // deal with a possible rightarg strtod situation case ' ': break; default: errstat = nonfatalerror5 ("Math expression makes no sense", " (need to have a number here).", CRM_ENGINE_HERE); if (retstat) *retstat = 0; return (0); break; }; break; // if left arg is valid; next thing must be an operator; // however op then op is also valid and should form composite // operators like '>=' and '!=' (see below). case (LEFTVALID): if (internal_trace) fprintf (stderr, "leftvalid\n"); switch (buf[ip]) { case '-': case '+': case '*': case '/': case '%': case '>': case '<': case '=': case '!': case '^': case 'v': case 'e': case 'E': case 'f': case 'F': case 'g': case 'G': case 'x': case 'X': { if (internal_trace) fprintf (stderr, "found op\n"); opstack[sp] = ( buf[ip] & 0xFF ); validstack[sp] = LEFTVALID | OPVALID; // is the next char also an op? If so, gobble it up? 
switch (buf[ip+1]) { case '=': if (internal_trace) fprintf (stderr, "two-char operator\n"); opstack[sp] = ((opstack[sp] << 8) | buf[ip+1]); ip++; }; }; break; case ')': // close paren pops the stack, and returns the left arg // to "whereever", which might be leftarg stack, or rightarg if (internal_trace) fprintf (stderr, "close parenthesis, pop stack down\n"); sp--; if (validstack[sp] == (LEFTVALID | OPVALID)) { rightarg = leftarg [sp+1]; validstack[sp] = LEFTVALID | OPVALID | RIGHTVALID; } else { leftarg[sp] = leftarg [sp+1]; validstack[sp] = LEFTVALID; }; break; case ' ': break; default: errstat = nonfatalerror5 ("Math needs an operator in: ", buf, CRM_ENGINE_HERE); if (retstat) *retstat = 0; return (0); break; } break; case (LEFTVALID | OPVALID): // left arg and op are both valid; right now we can have // an enhanced operator (next char is also an op) if (internal_trace) fprintf (stderr, "left + opvalid \n"); switch (buf[ip]) { case '(': { if (internal_trace) fprintf (stderr, "Open Paren - start new math stack level\n"); sp++; leftarg[sp] = 0.0; rightarg = 0.0; opstack[sp] = 0; validstack[sp] = 0; } break; // deal with a possible rightarg strtod situation case '-': case '+': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '.': case ',': { rightarg = strtod (&buf[ip], &frejected); if (internal_trace) fprintf (stderr, " Got right arg %e\n", rightarg); ip = ((unsigned long) frejected) - ((unsigned long) buf) - 1; validstack[sp] = validstack[sp] | RIGHTVALID; }; case ' ': break; default: errstat = nonfatalerror5 ("Math is missing a number in: ", buf, CRM_ENGINE_HERE); if (retstat) *retstat = 0; return (0); break; }; }; ////////////////////////////////////////////////// // // Now if we have a left-op-right situation, and can // execute the operator right here and now. 
// while (validstack[sp] == (LEFTVALID | OPVALID | RIGHTVALID) ) { if (internal_trace) fprintf (stderr, "Executing %c operator\n", (short)opstack[sp]); switch (opstack[sp]) { // Math operators case '+': leftarg[sp] = leftarg[sp] + rightarg; break; case '-': leftarg[sp] = leftarg[sp] - rightarg; break; case '*': leftarg[sp] = leftarg[sp] * rightarg; break; case '/': leftarg[sp] = leftarg[sp] / rightarg; break; case '%': leftarg[sp] = (long long) leftarg[sp] % (long long) rightarg; break; case '^': // since we don't do complex numbers (yet) handle as NaN if (leftarg[sp] < 0.0 \ && ((long long) (rightarg))/1 != rightarg) { leftarg[sp] = leftarg[sp] / 0.0;} else leftarg[sp] = pow (leftarg[sp], rightarg); if (internal_trace) fprintf (stderr, "exp out: %lf\n", leftarg[sp]); break; case 'v': // Logarithm BASE v ARG // Negative bases on logarithms? Not for us! force NaN if (leftarg[sp] <= 0) { leftarg[sp] = leftarg[sp] / 0.0;} else leftarg[sp] = log (rightarg) / log (leftarg[sp]); break; // Relational operators case '<': if (leftarg[sp] < rightarg) { leftarg[sp] = 1; state = 0;} else { leftarg[sp] = 0; state = 1;}; break; case '>': if (leftarg[sp] > rightarg) { leftarg[sp] = 1; state = 0;} else { leftarg[sp] = 0; state = 1;}; break; case '=': if (leftarg[sp] == rightarg) { leftarg[sp] = 1; state = 0; } else { leftarg[sp] = 0; state = 1;}; break; case (('<' << 8) + '='): if (leftarg[sp] <= rightarg) { leftarg[sp] = 1; state = 0;} else { leftarg[sp] = 0; state = 1;}; break; case (('>' << 8) + '='): if (leftarg[sp] >= rightarg) { leftarg[sp] = 1; state = 0;} else { leftarg[sp] = 0; state = 1;}; break; case ( ('!' 
<< 8) + '='): if (leftarg[sp] != rightarg) { leftarg[sp] = 1; state = 0;} else { leftarg[sp] = 0; state = 1;}; break; // Formatting operators case 'e': case 'E': case 'f': case 'F': case 'g': case 'G': case 'x': case 'X': { char tempstring [2048]; if (internal_trace) fprintf (stderr, "Formatting operator %c \n", (short)opstack[sp]); // Do we have a float or an int format? if (opstack[sp] == 'x' || opstack[sp] == 'X') { snprintf (outformat, 255, "%%%g.0ll%c", rightarg, (short) opstack[sp]); } else { if (((long) rightarg) / 1 == rightarg) { snprintf (outformat, 255, "%%%g.0%c", rightarg, (short) opstack[sp]); } else { snprintf (outformat, 255, "%%%g%c", rightarg, (short)opstack[sp]); }; }; if (internal_trace) fprintf (stderr, "Format string -->%s<-- \n", outformat); // A little more funny business needed for // hexadecimal print out, because X format // can't take IEEE floating point as inputs. if (opstack[sp] != 'x' && opstack[sp] != 'X') { if (internal_trace) fprintf (stderr, "Normal convert "); snprintf (tempstring, 2047, outformat, leftarg[sp] ); leftarg[sp] = strtod (tempstring, NULL); validstack[sp] = LEFTVALID; } else { // Note that we actually don't use the // results of octal conversion; the only // effect is to set the final format // string. long long equiv; if (internal_trace) fprintf (stderr, "Oct/Hex Convert "); equiv = leftarg[sp] + 0.0; if (internal_trace) fprintf (stderr, "equiv -->%10lld<-- \n", equiv); snprintf (tempstring, 2047, outformat, equiv); }; }; break; default: errstat = nonfatalerror5 ("Math operator makes no sense in: ", buf, CRM_ENGINE_HERE); if (retstat) *retstat = 0; return (0); break; }; validstack[sp] = LEFTVALID; }; // Check to see that the stack is still valid. if (sp < 0) { errstat = nonfatalerror5 ( "Too many close parenthesis in this math: ", buf, CRM_ENGINE_HERE); if (retstat) *retstat = 0; return (0); }; }; // We made it all the way through. 
Now return the math formatter result if (internal_trace) fprintf (stderr, "Returning at sp= %ld and value %lf\n", sp, leftarg[sp]); if (retstat) *retstat = state; // Check that we made it all the way down the stack if (sp != 0) { errstat = nonfatalerror5 ("Not enough close parenthesis in this math: ", buf, CRM_ENGINE_HERE); if (retstat) *retstat = 0; return (0); } // All's good, return with a value. { long return_length; return_length = (math_formatter (leftarg[sp], outformat, buf, maxlen )); outformat [return_length] = '\000'; return (return_length); }; } crm114-20100106-BlameMichelson.src/learntest.crm0000755000000000017500000000045011321154266017503 0ustar rootwsy#! /usr/bin/crm # # learntest.crm - learn old .css data into new .css files # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. { match (:data:) /.*/ output /learning to file :*:_arg2: :*:_nl:/ learn (:*:_arg2:) [:data:] /[[:graph:]]+/ } crm114-20100106-BlameMichelson.src/uncaughttraptest.crm0000755000000000017500000000172011321154266021110 0ustar rootwsy#! /usr/bin/crm # # uncaughttraptest.crm - test for uncaught traps # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # test the alius statement { { { output /checking for a foo.../ match /foo/ output /Found a foo :*:_nl:/ } alius { output /no foo... checking for bar.../ match /bar/ output /Found a bar. :*:_nl:/ } alius { output /found neither... continuing :*:_nl:/ fault / plutonium / } { output / dammit! / } } { { output /checking for a baz.../ match /baz/ output /Found a baz :*:_nl:/ { output / damned twice NO SEE THIS // } } alius { output /no baz... checking for wugga.../ match /wugga/ output /Found a wugga. 
:*:_nl:/ } alius fault / cork / output /found neither baz nor wugga :*:_nl:/ } { output / on the next line, "fault" should be empty.:*:_nl:/ output / thrice damned - fault was ->:*:_fault:<- :*:_nl:/ } } crm114-20100106-BlameMichelson.src/crm114_osbf.h0000644000000000017500000001122611321154266017170 0ustar rootwsy// crm114_osbf.h - This file defines CSS header structure, data and // constants used by the OSBF-Bayes classifier. // Copyright 2004 Fidelis Assis // Copyright 2004-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. #ifndef __CRM114_OSBF_H__ #define __CRM114_OSBF_H__ typedef struct { unsigned long hash; unsigned long key; unsigned long value; } OSBF_FEATUREBUCKET_STRUCT; typedef struct { unsigned char version[4]; unsigned long flags; unsigned long buckets_start; /* offset to first bucket, in bucket size units */ unsigned long buckets; /* number of buckets in the file */ unsigned long learnings; /* number of trainings executed */ } OSBF_FEATURE_HEADER_STRUCT; /* define header size to be a multiple of bucket size with aprox. 
4 Kbytes */
#define OSBF_CSS_SPECTRA_START (4096 / sizeof(OSBF_FEATUREBUCKET_STRUCT))

/* complete header */
/* The header occupies OSBF_CSS_SPECTRA_START bucket-sized slots at the
   start of the CSS file, so the bucket area that follows stays
   bucket-aligned. */
typedef union
{
  OSBF_FEATURE_HEADER_STRUCT header;
  /* buckets in header - not really buckets, but the header size is */
  /* a multiple of bucket size */
  OSBF_FEATUREBUCKET_STRUCT bih[OSBF_CSS_SPECTRA_START];
} OSBF_HEADER_UNION;

/* Layout of a bucket's .value word, as established by the macros below:
   the low 16 bits (BUCKET_VALUE_MASK) hold the feature count and the
   top bit (BUCKET_LOCK_MASK) is a lock flag. */
#define BUCKET_VALUE_MASK 0x0000FFFFLU
#define BUCKET_LOCK_MASK 0x80000000LU

/* field accessors */
#define BUCKET_HASH(bucket) (bucket.hash)
#define BUCKET_KEY(bucket) (bucket.key)
#define BUCKET_RAW_VALUE(bucket) (bucket.value)
/* an index is valid only while it is below the bucket count in the header */
#define VALID_BUCKET(header, bucket_idx) (bucket_idx < header->buckets)
/* count without the lock bit */
#define GET_BUCKET_VALUE(bucket) ((bucket.value) & BUCKET_VALUE_MASK)
#define BUCKET_IS_LOCKED(bucket) ((bucket.value) & BUCKET_LOCK_MASK)
/* SETL: store a value with the lock bit set; SET: store it verbatim */
#define SETL_BUCKET_VALUE(bucket, val) (bucket.value) = (val) | \
    BUCKET_LOCK_MASK
#define SET_BUCKET_VALUE(bucket, val) (bucket.value) = val
#define LOCK_BUCKET(bucket) (bucket.value) = (bucket.value) | BUCKET_LOCK_MASK
#define UNLOCK_BUCKET(bucket) (bucket.value) = (bucket.value) & \
    BUCKET_VALUE_MASK
/* a bucket with a zero count is free; nonzero means it is in a hash chain */
#define BUCKET_IN_CHAIN(bucket) (GET_BUCKET_VALUE(bucket) != 0)
#define EMPTY_BUCKET(bucket) (GET_BUCKET_VALUE(bucket) == 0)
#define BUCKET_HASH_COMPARE(bucket, h, k) ((bucket.hash) == (h) && \
                                           (bucket.key) == (k))

/* CSS file version */
#define SBPH_VERSION 0
#define OSB_VERSION 1
#define CORRELATE_VERSION 2
#define NEURAL_VERSION 3
#define OSB_WINNOW_VERSION 4
#define OSBF_VERSION 5
#define UNKNOWN_VERSION 6

/* Array with pointers to CSS version names, indexed with the CSS file
   version numbers above.
   The array is defined in crm_osbf_maintenance.c */
extern char *CSS_version_name[];

/* max feature count */
/* 65535 matches BUCKET_VALUE_MASK: the count field is 16 bits wide */
#define OSBF_FEATUREBUCKET_VALUE_MAX 65535
#define OSBF_DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 94321
/* max chain len - microgrooming is triggered after this, if enabled */
#define OSBF_MICROGROOM_CHAIN_LENGTH 29
/* maximum number of buckets groom-zeroed */
#define OSBF_MICROGROOM_STOP_AFTER 128
/* minimum ratio between max and min P(F|C) */
#define OSBF_MIN_PMAX_PMIN_RATIO 1
/* max token size before starting "accumulation" of long tokens */
#define OSBF_MAX_TOKEN_SIZE 60
/* accumulate hashes up to this many long tokens */
#define OSBF_MAX_LONG_TOKENS 1000

/* public entry points of the OSBF-Bayes classifier */
extern int crm_expr_osbf_bayes_learn (CSL_CELL * csl, ARGPARSE_BLOCK * apb,
                                      char *txtptr, long txtoffset,
                                      long txtlen);
extern int crm_expr_osbf_bayes_classify (CSL_CELL * csl, ARGPARSE_BLOCK * apb,
                                         char *txtptr, long txtoffset,
                                         long txtlen);
extern void crm_osbf_set_microgroom(int value);
extern void crm_osbf_microgroom (OSBF_FEATURE_HEADER_STRUCT * h,
                                 unsigned long hindex);
extern void crm_osbf_packcss (OSBF_FEATURE_HEADER_STRUCT * h,
                              unsigned long packstart, unsigned long packlen);
extern void crm_osbf_packseg (OSBF_FEATURE_HEADER_STRUCT * h,
                              unsigned long packstart, unsigned long packlen);
extern unsigned long crm_osbf_next_bindex(OSBF_FEATURE_HEADER_STRUCT * header,
                                          unsigned long index);
extern unsigned long crm_osbf_prev_bindex(OSBF_FEATURE_HEADER_STRUCT * header,
                                          unsigned long index);
extern unsigned long crm_osbf_find_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
                                           unsigned long hash,
                                           unsigned long key);
extern void crm_osbf_update_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
                                    unsigned long bindex, int delta);
extern void crm_osbf_insert_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
                                    unsigned long bindex, unsigned long hash,
                                    unsigned long key, int value);
extern int crm_osbf_create_cssfile (char *cssfile, unsigned long buckets,
                                    unsigned long major, unsigned long minor,
                                    unsigned long spectrum_start);

#endif //
!__CRM114_OSBF_H__ crm114-20100106-BlameMichelson.src/crm114_sysincludes.h0000644000000000017500000000337511321154266020612 0ustar rootwsy// crm114_sysincludes.h - Files that we include from the system. // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // Files that we include from the system. #ifndef __CRM114_SYSINCLUDES_H__ #define __CRM114_SYSINCLUDES_H__ // autoconf hooks #ifdef HAVE_CONFIG_H #include "config.h" #endif // UNIX and Windows include files #include #include #include #include #include #include #include #include #include #include #include #include // Only TRE regex library is currently supported #include // Normally declared from tre/regex.h //#ifndef REG_LITERAL //#define REG_LITERAL (REG_NOSUB << 1) //#endif // Detect if compilation is occurring in a Microsoft compiler #if (defined (WIN32) || defined (WIN64) || defined (_WIN32) || defined (_WIN64)) #define CRM_WINDOWS #endif #ifndef CRM_WINDOWS // UNIX and Linux specific declarations follow: #include #include #include #include #include #include #else // CRM_WINDOWS // Windows specific declarations follow #define _CRTDBG_MAP_ALLOC #include #include #include #include #include "getopt.h" #define snprintf _snprintf #define stat _stat #define strncasecmp(a,b,n) strnicmp((a), (b), (n)) #define strcasecmp(a,b) stricmp((a), (b)) typedef long int clock_t; #define MAP_FAILED NULL #define PROT_READ 1 #define PROT_WRITE 2 #define MAP_SHARED 1 #define MAP_PRIVATE 2 typedef int pid_t; #define sqrtf sqrt #define msync(a, b, c) FlushViewOfFile(a, b) #define MS_SYNC 0 #endif // CRM_WINDOWS #endif // !__CRM114_SYSINCLUDES_H__ crm114-20100106-BlameMichelson.src/crm_expandvar.c0000644000000000017500000010377311321154266020005 0ustar rootwsy// crm_expandvar.c - expand variables // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
// include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *tempbuf; // // crm_nexpandvar - given a string and it's length, go through it // and if there's a variable expansion called for (by the :*: // operator) expand the variable. // // the inputs are a buffer with the NULL-safe string in it, the // length of this string, and the maximum allocated length of the // buffer. This function returns the new length of the buffer. // It will NOT increase the buffer length past maxlen, so // expansions beyond that will cause a nonfatal error and be // aborted. // // Algorithm: // 1) efficiency check- do we need to do any expansions at all. // 2) Start at buf[0], work up to buf[buflen]-3 // 2a) do \n, \r, \a, \xHH and \oOOO // 3) are we looking at :*:? // 4) no: copy 1 character, increment from and to indexes, go to step 3 // 5) yes: skip from index ahead 3, from there to next : is the varname // 6) copy var value to tbuf, incrementing tobuf index. // 7) set from-index to third colon index + 1 // 8) go to 2 (modulo last two chars need copying) // long crm_nexpandvar (char *buf, long inlen, long maxlen) { return (crm_zexpandvar (buf, inlen, maxlen, NULL, CRM_EVAL_ANSI | CRM_EVAL_STRINGVAR | CRM_EVAL_REDIRECT )); } // // crm_qexpandvar is the "full evaluate one pass of everything" mode. long crm_qexpandvar (char *buf, long inlen, long maxlen, long *qex_stat) { return (crm_zexpandvar (buf, inlen, maxlen, qex_stat, CRM_EVAL_ANSI | CRM_EVAL_STRINGVAR | CRM_EVAL_REDIRECT | CRM_EVAL_STRINGLEN | CRM_EVAL_MATH )); } // crm_zexpandvar - "expanded" expandvar. 
Does all the expansions, // but does not repeat the evaluations. If you want repeats, you // must do that yourself (that way, this function will always // move down the string at least one character, and thus this this // function will always terminate, Nice, that. :) // // the inputs are a buffer with the NULL-safe string in it, the // length of this string, and the maximum allocated length of the // buffer. This function returns the new length of the buffer. // It will NOT increase the buffer length past maxlen, so // expansions beyond that will cause a nonfatal error and be // aborted. // // Algorithm: // 1) efficiency check- do we need to do any expansions at all. // 2) Start at buf[0], work up to buf[buflen]-3 // 2a) do \n, \r, \a, \xHH and \oOOO // 3) are we looking at ::? // 4) no: copy 1 character, increment from and to indexes, go to step 3 // 5) yes: skip from index ahead 3, from there to next : is the varname // 6) copy var value to tbuf, incrementing tobuf index. // 7) set from-index to third colon index + 1 // 8) go to 2 (modulo last two chars need copying) // long crm_zexpandvar (char *buf, long inlen, long maxlen, long *retstat, long exec_bitmask) { long is, id; long vht_index; long q; // the maximum length allocated so far for these random buffers... static long current_maxlen = 0; // a temporary work buffer... static char *tbuf = NULL; // and another for variable names... static char *vname = NULL; char *cp; long vlen; char opchar; // efficiency check - do we even _have_ a :*: in the buffer? // if (inlen == 0) return (0); is= 0; id = 0; if (internal_trace) fprintf (stderr, "qexpandvar on =%s= len: %ld bitmask: %ld \n", buf, inlen, exec_bitmask); // GROT GROT GROT must fix this for 8-bit safe error messages if (inlen > maxlen) { q = fatalerror5 ( "You have blown the gaskets while building a string. 
Orig string was: ", buf, CRM_ENGINE_HERE); if (q == 0 ) return (inlen); goto bailout; }; // First thing- do the ANSI \-Expansions // if (exec_bitmask & CRM_EVAL_ANSI) { is = 0; id = 0; if (internal_trace) fprintf (stderr, " Doing backslash expansion \n"); for (is = 0; is <= inlen ; is++) { if (buf[is] != '\\' // not a backslash --> verbatim. || is == inlen ) // last char is always verbatim { buf [id] = buf [is]; id++; } else { // we're looking at a '\\'. // // Check for a few common things: \n, \a, \xNN, \oNNN is++; // switch (buf[is]) { case '0': { // it's a NULL. buf[id] = '\0'; id++; } break; case 'b': { // it's a backspace buf[id] = '\b'; id++; } break; case 't': { // it's a tab buf[id] = '\t'; id++; } break; case 'n': { // it's a newline. stuff in a newline. buf[id] = '\n'; id++; } break; case 'v': { // it's a vtab buf[id] = '\v'; id++; } break; case 'f': { // it's a form feed. buf[id] = '\f'; id++; } break; case 'r': { // it's a carriage return buf[id] = '\r'; id++; } break; case 'a': { // it's a BELL. put that in. buf[id] = '\a'; id++; } break; case 'x': case 'X': { // it might be a hex char constant. read it // and stuff it. unsigned int value; int conv_count; conv_count = 0; value = '\000'; if (is+2 < inlen) // watch out for end-of-string conv_count = sscanf (&buf[is+1], "%2X", &value); if (conv_count == 1) { buf[id] = value; id++; is++; is++; // move over the hex digits } else // and otherwise, just copy the x { buf[id] = buf[is]; id++; }; }; break; case 'o': case 'O': { // it might be an octal char constant. read it // and stuff it. unsigned int value; int conv_count ; conv_count = 0; value = '\000'; if (is+3 < inlen) // watch out for end-of-string conv_count = sscanf (&buf[is+1], "%3o", &value); if (conv_count == 1) { buf[id] = value; id++; is++; is++; is++; // move over the octal digits } else // and otherwise, just copy the conv. char. 
{ buf[id] = buf[is]; id++; }; }; break; case '>': case ')': case ']': case '/': case ';': case '{': case '}': case '#': case '\\': { // >, ), ], ;, {, }, #, and / are themselves after // a '\', but need the \ escape to pass thru the parser // without terminating their enclosed args buf[id] = buf[is]; id++; }; break; default: { // if it's "none of the above" characters, then // the '\' character _stays_ as a literal buf[id] = '\\'; id++; buf[id] = buf[is]; id++; }; break; }; }; }; // and update the new inlen buf[id] = '\000'; // needed because slimy old GNU REGEX needs it. // and take one off for inlen, because it always gets incremented one // extra time inlen = id - 1; if (internal_trace) fprintf (stderr, "backslash expansion yields: =%s= len %ld \n", buf, inlen); } // END OF ANSI \-EXPANSIONS // Do a quick check for :'s - this is just a speedup, as all further // operators use the : notation. // if no :, then no ":" operators possible. cp = memchr (buf, ':', inlen); if (cp == NULL) { if (internal_trace) fprintf (stderr, "No further expansions possible\n"); return (inlen); }; // allocate some memory for tbuf and vname; (this funky allocation // is a workaround for malloc memory fragmentation that caused // out-of-memory problems in some kernels. Eventually we'll have // a much grander system for all mallocs, but not yet.) // if the currently allocated buffers are too small, drop them // (and force a reallocation), else we will reuse them. if (current_maxlen < maxlen + 1) // do we need to drop and reallocate? 
{ if (tbuf != NULL) { free (tbuf); tbuf = NULL; }; if (vname != NULL) { free (vname); vname = NULL; }; current_maxlen = maxlen + 2; } if (tbuf == NULL ) { tbuf = (char *) malloc (current_maxlen); }; if (vname == NULL) vname = (char *) malloc (current_maxlen); if (tbuf == NULL || vname == NULL) { q = fatalerror5 ("Couldn't allocate memory for Q-variable expansion!", "Try making the window set smaller with the -w option", CRM_ENGINE_HERE); if (q == 0) return (inlen); }; // OK, we might have a :*: substitution operator, so we actually have // to do some work. // // Now, do we have a :*: (singlevar) possible? if ( exec_bitmask & CRM_EVAL_STRINGVAR ) { is = 0; // is is the input position index id = 0; // id is the destination position index if (internal_trace) fprintf (stderr, "Doing singlevar eval expansion\n"); // // First time through the loop, for :*: (variable expansion) // for (is = 0; is <= inlen && id < maxlen; is++) { if (is <= inlen - 5 // check only if :*:c:" possible && buf[is] == ':' && buf[is+1] == '*' && ( buf[is+2] ==':' )) { // yes, it's probably an expansion of some sort. opchar = buf[is+1]; // copy everything from the colon to the second colon // ( or the end of the string) into the vname buffer. is = is + 2; vname [0] = buf[is]; vlen = 1; is++; while (is < maxlen && is <= inlen && buf [is] != ':') { vname[vlen] = buf[is]; is++; vlen++; }; // // check for the second colon as well... if (buf[is] == ':') { vname[vlen] = ':'; vlen++; } vname [vlen] = '\000'; // // Now we've got the variable name in vname, we can // go get it's value and copy _that_ into tbuf as well. 
if (internal_trace) fprintf (stderr, "looking up variable >%s<\n", vname); vht_index = crm_vht_lookup (vht, vname, vlen); if (vht[vht_index] == NULL) { // there was no variable by that name, use the text itself switch (opchar) { case '*': { // // simply copy text till the close colon // for (q = 0; q < vlen && id < maxlen; q++) { tbuf[id] = vname[q]; id++; } } break; } } else { // There really was a variable value by that name. // suck it out, and splice it's text value // check to see if it's one of the self-mutating // internal variables, like :_iso: or :_cd: if (strncmp( (char *) &vht[vht_index]->nametxt[vht[vht_index]->nstart], ":_", 2) == 0) { if (strncmp( (char *) &vht[vht_index]->nametxt[vht[vht_index]->nstart], ":_iso:", 6) == 0) { // if this was :_iso:, update iso's length vht[vht_index]->vlen = tdw->nchars; }; if (strncmp( (char *) &vht[vht_index]->nametxt[vht[vht_index]->nstart], ":_cs:", 5) == 0) { // if this was :_cs:, update the current line num char lcstring [32]; long lclen; lcstring[0] = '\0'; lclen = sprintf (lcstring, "%ld", csl->cstmt); crm_set_temp_nvar (":_cs:", lcstring, lclen); }; }; switch (opchar) { case '*': { for (q = 0; q < vht[vht_index]->vlen && id < maxlen; q++) { tbuf[id] = vht[vht_index]->valtxt [(vht[vht_index]->vstart)+q]; id++; } } break; }; }; } // Now, handle the case where we were NOT looking at // :*:c: in buf else { tbuf[id] = buf[is]; id++; } } // and put our results back into buf memcpy (buf, tbuf, id); buf[id] = '\000'; inlen = id - 1; if (internal_trace) fprintf (stderr, " :*: var-expansion yields: =%s= len %ld \n", buf, inlen); } // END OF :*: EXPANSIONS // // Now, do we have a :+: (REDIRECT) operators if ( exec_bitmask & CRM_EVAL_REDIRECT ) { is = 0; // is is the input position index id = 0; // id is the destination position index if (internal_trace) fprintf (stderr, "Doing singlevar redirect expansion\n"); // // First time through the loop, for :+: (variable expansion) // for (is = 0; is <= inlen && id < maxlen; 
is++) { if (is <= inlen - 5 // check only if :*:c:" possible && buf[is] == ':' && buf[is+1] == '+' && ( buf[is+2] ==':' )) { // yes, it's probably an expansion of some sort. // copy everything from the colon to the second colon // ( or the end of the string) into the vname buffer. is = is + 2; vname [0] = buf[is]; vlen = 1; is++; while (is < maxlen && is <= inlen && buf [is] != ':') { vname[vlen] = buf[is]; is++; vlen++; }; // // check for the second colon as well... if (buf[is] == ':') { vname[vlen] = ':'; vlen++; } vname [vlen] = '\000'; // // Now we've got the variable name in vname, we can // go get it's value and copy _that_ into the vname buffer if (internal_trace) fprintf (stderr, "looking up variable >%s<\n", vname); vht_index = crm_vht_lookup (vht, vname, vlen); if (vht[vht_index] == NULL) { // no op - if no such variable, no change... } else { // There really was a variable value by that name. // suck it out, and make that the new vname text // if this was :_iso:, update iso's length if (strncmp( (char *) &vht[vht_index]->nametxt[vht[vht_index]->nstart], ":_iso:", 6) == 0) { vht[vht_index]->vlen = tdw->nchars; }; for (q = 0; q < vht[vht_index]->vlen && id < maxlen; q++) { vname[q] = vht[vht_index]->valtxt [(vht[vht_index]->vstart)+q]; } // note that vlen is varname len, but vht[]->vlen is // the length of the text. Bad choice of names, eh? vlen = vht[vht_index]->vlen; }; // Second time around: // We have something in vname (either the indirected // varname, or the original varname), we can // go get it's value and copy _that_ into tbuf as well. if (internal_trace) fprintf (stderr, "Second lookup variable >%s<\n", vname); vht_index = crm_vht_lookup (vht, vname, vlen); if (vht[vht_index] == NULL) { // // simply copy text including the close colon // for (q = 0; q < vlen && id < maxlen; q++) { tbuf[id] = vname[q]; id++; } } else { // There really was a variable value by that name. 
// suck it out, and splice it's text value // if this was :_iso:, update iso's length if (strncmp( (char *) &vht[vht_index]->nametxt[vht[vht_index]->nstart], ":_iso:", 6) == 0) { vht[vht_index]->vlen = tdw->nchars; }; { for (q = 0; q < vht[vht_index]->vlen && id < maxlen; q++) { tbuf[id] = vht[vht_index]->valtxt [(vht[vht_index]->vstart)+q]; id++; } } }; } // Now, handle the case where we were NOT looking at // :+:c: in buf else { tbuf[id] = buf[is]; id++; } } // and put our results back into buf memcpy (buf, tbuf, id); buf[id] = '\000'; inlen = id - 1; if (internal_trace) fprintf (stderr, "indirection :+: expansion yields: =%s= len %ld \n", buf, inlen); } // END OF :+: EXPANSIONS if (exec_bitmask & CRM_EVAL_STRINGLEN) { // // // Expand :#: (string lengths) // if (internal_trace) fprintf (stderr, "Doing stringglength expansion\n"); buf[id] = '\000'; if (internal_trace) fprintf (stderr, " var-expand yields: =%s= len %ld\n", buf, inlen); id = 0; for (is = 0; is <= inlen && id < maxlen; is++) { if (is <= inlen - 5 // check only if :#:c:" possible && buf[is] == ':' && ( buf[is+1] == '#' ) && buf[is+2] ==':') { // yes, it's probably an expansion of some sort. opchar = buf[is+1]; // copy everything from the colon to the second colon // into the vname buffer. is = is + 2; vname [0] = buf[is]; vlen = 1; is++; while (is < maxlen && is <= inlen && buf [is] != ':') { vname[vlen] = buf[is]; is++; vlen++; }; // // check for the second colon as well... if (buf[is] == ':') { vname[vlen] = ':'; vlen++; } vname [vlen] = '\000'; // // Now we've got the variable name in vname, we can // go get it's value and copy _that_ into tbuf as well. 
if (internal_trace) fprintf (stderr, "looking up variable >%s<\n", vname); vht_index = crm_vht_lookup (vht, vname, vlen); if (vht[vht_index] == NULL) { // there was no variable by that name, use the // text itself switch (opchar) { case '#': { char lentext[MAX_VARNAME]; int m, mm; // the vlen-2 is because we need to get // rid of the ':' sprintf (lentext, "%ld", vlen-2); mm = strlen (lentext); for (m = 0; m < mm && id < maxlen; m++) { tbuf[id] = lentext[m]; id++; }; } break; } } else { // There really was a variable value by that name. // suck it out, and splice it's text value // if this was :_iso:, update iso's length if (strncmp( (char *) &vht[vht_index]->nametxt[vht[vht_index]->nstart], ":_iso:", 6) == 0) { vht[vht_index]->vlen = tdw->nchars; }; switch (opchar) { case '#': { // // Actually, we want the _length_ of the variable // char lentext[MAX_VARNAME]; int m, mm; sprintf (lentext, "%ld", vht[vht_index]->vlen); mm = strlen (lentext); for (m = 0; m < mm && id < maxlen; m++) { tbuf[id] = lentext[m]; id++; }; }; break; }; }; } // Now, handle the case where we were NOT looking at // :*:c: in buf else { tbuf[id] = buf[is]; id++; } } // and put our results back into buf memcpy (buf, tbuf, id); buf[id] = '\000'; // and because id always gets an extra increment... inlen = id - 1; if (internal_trace) fprintf (stderr, " strlen :#: expansion yields: =%s= len %ld \n", buf, inlen); } // END OF :#: STRING LENGTH EXPANSIONS // Do we have any math expansions? if (exec_bitmask & CRM_EVAL_MATH) { // // handle :@: (math evaluations) // // if (internal_trace) fprintf (stderr, "Doing math expansion\n"); buf[id] = '\000'; if (internal_trace) fprintf (stderr, " length-expand yields: =%s= len %ld\n", buf, inlen); id = 0; for (is = 0; is <= inlen && id < maxlen; is++) { if (is <= inlen - 5 // check only if :*:c:" possible && buf[is] == ':' && ( buf[is+1] == '@' ) && buf[is+2] ==':') { // yes, it's probably an expansion of some sort. 
opchar = buf[is+1]; // copy everything from the colon to the second colon // into the vname buffer. is = is + 2; vname [0] = buf[is]; vlen = 1; is++; while (is < maxlen && is <= inlen && buf [is] != ':') { vname[vlen] = buf[is]; is++; vlen++; }; // // check for the second colon as well... if (buf[is] == ':') { vname[vlen] = ':'; vlen++; } else { nonfatalerror5 ("This math eval didn't end with a ':' which is", " often an error... check it sometime? ", CRM_ENGINE_HERE); }; vname [vlen] = '\000'; // // Now we've got the variable name in vname, we can // go get it's value and copy _that_ into tbuf as well. if (internal_trace) fprintf (stderr, "looking up variable >%s<\n", vname); vht_index = crm_vht_lookup (vht, vname, vlen); if (vht[vht_index] == NULL) { // there was no variable by that name, use the text itself switch (opchar) { case '@': { char mathtext[MAX_VARNAME]; int m, mm; memcpy (mathtext, &vname[1], vlen-2); mathtext[vlen-2] = '\000'; if (internal_trace) fprintf (stderr, "In-Mathtext is -'%s'-\n", mathtext); m = strmath (mathtext, vlen-2, MAX_VARNAME, retstat); if (internal_trace) fprintf (stderr, "Out-Mathtext is -'%s'-\n", mathtext); if (retstat && *retstat < 0) { q = fatalerror5 ("Problem during math evaluation of ", mathtext, CRM_ENGINE_HERE); if (q == 0) return (inlen); goto bailout; } mm = strlen (mathtext); for (m = 0; m < mm && id < maxlen; m++) { tbuf[id] = mathtext[m]; id++; }; } break; } } else { // There really was a variable value by that name. 
// suck it out, and splice it's text value // if this was :_iso:, update iso's length if (strncmp( (char *) &vht[vht_index]->nametxt[vht[vht_index]->nstart], ":_iso:", 6) == 0) { vht[vht_index]->vlen = tdw->nchars; }; switch (opchar) { case '@': { char mathtext[MAX_VARNAME]; int m, mm; m = 0; for (q = 0; q < vht[vht_index]->vlen && m < maxlen; q++) { mathtext[m] = vht[vht_index]->valtxt [(vht[vht_index]->vstart)+q]; m++; } mathtext[vlen-1] = '\000'; m = strmath (mathtext, vlen-2, MAX_VARNAME, retstat ); if (retstat && *retstat < 0) { q = fatalerror5 ("Problem during math evaluation of ", mathtext, CRM_ENGINE_HERE); if (q == 0) return (inlen); goto bailout; } mm = strlen (mathtext); for (m = 0; m < mm && id < maxlen; m++) { tbuf[id] = mathtext[m]; id++; }; } break; }; }; } // Now, handle the case where we were NOT looking at // :*:c: in buf else { tbuf[id] = buf[is]; id++; } } // and put our results back into buf memcpy (buf, tbuf, id); buf [id] = '\000'; inlen = id - 1; if (internal_trace) fprintf (stderr, " math-expand yields: =%s= len %ld\n", buf, inlen); } // END OF :@: MATH EXPANSIONS // That's all, folks! Clean up the temporary buffer. We null-terminate // it in case we need to do stupid non-8-bit-clean IO on it. tbuf[inlen+1] = '\000'; // We reuse tbuf and vname from now on. // free (tbuf); //free (vname); if (internal_trace) { fprintf (stderr, " Returned length from qexpandvar is %ld\n", inlen); if (retstat) fprintf (stderr, "retstat was: %ld\n", *retstat); }; return (inlen); bailout: return (inlen); } //////////////////////////////////////////////////////////////////// // // crm_restrictvar - hand this routine a []-string, and it hands // you back the VHT index, the applicable char* for the start, and // the length. Or maybe an error. // // Error codes: 0 == none, 1 = nonfatal, 2 = fatal // // Algorithm: first "nextword" thing is always the var. Grab it, // and get the pertinent info from the VHT. 
Successive nextwords // get either a /regex/ or a numeric (or possibly a numeric pair). // On each one, do successive regexing/indexranging. When no more // nextwords, you're done. // long crm_restrictvar ( char *boxstring, long boxstrlen, long *vht_idx, char **outblock, long *outoffset, long *outlen, char *errstr) { char datastring[MAX_PATTERN + 1]; long datastringlen; char varname[MAX_PATTERN + 1]; long varnamelen; long vmidx; regex_t preg; regmatch_t matches [MAX_SUBREGEX]; char scanbuf[MAX_PATTERN + 1]; long scanbuflen; long nw_start, nw_len; char *mdw; // the data window that this var is stored in. char *start_ptr; long actual_offset; long actual_len; long in_subscript; long i, j; nw_start = 0; nw_len = 0; if (user_trace) fprintf (stderr, "Performing variable restriction.\n"); // Expand the string we were handed. memcpy (datastring, boxstring, boxstrlen); datastring[boxstrlen] = '\0'; if (user_trace) fprintf (stderr, "Variable before expansion '%s' len %ld\n", datastring, boxstrlen); datastringlen = crm_qexpandvar(datastring, boxstrlen, MAX_PATTERN, NULL); if (user_trace) fprintf (stderr, "Variable after expansion: '%s' len %ld\n", datastring, datastringlen); // Get the variable name. crm_nextword (datastring, datastringlen, nw_start, &nw_start, &nw_len); if (internal_trace) fprintf (stderr, "box-parsing varname got start: %ld, len: %ld .\n", nw_start, nw_len); if (nw_len > 0) { memcpy (varname, &datastring[nw_start], nw_len); varname[nw_len] = '\0'; varnamelen = nw_len; } else { // if no variable, use :_dw: memcpy (varname, ":_dw:", 6); varnamelen = 5; }; if (user_trace) fprintf (stderr, "Using variable '%s' for source.\n", varname); // Got the varname. Do a lookup. vmidx = crm_vht_lookup (vht, varname, varnamelen); if (internal_trace) fprintf (stderr, "vmidx = %ld, vht[vmidx] = %lx\n", (long) vmidx, (long) vht[vmidx]); // Is it a real variable? 
if ( ((void *) vht[vmidx]) == NULL ) { strcpy ( errstr, "This program wants to use a nonexistent variable named: '"); strncat ( errstr, varname, MAX_PATTERN - 128); strcat (errstr, "'"); return (-2); }; // Get the data window - cdw, or tdw. mdw = cdw->filetext; // assume cdw unless otherwise proven... if (vht[vmidx]->valtxt == tdw->filetext) mdw=tdw->filetext; // sanity check - must be tdw or cdw for searching! if (vht[vmidx]->valtxt != tdw->filetext && vht[vmidx]->valtxt != cdw->filetext) { errstr[0] = '\0'; strcat (errstr, "Bogus text block (neither cdw nor tdw) on var "); strcat (errstr, varname); strcat (errstr, "\n"); return (-2); }; if (user_trace) fprintf (stderr, "Found that variable\n"); actual_offset = vht[vmidx]->vstart; actual_len = vht[vmidx]->vlen; // Now, we can go through the remaining terms in the var restriction // and chop the maximal region down (if desired) in_subscript = 0; while ( nw_start <= datastringlen ) { if (user_trace) fprintf (stderr, "Checking restriction at start %ld len %ld (subscr=%ld)\n", nw_start+nw_len, (datastringlen - (nw_start + nw_len)), in_subscript); // get the next word crm_nextword (datastring, datastringlen, nw_start+nw_len, &nw_start, &nw_len); if (internal_trace) fprintf (stderr, "box-parsing left returned start: %ld, len: %ld .\n", nw_start, nw_len); // Are we done? if (nw_len <= 0) { if (user_trace) fprintf (stderr, "Nothing more to do in the var-restrict.\n"); break; } // we need to shred the word (put a NULL at the end so we can // use sscanf on it ) memcpy (scanbuf, &datastring[nw_start], nw_len); scanbuflen = nw_len; scanbuf[scanbuflen] = '\0'; if (internal_trace) fprintf (stderr, " var restrict clause was '%s' len %ld \n", scanbuf, scanbuflen); // Is it int-able? i = sscanf ( scanbuf, "%ld", &j); if (i > 0) { // Check for a negative value of j; negative j would allow // out-of-bounds accessing. if (j < 0) { j = 0; fprintf (stderr, "Var-restriction has negative start or length." 
" Sorry, but negative start/lengths are not " "allowed, as it's a possible security exploit."); }; // Do the offset/length alternation thing. if (in_subscript == 0) { if (actual_len <= j) { if (user_trace) fprintf (stderr, "Clipping start to %ld", actual_len); j = actual_len; }; if (user_trace) fprintf (stderr, "Variable starting offset: %ld\n", j); actual_offset = actual_offset + j; actual_len = actual_len - j; in_subscript = 1; } else { if (actual_len < j) { if (user_trace) fprintf (stderr, "Clipping length to %ld\n", actual_len); j = actual_len; }; if (user_trace) fprintf (stderr, "Variable starting offset: %ld\n", j); actual_len = j; in_subscript = 0; } } else // it's not an int; see if it's a /regex/ { long regex_start; in_subscript = 0; // no longer in subscript-length mode. if (datastring[nw_start] == '/') // yes, it's a start of regex. copy it into the scan buf, // while looking for the closing '/', and keeping // any \/ as / (everything eles is verbatim). { regex_start = nw_start + 1; // regex starts +1 past start of str nw_len = 0; // nw_len is next open char idx. while ( (regex_start < datastringlen && datastring [regex_start] != '/' ) || ( regex_start < datastringlen && datastring [regex_start] == '/' && datastring [regex_start-1] == '\\')) { // overwrite escaped slashes? 
if (datastring[regex_start] == '/') nw_len--; scanbuf[nw_len] = datastring[regex_start]; nw_len++; regex_start++; } scanbuf[nw_len] = '\0'; if (user_trace) fprintf (stderr, "Var restriction with regex '%s' len %ld\n", scanbuf, nw_len); // // Compile up that regex j = crm_regcomp (&preg, scanbuf, nw_len, REG_EXTENDED); if (j > 0) { long curstmt; curstmt = csl->cstmt; crm_regerror ( i, &preg, tempbuf, data_window_size); strcpy (errstr, "Regular Expression Compilation Problem on:"); strncat (errstr, tempbuf, MAX_PATTERN - 128); return (-2); }; if (internal_trace) fprintf (stderr, " Running regexec, start at %ld\n", actual_offset); // Time to run the match start_ptr= &(mdw[actual_offset]); j = crm_regexec ( &preg, start_ptr, actual_len, MAX_SUBREGEX, matches, 0, NULL); crm_regfree (&preg); if (j == 0) { // Yes, the regex matched. Find the innermost // participating submatch, and use that. long i; i = 0; while (matches[i].rm_so >= 0) i++; i--; // Now use the stuff in matches[i] as // data to seet the new limits to our var actual_offset = actual_offset + matches[i].rm_so; actual_len = matches[i].rm_eo - matches[i].rm_so; if (user_trace) fprintf (stderr, " Var restrict regex matched, " "new start offset %ld, new length %ld\n", (long) matches[i].rm_so, (long) matches[i].rm_eo); } else { // The regex didn't match. We're done. Length // is now zero. actual_len = 0; if (user_trace) fprintf (stderr, "Var restrict regex didn't match, " "string is now zero length.\n"); goto all_done; }; } } } all_done: ///////////////////////////// // All calculations done. 
Push actual start and actual length // back onto the output vars if ( outblock ) *outblock = vht[vmidx]->valtxt; if ( outoffset) *outoffset = actual_offset; if ( outlen ) *outlen = actual_len; if (vht_idx) *vht_idx = vmidx; // if (internal_trace) { fprintf (stderr, "Final non-nulls: "); if (vht_idx) fprintf (stderr, " VHTidx %ld", (unsigned long) *vht_idx); if (outblock) fprintf (stderr, " blockaddr %ld", (unsigned long) *outblock); if (outoffset) fprintf (stderr, " startoffset %ld", (unsigned long) *outoffset); if (outlen) fprintf (stderr, " length %ld", (unsigned long) *outlen); fprintf (stderr, "\n"); }; return ( 0 ); } // Common code for LEARN/CLASSIFY/CLUMP/PMULC: parse "box restriction" // from language line (name of input variable, optional selections of // subset of value); if bad do language TRAP/FAIL. // // Return value is what crm_restrictvar() returns. Also returns // pointer to input text through three arguments, and modification of // language state. long crm_exec_box_restriction(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char **txt, long *start, long *len) { char box_text[MAX_PATTERN]; char errstr[MAX_PATTERN]; long ret; // copy text from [] on language line crm_get_pgm_arg (box_text, MAX_PATTERN, apb->b1start, apb->b1len); // Use crm_restrictvar to get start & length to look at. ret = crm_restrictvar(box_text, apb->b1len, NULL, txt, start, len, errstr); if (ret < 0) { long curstmt; curstmt = csl->cstmt; // [non]fatalerror5() always return 0, if they return at all, // so no point looking at it if (ret == -1) (void)nonfatalerror5 (errstr, "", CRM_ENGINE_HERE); if (ret == -2) (void)fatalerror5 (errstr, "", CRM_ENGINE_HERE); // // Did the FAULT handler change the next statement to execute? // If so, continue from there, otherwise, we FAIL. 
if (curstmt == csl->cstmt) { csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = -1; }; }; return ret; } crm114-20100106-BlameMichelson.src/crm_svm.c0000644000000000017500000027030711321154266016620 0ustar rootwsy// crm_svm.c - Support Vector Machine //////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis as file crm_neural_net. In return for addition of // significant derivative work, Jennifer Barry is hereby granted a full // unlimited license to use this code, includng license to relicense under // other licenses. //////////////////////////////////////////////////////////////////////// // // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. #include "crm_svm.h" //static function declarations static Vector *convert_document(char *text, long text_len, unsigned int *features, ARGPARSE_BLOCK *apb); static int compare_features(const void *a, const void *b); //depending on whether SVM_USE_MMAP is set these use mmap or fread static void *map_svm_file(crm_svm_block *blck, char *filename); static void svm_get_meta_data(char *filename, crm_svm_block *blck); static int has_new_vectors(char *filename); static Vector *get_theta_from_svm_file(char *filename, void **st_addr); //these are around for the times when we want to read in the file //without overwriting what we have in memory (ie in a learn during // a classify) static int read_svm_file(crm_svm_block *blck, char *filename); static int read_svm_file_fp(crm_svm_block *blck, FILE *fp); //these always use fwrite. they have to be called sometimes even //when SVM_USE_MMAP is set to grow the file size. 
static size_t write_svm_file(crm_svm_block *blck, char *filename); static size_t write_svm_file_fp(crm_svm_block *blck, FILE *fp); static size_t svm_write_theta(Vector *theta, FILE *fp); //this writes to the mmap'd file in memory if there's room or //forces an unmap and calls append (then the next call to save_changes //or write will grow the file) static size_t append_vector_to_svm_file(Vector *v, char *filename); //this writes everything back to disk using fwrite or unmap as //appropriate. if the file was read, it always uses fwrite. if //the file was mapped in, it tries to alter that memory to have the //correct new values in it and, if it can't, fwrites it. static size_t crm_svm_save_changes(crm_svm_block *blck, void *addr, char *filename); static void crm_svm_block_init(crm_svm_block *blck); static void crm_svm_block_free_data(crm_svm_block blck); static void crm_svm_learn_new_examples(crm_svm_block *blck, int microgroom); //set the debug value //NOTE THAT: SVM_DEBUG_MODE, the variable used in the other svm files //is min(0, svm_trace-1). //see crm_svm_matrix_util.h for details, but a general scheme is //0 will print out nothing //1 will print out enough to update you on the progress //2 is the first setting that prints anything from functions not in this file //3-6 will print out enough info to tell you where the solver is getting // stuck (not that that should happen!) //7 can be used for big runs but only if you know what you're looking for //8-9 should only be used on small runs because they print out big // matrices int svm_trace = 0; //This is a "smart mode" where the SVM trains on examples in the way it //thinks is best for it to learn. //Mainly: // It waits until it has SVM_BASE_EXAMPLES before doing a learn // regardless of whether the user has actually put on an append. // After that it does the incremental algorithm on up to SVM_INCR_FRAC // appended examples. // If more than SVM_INCR_FRAC get appended, it does a from start learn. 
int svm_smart_mode = 0; /**********************CONVERTING TEXT TO FEATURES***************************/ //function to be passed to qsort that compares two features //the sort will be in INCREASING feature value order static int compare_features(const void *a, const void *b) { unsigned int *c = (unsigned int *)a; unsigned int *d = (unsigned int *)b; if (*c < *d) { return -1; } if (*c > *d) { return 1; } return 0; } /******************************************************************* *Helper function to convert text to features. * *INPUT: text: text to convert * text_len: number of characters in text * apb: the argparse block. * *OUTPUT: (features: as pass by reference contains the exact features.) * A vector of the features as a MATR_COMPACT, SPARSE_ARRAY. This * feature vector multiplies the features by their label and adds * in a constant term - both things specific to the SVM. In other * words, if apb contains the flag CRM_REFUTE, the vector will multiply * every feature by -1 (since CRM_REFUTE indicates a feature with * a -1 label). In addition, if SVM_ADD_CONSTANT is set, EVERY vector * returned from this function will have a +/-1 (according to its label) * in the first column. This is to introduce a "constant" value into the * SVM classification, as discussed in the comment to * svm_solve_no_init_sol in crm_svm_lib_fncts.c. * *WARNINGS: *1) You need to free the returned vector (using vector_free) * once you are done with it. *2) The returned vector is NOT just a vector of the features. We do * SVM-specific manipulations to it, specifically, multiplying the * features by their label and adding a column if SVM_ADD_CONSTANT * is set. 
*******************************************************************/ static Vector *convert_document(char *text, long text_len, unsigned int *features, ARGPARSE_BLOCK *apb) { long next_offset; long n_features, i; int class; Vector *v; VectorIterator vit; crm_vector_tokenize_selector (apb, // the APB text, // input string buffer 0, // start offset text_len, // length NULL, // parser regex 0, // parser regex len NULL, // tokenizer coeff array 0, // tokenizer pipeline len 0, // tokenizer pipeline iterations features, // where to put the hashed results MAX_SVM_FEATURES - 1, // max number of hashes &n_features, // how many hashes we actually got &next_offset); // where to start again for more hashes if (apb->sflags & CRM_REFUTE) { //this is a negative example class = -1; } else { class = 1; } if (!n_features) { //blank document if (SVM_ADD_CONSTANT) { v = vector_make_size(1, SPARSE_ARRAY, MATR_COMPACT, 1); vectorit_set_at_beg(&vit, v); vectorit_insert(&vit, 0, class, v); } else { v = vector_make_size(1, SPARSE_ARRAY, MATR_COMPACT, 0); } return v; } //Put the features into a vector qsort(features, n_features, sizeof(unsigned int), compare_features); v = vector_make_size(features[n_features-1]+1, SPARSE_ARRAY, MATR_COMPACT, n_features); vectorit_set_at_beg(&vit, v); //the SVM solver does not incorporate a constant offset //putting this in all of our feature vectors gives that constant offset if (SVM_ADD_CONSTANT) { vectorit_insert(&vit, 0, class, v); } //put the features into the vector, making them unique //if necessary for (i = 0; i < n_features; i++) { if (features[i] == 0) { continue; } vectorit_find(&vit, features[i], v); if (vectorit_curr_col(vit, v) == features[i]) { if (!(apb->sflags & CRM_UNIQUE)) { //if we see something twice and we don't have UNIQUE set //it's entry is 2 (or -2) instead of 1 vectorit_insert(&vit, features[i], vectorit_curr_val(vit, v) + class, v); } } else { vectorit_insert(&vit, features[i], class, v); } } //make v only take up the amount of 
memory it should if (v && v->type == SPARSE_ARRAY) { expanding_array_trim(v->data.sparray); } return v; } /**************************SVM FILE FUNCTIONS*********************************/ /****************************************************************************** * *There are two sets of functions here. One set is used when SVM_USE_MMAP is *defined and, whenever possible, uses crm_mmap and crm_munmap to do file I/O. *The other, used when SVM_USE_MMAP is not defined, uses exclusively fread and *fwrite to do file I/O. When caching is enabled, having SVM_USE_MMAP *will SIGNIFICANTLY speed up file I/O. In addition, using mmap allows shared *file I/O. We there recommend you use SVM_USE_MMAP if possible. * *Note that SVM_USE_MMAP may call fread and fwrite. It uses fwrite when it is *necessary to grow the file. The file approximately doubles in size each *time fwrite is called, so calls to fwrite should, hopefully, amortize out *over the run. It uses fread when it needs to do a learn in classify since *classify shouldn't make changes to the file. This can be avoided by calling *learn without ERASE or APPEND before doing any classifies. * *Without SVM_USE_MMAP enabled, mmap is NEVER called. * *The SVM file is a binary file formatted as follows: * *SVM_FIRST_NBIT bytes: A string or whatever you want defined in * SVM_FIRST_BITS. This isn't a checksum since we don't want to have to read * in the whole file every time in order to verify it - it's simply a stored * value (or even a string) that all SVM stat files have as the first few * bytes to identify them. While there is as much error checking as I can do * in this code, non-SVM binary files can create seg faults by mapping two * vector headers into the same space so that changing one changes part of * another. There is almost nothing I can do about that, so, to eliminate * that problem as far as I can, we have a little "magic string" in front. 
* *N_OFFSETS_IN_SVM_FILE size_t's: * * size_t size: The offset until the end of the actual data stored in the file. * We leave a large hole at the end of the file so we can append to it without * having to uncache it from memory. This is the offset to the beginning of * the hole. When reading the file in, we do not need to read past this * offset since the rest is garbage. This changes each time we append a * vector. * * size_t OLD_OFFSET: The offset in bytes from the BEGINNING of the file to the * matrix of old vectors if it exists. This stays constant through maps and * unmaps, but may change at fwrite. * * size_t NEW_OFFSET: The offset in bytes from the BEGINNING of the file to the * begining of any vectors that have been appended since the last full map * or read. This is "size" at the time of the last read or map. * *N_CONSTANTS_NOT_IN_BLOCK ints: * * int n_new: number of examples that we have read in but HAVEN'T learned on. * Clearly this number cannot include vectors that have been appended since * the last read or map of the file. If n_new > 0, we certainly have new * vectors to learn on, but n_new = 0 DOES NOT indicate no new vectors to * learn on. It is also necessary to seek to NEW_OFFSET and check if there * are vectors there. * *N_CONSTANTS_IN_SVM_BLOCK ints: * * int n_old: number of examples we have learned on that aren't support vectors * * int has_solution: 1 if there is a solution in the file, 0 else * * int n0: number of examples in class 0 * * int n1: number of examples in class 1 * * int n0f: total number of features in class 0 * * int n1f: total number of features in class 1 * * int map_size: the current size of the map that maps matrix rows to their * actual location in the file. This stays larger (usually) than the * number of total vectors in the file because appending wouldn't allow * us to grow at this point it the file. Therefore, we leave a "hole" so * that we don't always have to unmap the file if new vectors have been * appended. 
* *VECTOR MAP: * * A map_size array of ints. The ith entry in VECTOR_MAP is the offset from * the BEGINNING of the file to the ith vector where vectors are in SV, old, * new order. * *DECISION BOUNDARY: * * Decision vector: the decision vector written as a vector * * int fill: the amount of filler we leave to allow the decision boundary to * to grow without having to grow the file. * * void fill: a "hole" allowing the decision vector to grow in size in new * learns. * *RESTART CONSTANTS: * * int num_examples: the total number of examples we've learned on (since the * beginning or the last FROMSTART) * * int max_train_val: sum_c alpha_c <= max_train_val (constant used in * restarting - see crm_svm_lib_fncts.c for details) * *SV MATRIX: * * The support vector matrix header. When the file is written using fwrite, * the support vectors are written after the header. However, since SVs can * change on subsequent learns, the vectors written after SV matrix (if any) * aren't guaranteed to be SVs any more. The VECTOR MAP must be used to * reconstruct the SV MATRIX. * *OLDXY MATRIX (at OLD_OFFSET): * * The oldXy matrix header. The oldXy matrix consists of examples we have * learned on, but that aren't SVs. When the file is written using fwrite, * all rows of oldXy are written after oldXy. However since these rows can * change on subsequent learns, the vectors written after oldXy (if any) * aren't guaranteed to actually be old, non-SV examples. The VECTOR MAP * must be used to reconstruct the OLDXY MATRIX. * *NEW VECTORS YET TO BE LEARNED ON (at NEW_OFFSET or stored in VECTOR_MAP): * * Each new vector is formatted as a vector (ie we don't keep the matrix header * - this makes appending easy). Some of them may not be in the VECTOR MAP if * they have been appended since the last full read/map in. These are all * listed after NEW_OFFSET. 
* *The file is formatted this way to make the following actions quick both using * fread/fwrite and mmap/munmap: * * Finding if the file has a solution: requires a seek to has_solution and a * read of that value. * * Finding the decision boundary if it exists: requires a sequential fread * of N_CONSTANTS_IN_SVM_BLOCK, a seek to DECISION BOUNDARY, reading in the * vector stored there. * * Querying if there are unlearned on vectors: requries a seek to the position * of NEW_OFFSET in the file, a sequential read of NEW_OFFSET and of n_new. * If n_new = 0, requires a seek to NEW_OFFSET. * * Appending a vector: * using fread/fwrite: requires opening the file for appending and writing * out the vector * using mmap/munmap: requires mapping in the file, reading in size and * seeking to point size in the file. if there is room, writes the vector * there. else forcibly munmaps the file and opens it for appending. *****************************************************************************/ //mmap functions #ifdef SVM_USE_MMAP //maps the full file into blck. used before calling learn_new_examples. static void *map_svm_file(crm_svm_block *blck, char *filename) { struct stat statbuf; long act_size; void *addr, *last_addr, *st_addr; Vector *v; size_t old_offset, new_offset, size; int *vmap, fill, curr_rows = 0, n_new, i; if (stat(filename, &statbuf)) { nonfatalerror("Attempt to read from nonexistent svm file", filename); return NULL; } if (!blck) { //this really shouldn't happen fatalerror5("map_svm_file: bad crm_svm_block pointer.", "", CRM_ENGINE_HERE); return NULL; } crm_svm_block_init(blck); addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &act_size); if (addr == MAP_FAILED) { nonfatalerror("Attempt to map svm file failed. The file was", filename); return NULL; } st_addr = addr; if (act_size < sizeof(size_t) + SVM_FIRST_NBIT) { nonfatalerror ("Attempt to read from corrupted svm file. 
It is much too small.", ""); crm_munmap_file(st_addr); return NULL; } if (strncmp(SVM_FIRST_BITS, (char *)st_addr, strlen(SVM_FIRST_BITS))) { nonfatalerror ("Attempt to map from corrupted SVM file. The header is incorrect.", ""); crm_munmap_file(st_addr); return NULL; } addr += SVM_FIRST_NBIT; //this is where the data actually ends size = *((size_t*)addr); if (size > act_size) { //corrupted file nonfatalerror("Attempt to read from corrupted svm file. It thinks it has a larger length than it does. The file is", filename); crm_munmap_file(st_addr); return NULL; } addr += sizeof(size_t); last_addr = st_addr + size; //last address that contains good data if (size < N_OFFSETS_IN_SVM_FILE*sizeof(size_t) + (N_CONSTANTS_IN_SVM_BLOCK + N_CONSTANTS_NOT_IN_BLOCK)*sizeof(int)) { //this is isn't a good file nonfatalerror("Attempt to read from corrupted svm file. It is somewhat too small.", filename); crm_munmap_file(st_addr); return NULL; } old_offset = *((size_t*)addr); //where oldXY header is addr += sizeof(size_t); new_offset = *((size_t*)addr); //where new vectors not in vmap are addr += sizeof(size_t); n_new = *((int *)addr); //# of read in, not learned on vectors addr += sizeof(int); blck->n_old = *((int*)(addr)); //# of learned-on, non-SV vectors addr += sizeof(int); blck->has_solution = *((int*)(addr)); //do we have a solution? addr += sizeof(int); blck->n0 = *((int *)(addr)); //# learned-on examples in class 0 addr += sizeof(int); blck->n1 = *((int *)(addr)); //# learned-on examples in class 1 addr += sizeof(int); blck->n0f = *((int *)(addr)); //# features in class 0 addr += sizeof(int); blck->n1f = *((int *)(addr)); //# features in class 1 addr += sizeof(int); blck->map_size = *((int *)(addr)); //space allocated for vmap addr += sizeof(int); if (addr + sizeof(int)*blck->map_size > last_addr) { nonfatalerror ("Attempt to map from bad svm file. 
It can't fit its own map.", ""); crm_svm_block_init(blck); crm_munmap_file(st_addr); return NULL; } vmap = (int *)addr; //map that tells us where each vector is stored addr += sizeof(int)*blck->map_size; if (blck->has_solution) { //read in the solution blck->sol = (SVM_Solution *)malloc(sizeof(SVM_Solution)); blck->sol->theta = vector_map(&addr, last_addr); //decision boundary blck->sol->SV = NULL; if (addr + sizeof(int) > last_addr) { nonfatalerror ("Attempt to map from bad svm file. It can't fit its solution.", ""); crm_svm_block_free_data(*blck); crm_svm_block_init(blck); crm_munmap_file(st_addr); return NULL; } fill = *((int *)addr); //hole to grow decision boundary addr += sizeof(int); if (!blck->sol->theta || addr + fill + 2*sizeof(int) + sizeof(Matrix) > last_addr) { nonfatalerror ("Attempt to map from bad svm file. It can't fit in the SV matrix.", ""); crm_svm_block_free_data(*blck); crm_svm_block_init(blck); crm_munmap_file(st_addr); return NULL; } addr += fill; //restart constants blck->sol->num_examples = *((int *)addr); addr += sizeof(int); blck->sol->max_train_val = *((int *)addr); addr += sizeof(int); blck->sol->SV = (Matrix *)addr; //SV matrix header addr += sizeof(Matrix); blck->sol->SV->was_mapped = 1; blck->sol->SV->data = (Vector **)malloc(sizeof(Vector *)*blck->sol->SV->rows); if (!blck->sol->SV->data) { nonfatalerror("Unable to allocate enough memory for support vector matrix. This is likely a corrupted SVM file, but we aren't going to be able to recover from it.", ""); crm_svm_block_free_data(*blck); crm_svm_block_init(blck); crm_munmap_file(st_addr); return NULL; } //read in the SV vectors using vmap for (i = 0; i < blck->sol->SV->rows; i++) { addr = st_addr + vmap[i + curr_rows]; blck->sol->SV->data[i] = vector_map(&addr, last_addr); if (!blck->sol->SV->data[i]) { break; } } if (i != blck->sol->SV->rows) { blck->sol->SV->rows = i; nonfatalerror("Attempt to map from bad svm file. 
An SV was wrong somehow.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      crm_munmap_file(st_addr);
      return NULL;
    }
    curr_rows += blck->sol->SV->rows;
  }
  //oldXy matrix
  //(oldXy = examples we already learned on that are not support vectors;
  // the matrix header lives in the map, the row pointers are malloc'd)
  if (blck->n_old) {
    addr = st_addr + old_offset;
    if (addr + sizeof(Matrix) > last_addr) {
      nonfatalerror("Attempt to map from bad svm file. There's no room for the old example matrix.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      crm_munmap_file(st_addr);
      return NULL;
    }
    blck->oldXy = (Matrix *)addr; //oldXy header
    addr += sizeof(Matrix);
    blck->oldXy->was_mapped = 1;
    blck->oldXy->data = (Vector **)malloc(sizeof(Vector *)*blck->oldXy->rows);
    if (!blck->oldXy->data) {
      nonfatalerror("Unable to allocate enough memory for support vector matrix. This is likely a corrupted SVM file.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      crm_munmap_file(st_addr);
      return NULL;
    }
    //read in oldXy vectors using vmap
    //(vmap holds per-row byte offsets from the start of the file)
    for (i = 0; i < blck->oldXy->rows; i++) {
      addr = st_addr + vmap[i + curr_rows];
      blck->oldXy->data[i] = vector_map(&addr, last_addr);
      if (!blck->oldXy->data[i]) {
        break;
      }
    }
    if (i != blck->oldXy->rows) {
      //partial read: clamp rows so free_data only frees what was mapped
      blck->oldXy->rows = i;
      nonfatalerror("Attempt to map from bad svm file. An old example was wrong somehow.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      crm_munmap_file(st_addr);
      return NULL;
    }
    curr_rows += blck->oldXy->rows;
  }
  //newXy vectors
  if (n_new) {
    //read in ones we've already read in and put in vmap
    addr = st_addr + vmap[curr_rows];
    v = vector_map(&addr, last_addr);
    i = 0;
    if (v) {
      blck->newXy = matr_make_size(n_new, v->dim, v->type, v->compact, v->size);
      if (!blck->newXy) {
        nonfatalerror("Attempt to map from bad svm file. An unrecognized new vector type in our new matrix..", "");
        crm_svm_block_free_data(*blck);
        crm_svm_block_init(blck);
        crm_munmap_file(st_addr);
        return NULL;
      }
      matr_shallow_row_copy(blck->newXy, 0, v);
      for (i = 1; i < n_new; i++) {
        addr = st_addr + vmap[i + curr_rows];
        v = vector_map(&addr, last_addr);
        if (!v) {
          break;
        }
        matr_shallow_row_copy(blck->newXy, i, v);
      }
    }
    if (i != n_new) {
      nonfatalerror("Attempt to map from bad svm file. A new vector was wrong somewhere.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      crm_munmap_file(st_addr);
      return NULL;
    }
  }
  addr = st_addr + new_offset;
  //read in any vectors that have been appended since the last map
  if (addr < last_addr) {
    v = vector_map(&addr, last_addr);
    if (v) {
      if (!blck->newXy) {
        blck->newXy = matr_make_size(0, v->dim, v->type, v->compact, v->size);
      }
      if (!blck->newXy) {
        nonfatalerror("Attempt to map from bad svm file. A very new vector had an unrecognized type.", "");
        crm_svm_block_free_data(*blck);
        crm_svm_block_init(blck);
        crm_munmap_file(st_addr);
        return NULL;
      }
      matr_shallow_row_copy(blck->newXy, blck->newXy->rows, v);
      while (addr < last_addr) {
        v = vector_map(&addr, last_addr);
        if (v && v->dim) {
          matr_shallow_row_copy(blck->newXy, blck->newXy->rows, v);
        } else {
          //a zero-dim vector marks the start of the padding hole; stop here
          if (v && !v->dim) {
            vector_free(v);
          }
          break;
        }
      }
    }
  }
  //success: caller must eventually crm_munmap_file(st_addr)
  return st_addr;
}

//gets the integers (like n0, n1, etc) stored in the first few bytes
//of the file without reading in the whole file.
//puts them in blck
//On any failure the counters in blck are reset to their defaults.
static void svm_get_meta_data(char *filename, crm_svm_block *blck) {
  void *addr, *last_addr, *st_addr;
  struct stat statbuf;
  size_t size;
  long act_size;
  if (stat(filename, &statbuf)) {
    //heck, we don't even have a file!
    nonfatalerror ("You are trying to use an SVM to classify from the nonexistant file", filename);
    if (blck) {
      blck->n_old = 0;
      blck->has_solution = 0;
      blck->n0 = 0;
      blck->n1 = 0;
      blck->n0f = 0;
      blck->n1f = 0;
      blck->map_size = SVM_DEFAULT_MAP_SIZE;
    } else {
      fatalerror5("svm_get_meta_data: bad crm_svm_block pointer.", "", CRM_ENGINE_HERE);
    }
    return;
  }
  if (!blck) {
    fatalerror5("svm_get_meta_data: bad crm_svm_block pointer.", "", CRM_ENGINE_HERE);
    return;
  }
  //just always do PROT_READ | PROT_WRITE so that if it's cached we get it
  addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &act_size);
  if (addr == MAP_FAILED || act_size < sizeof(size_t) + SVM_FIRST_NBIT) {
    fatalerror5("Could not map SVM file to get meta data. Something is very wrong and I doubt we can recover. The file is", filename, CRM_ENGINE_HERE);
    if (addr != MAP_FAILED) {
      crm_munmap_file(addr);
    }
    return;
  }
  st_addr = addr;
  if (strncmp(SVM_FIRST_BITS, (char *)addr, strlen(SVM_FIRST_BITS))) {
    nonfatalerror("This svm file is corrupted. The file is", filename);
    blck->n_old = 0;
    blck->has_solution = 0;
    blck->n0 = 0;
    blck->n1 = 0;
    blck->n0f = 0;
    blck->n1f = 0;
    blck->map_size = SVM_DEFAULT_MAP_SIZE;
    crm_munmap_file(st_addr);
    return;
  }
  addr += SVM_FIRST_NBIT;
  size = *((size_t *)addr); //actual size (rest is garbage hole)
  last_addr = st_addr + size;
  //NOTE(review): size (size_t) vs act_size (long) is a signed/unsigned
  //comparison; assumes act_size >= 0 here - true since the map succeeded.
  if (size > act_size || addr + N_OFFSETS_IN_SVM_FILE*sizeof(size_t) + (N_CONSTANTS_IN_SVM_BLOCK + N_CONSTANTS_NOT_IN_BLOCK)*sizeof(int) > last_addr) {
    nonfatalerror("This svm file is corrupted. The file is", filename);
    blck->n_old = 0;
    blck->has_solution = 0;
    blck->n0 = 0;
    blck->n1 = 0;
    blck->n0f = 0;
    blck->n1f = 0;
    blck->map_size = SVM_DEFAULT_MAP_SIZE;
    crm_munmap_file(st_addr);
    return;
  }
  //skip the offset table and the constants that don't live in the block
  addr += N_OFFSETS_IN_SVM_FILE*sizeof(size_t) + N_CONSTANTS_NOT_IN_BLOCK*sizeof(int);
  blck->n_old = *((int *)addr); //# learned-on, non-SV examples
  addr += sizeof(int);
  blck->has_solution = *((int *)addr); //1 if there is a solution
  addr += sizeof(int);
  blck->n0 = *((int *)addr); //# examples in class 0
  addr += sizeof(int);
  blck->n1 = *((int *)addr); //# examples in class 1
  addr += sizeof(int);
  blck->n0f = *((int *)addr); //# features in class 0
  addr += sizeof(int);
  blck->n1f = *((int *)addr); //# features in class 1
  addr += sizeof(int);
  blck->map_size = *((int *)addr); //size of vector map
  crm_munmap_file(st_addr);
}

//returns 1 if the file has vectors that have been appended but not yet
//learned on
//returns 0 else
static int has_new_vectors(char *filename) {
  Vector *v;
  void *addr, *last_addr, *st_addr;
  size_t offset, size;
  int n_new;
  struct stat statbuf;
  long act_size;
  if (stat(filename, &statbuf)) {
    //heck, we don't even have a file!
    return 0;
  }
  //this is PROT_WRITE because, if we read in a vector, we may flip
  //a bit telling us that the vector was mapped in - which tells us what parts
  //of the vector should be freed
  addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &act_size);
  if (addr == MAP_FAILED || act_size < sizeof(size_t) + SVM_FIRST_NBIT) {
    nonfatalerror("There was a problem mapping the svm file in while checking for new vectors. I am going to assume there are no new vectors. The file was", filename);
    if (addr != MAP_FAILED) {
      crm_munmap_file(addr);
    }
    return 0;
  }
  st_addr = addr;
  if (strncmp(SVM_FIRST_BITS, (char *)addr, strlen(SVM_FIRST_BITS))) {
    nonfatalerror("The SVM file is corrupted. I am going to assume it contains no new examples. The file is", filename);
    crm_munmap_file(st_addr);
    return 0;
  }
  addr += SVM_FIRST_NBIT;
  size = *((size_t *)addr); //actual amount of good data
  last_addr = st_addr + size;
  if (size > act_size || addr + 3*sizeof(size_t) + sizeof(int) > last_addr) {
    nonfatalerror("There was a problem mapping the svm file in while checking for new vectors. I am going to assume there are no new vectors. The file was", filename);
    crm_munmap_file(st_addr);
    return 0;
  }
  addr += 2*sizeof(size_t);
  offset = *((size_t *)addr); //offset to new, unread vectors
  addr += sizeof(size_t);
  n_new = *((int *)addr); //number of new, read vectors
  addr += sizeof(int);
  if (n_new) {
    //yep, definitely have new vectors
    crm_munmap_file(st_addr);
    return 1;
  }
  //do we have vectors we haven't read in before but have appended?
  addr = st_addr + offset;
  if (addr >= last_addr) {
    crm_munmap_file(st_addr);
    return 0;
  }
  //do we really have a vector? let's try reading one in
  v = vector_map(&addr, last_addr);
  crm_munmap_file(st_addr);
  if (v) {
    vector_free(v);
    return 1;
  }
  return 0;
}

//returns the decision boundary from an svm file
//we map the decision boundary from the file so you must
// FREE THE DECISION BOUNDARY returned by the function
// MUNMAP THE FILE returned pass-by-reference in *addr
static Vector *get_theta_from_svm_file(char *filename, void **st_addr) {
  Vector *v;
  void *last_addr, *addr;
  size_t size;
  int *hs;
  struct stat statbuf;
  long act_size;
  if (stat(filename, &statbuf)) {
    nonfatalerror("You are trying to read a decision boundary from a file that doesn't exist. The file is", filename);
    return NULL;
  }
  //this is PROT_WRITE because, if we read in theta, we may need to flip
  //a bit telling us that theta was mapped in - which tells us what parts
  //of theta should be freed
  addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &act_size);
  if (addr == MAP_FAILED || act_size < sizeof(size_t) + SVM_FIRST_NBIT) {
    nonfatalerror("Attempt to map svm file while getting decision boundary failed. The file was", filename);
    if (addr != MAP_FAILED) {
      crm_munmap_file(addr);
    }
    *st_addr = NULL;
    return NULL;
  }
  *st_addr = addr;
  if (strncmp(SVM_FIRST_BITS, (char *)addr, strlen(SVM_FIRST_BITS))) {
    nonfatalerror("Attempt to read decision boundary from a corrupt SVM file. The file was", filename);
    crm_munmap_file(*st_addr);
    *st_addr = NULL;
    return NULL;
  }
  addr += SVM_FIRST_NBIT;
  size = *((size_t *)addr);
  last_addr = *st_addr + size;
  if (size > act_size || addr + N_OFFSETS_IN_SVM_FILE*sizeof(size_t) + (N_CONSTANTS_NOT_IN_BLOCK+N_CONSTANTS_IN_SVM_BLOCK)*sizeof(int) > last_addr) {
    nonfatalerror("Attempt to map svm file while getting decision boundary failed. The file was", filename);
    crm_munmap_file(*st_addr);
    *st_addr = NULL;
    return NULL;
  }
  addr += N_OFFSETS_IN_SVM_FILE*sizeof(size_t) + N_CONSTANTS_NOT_IN_BLOCK*sizeof(int);
  //hs points at the in-block constants (indexed by *_INDEX macros)
  hs = (int *)addr;
  addr += sizeof(int)*N_CONSTANTS_IN_SVM_BLOCK;
  if (addr > last_addr || !hs[HAS_SOLUTION_INDEX] || addr + sizeof(int)*hs[MAP_SIZE_INDEX] > last_addr) {
    nonfatalerror("You are trying to read a decision boundary from a file that doesn't contain an SVM solution or is corrupted. The file is", filename);
    crm_munmap_file(*st_addr);
    *st_addr = NULL;
    return NULL;
  }
  //skip the vector map; theta is stored immediately after it
  addr += sizeof(int)*hs[MAP_SIZE_INDEX];
  v = vector_map(&addr, last_addr);
  return v;
}

//fread functions
#else
//reads a binary svm block from a file
//to make the ifdefs work this has the same prototype as the
//function that maps the file in, but it always returns NULL.
//(NOTE(review): in fact it returns the filename pointer on success and
//NULL on failure - callers only test for NULL.)
static void *map_svm_file(crm_svm_block *blck, char *filename) { FILE *fp = fopen(filename, "rb"); int ret; if (!fp) { //this file doesn't exist nonfatalerror("Attempt to read from nonexistent svm file", filename); return NULL; } ret = read_svm_file_fp(blck, fp); fclose(fp); if (ret) { return (void *)filename; } else { return NULL; } } //gets the integers (like n0, n1, etc) stored in the first few bytes //of the file without reading in the whole file. //puts them in blck static void svm_get_meta_data(char *filename, crm_svm_block *blck) { FILE *fp = fopen(filename, "rb"); size_t amount_read; char firstbits[strlen(SVM_FIRST_BITS)]; if (!fp) { //heck, we don't even have a file! nonfatalerror ("You are trying to use an SVM to classify from the nonexistant file", filename); if (blck) { blck->n_old = 0; blck->has_solution = 0; blck->n0 = 0; blck->n1 = 0; blck->n0f = 0; blck->n1f = 0; blck->map_size = SVM_DEFAULT_MAP_SIZE; } else { fatalerror5("svm_get_meta_data: bad crm_svm_block pointer.", "", CRM_ENGINE_HERE); } return; } if (!blck) { fatalerror5("svm_get_meta_data: bad crm_svm_block pointer.", "", CRM_ENGINE_HERE); fclose(fp); return; } amount_read = fread(firstbits, 1, SVM_FIRST_NBIT, fp); if (strncmp(SVM_FIRST_BITS, firstbits, strlen(SVM_FIRST_BITS))) { nonfatalerror("This svm file is corrupted. 
The file is", filename); blck->n_old = 0; blck->has_solution = 0; blck->n0 = 0; blck->n1 = 0; blck->n0f = 0; blck->n1f = 0; blck->map_size = SVM_DEFAULT_MAP_SIZE; fclose(fp); return; } fseek(fp, SVM_FIRST_NBIT + N_OFFSETS_IN_SVM_FILE*sizeof(size_t) + N_CONSTANTS_NOT_IN_BLOCK*sizeof(int), SEEK_SET); amount_read = fread(&(blck->n_old), sizeof(int), 1, fp); amount_read += fread(&(blck->has_solution), sizeof(int), 1, fp); amount_read += fread(&(blck->n0), sizeof(int), 1, fp); amount_read += fread(&(blck->n1), sizeof(int), 1, fp); amount_read += fread(&(blck->n0f), sizeof(int), 1, fp); amount_read += fread(&(blck->n1f), sizeof(int), 1, fp); amount_read += fread(&(blck->map_size), sizeof(int), 1, fp); if (amount_read < N_CONSTANTS_IN_SVM_BLOCK) { nonfatalerror("This svm file is corrupted. The file is", filename); blck->n_old = 0; blck->has_solution = 0; blck->n0 = 0; blck->n1 = 0; blck->n0f = 0; blck->n1f = 0; blck->map_size = SVM_DEFAULT_MAP_SIZE; } fclose(fp); } //returns 1 if the file has vectors that have been appended but not yet //learned on //returns 0 else static int has_new_vectors(char *filename) { FILE *fp = fopen(filename, "rb"); size_t offset, unused, size; Vector *v; int n_new; char firstbits[strlen(SVM_FIRST_BITS)]; if (!fp) { //heck, we don't even have a file! return 0; } unused = fread(firstbits, 1, SVM_FIRST_NBIT, fp); if (strncmp(SVM_FIRST_BITS, firstbits, strlen(SVM_FIRST_BITS))) { nonfatalerror("This svm file is corrupted. I am assuming it has no new vectors since I can't read it. The file is", filename); fclose(fp); return 0; } unused = fread(&size, sizeof(size_t), 1, fp); fseek(fp, sizeof(size_t), SEEK_CUR); unused = fread(&offset, sizeof(size_t), 1, fp); unused = fread(&n_new, sizeof(int), 1, fp); if (n_new) { //we have new vectors fclose(fp); return 1; } fseek(fp, offset, SEEK_SET); if (feof(fp) || ftell(fp) >= size) { //no new vectors fclose(fp); return 0; } //maybe new vectors? 
sometimes the end of a file is a funny place v = vector_read_bin_fp(fp); fclose (fp); if (v) { vector_free(v); if (ftell(fp) <= size) { return 1; } } return 0; } //returns the decision boundary from an svm file //don't forget to free the boundary when you are done with it! static Vector *get_theta_from_svm_file(char *filename, void **st_addr) { FILE *fp = fopen(filename, "rb"); int hs[N_CONSTANTS_IN_SVM_BLOCK]; size_t amount_read, size; Vector *v; char firstbits[strlen(SVM_FIRST_BITS)]; *st_addr = NULL; if (!fp) { nonfatalerror("You are trying to read a decision boundary from a file that doesn't exist. The file is", filename); return NULL; } amount_read = fread(firstbits, 1, SVM_FIRST_NBIT, fp); if (strncmp(SVM_FIRST_BITS, firstbits, strlen(SVM_FIRST_BITS))) { nonfatalerror("This svm file is corrupted. I cannot read a decision boundary from it. The file is", filename); fclose(fp); return NULL; } amount_read = fread(&size, sizeof(size_t), 1, fp); fseek(fp, SVM_FIRST_NBIT+ N_OFFSETS_IN_SVM_FILE*sizeof(size_t) + N_CONSTANTS_NOT_IN_BLOCK*sizeof(int), SEEK_SET); amount_read = fread(hs, sizeof(int), N_CONSTANTS_IN_SVM_BLOCK, fp); if (feof(fp) || ftell(fp) >= size || amount_read < N_CONSTANTS_IN_SVM_BLOCK || !hs[HAS_SOLUTION_INDEX]) { nonfatalerror("You are trying to read a decision boundary from a file that doesn't contain an SVM solution or is corrupted. The file is", filename); return NULL; } fseek(fp, hs[MAP_SIZE_INDEX]*sizeof(int), SEEK_CUR); if (feof(fp) || ftell(fp) >= size) { nonfatalerror("You are trying to read a decision boundary from a file that doesn't contain an SVM solution or is corrupted. The file is", filename); return NULL; } v = vector_read_bin_fp(fp); if (feof(fp) || ftell(fp) >= size) { nonfatalerror("You are trying to read a decision boundary from a file that doesn't contain an SVM solution or is corrupted. The file is", filename); vector_free(v); fclose(fp); return NULL; } fclose(fp); return v; } #endif //functions used to read in the file. 
//these are used both under map and read since we read in a file using fread
//always when we need to do a learn in classify.
static int read_svm_file(crm_svm_block *blck, char *filename) {
  FILE *fp = fopen(filename, "rb");
  int ret;
  if (!fp) {
    nonfatalerror("Attempt to read from nonexistent svm file", filename);
    return 0;
  }
  ret = read_svm_file_fp(blck, fp);
  fclose(fp);
  return ret;
}

//reads a binary svm block from a file
//returns 0 on failure
//On success blck owns freshly malloc'd copies of everything (nothing is
//left pointing into the file).
static int read_svm_file_fp(crm_svm_block *blck, FILE *fp) {
  size_t amount_read, old_offset, new_offset, size;
  Vector *v;
  int *vmap, i, curr_rows = 0, fill, n_new;
  //NOTE(review): sized by strlen but filled with SVM_FIRST_NBIT bytes -
  //assumes SVM_FIRST_NBIT <= strlen(SVM_FIRST_BITS); confirm.
  char firstbits[strlen(SVM_FIRST_BITS)];
  if (!blck) {
    //this really shouldn't happen
    fatalerror5("read_svm_file_fp: bad crm_svm_block pointer.", "", CRM_ENGINE_HERE);
    return 0;
  }
  if (!fp) {
    nonfatalerror("Attempt to read svm from bad file pointer.", "");
    return 0;
  }
  //start from a clean block
  crm_svm_block_free_data(*blck);
  crm_svm_block_init(blck);
  amount_read = fread(firstbits, 1, SVM_FIRST_NBIT, fp);
  if (strncmp(SVM_FIRST_BITS, firstbits, strlen(SVM_FIRST_BITS))) {
    nonfatalerror("This svm file is corrupted. I cannot read it.", "");
    return 0;
  }
  //header: total size, offset of oldXy, offset of appended vectors,
  //then the per-block constants
  amount_read = fread(&size, sizeof(size_t), 1, fp);
  amount_read = fread(&old_offset, sizeof(size_t), 1, fp);
  amount_read = fread(&new_offset, sizeof(size_t), 1, fp);
  amount_read = fread(&n_new, sizeof(int), 1, fp);
  amount_read = fread(&(blck->n_old), sizeof(int), 1, fp);
  amount_read += fread(&(blck->has_solution), sizeof(int), 1, fp);
  amount_read += fread(&(blck->n0), sizeof(int), 1, fp);
  amount_read += fread(&(blck->n1), sizeof(int), 1, fp);
  amount_read += fread(&(blck->n0f), sizeof(int), 1, fp);
  amount_read += fread(&(blck->n1f), sizeof(int), 1, fp);
  amount_read += fread(&(blck->map_size), sizeof(int), 1, fp);
  if ((amount_read < N_CONSTANTS_IN_SVM_BLOCK) || ftell(fp) > size) {
    nonfatalerror("Attempt to read from bad svm file", "");
    crm_svm_block_init(blck);
    return 0;
  }
  //NOTE(review): this malloc and the short-read of vmap below are
  //unchecked; a hostile map_size could make this fail - verify upstream
  //validation.
  vmap = (int *)malloc(sizeof(int)*blck->map_size);
  amount_read = fread(vmap, sizeof(int), blck->map_size, fp);
  //read in solution
  if (blck->has_solution) {
    blck->sol = (SVM_Solution *)malloc(sizeof(SVM_Solution));
    blck->sol->theta = vector_read_bin_fp(fp);
    blck->sol->SV = NULL;
    //fill = size of the padding left after theta so it can grow in place
    amount_read = fread(&fill, sizeof(int), 1, fp);
    fseek(fp, fill, SEEK_CUR);
    if (!blck->sol->theta || !amount_read || feof(fp) || ftell(fp) > size) {
      //die!
      nonfatalerror("Attempt to read from bad svm file.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      free(vmap);
      return 0;
    }
    amount_read = fread(&(blck->sol->num_examples), sizeof(int), 1, fp);
    amount_read += fread(&(blck->sol->max_train_val), sizeof(int), 1, fp);
    if (amount_read < 2) {
      //die!
      nonfatalerror("Attempt to read from bad svm file.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      free(vmap);
      return 0;
    }
    blck->sol->SV = (Matrix *)malloc(sizeof(Matrix));
    amount_read = fread(blck->sol->SV, sizeof(Matrix), 1, fp);
    blck->sol->SV->was_mapped = 0;
    if (!amount_read) {
      //die!
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      free(vmap);
      return 0;
    }
    //row pointers are re-read individually via the vector map
    blck->sol->SV->data = (Vector **)malloc(sizeof(Vector *)*blck->sol->SV->rows);
    for (i = 0; i < blck->sol->SV->rows; i++) {
      fseek(fp, vmap[i + curr_rows], SEEK_SET);
      blck->sol->SV->data[i] = vector_read_bin_fp(fp);
      if (!blck->sol->SV->data[i]) {
        //oh boy, bad file
        break;
      }
    }
    if (i != blck->sol->SV->rows) {
      //clamp rows so free_data only frees the rows actually read
      blck->sol->SV->rows = i;
      nonfatalerror("Attempt to read from bad SVM file.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      free(vmap);
      return 0;
    }
    curr_rows += blck->sol->SV->rows;
  }
  //read in oldXy
  if (blck->n_old) {
    fseek(fp, old_offset, SEEK_SET);
    blck->oldXy = (Matrix *)malloc(sizeof(Matrix));
    amount_read = fread(blck->oldXy, sizeof(Matrix), 1, fp);
    blck->oldXy->was_mapped = 0;
    blck->oldXy->data = (Vector **)malloc(sizeof(Vector *)*blck->oldXy->rows);
    for (i = 0; i < blck->oldXy->rows; i++) {
      fseek(fp, vmap[i + curr_rows], SEEK_SET);
      blck->oldXy->data[i] = vector_read_bin_fp(fp);
      if (!blck->oldXy->data[i]) {
        //oh boy, bad file
        break;
      }
    }
    if (i != blck->oldXy->rows) {
      blck->oldXy->rows = i;
      nonfatalerror("Attempt to read from bad SVM file.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      free(vmap);
      return 0;
    }
    curr_rows += blck->oldXy->rows;
  }
  //read in parts of newXy we've seen before
  if (n_new) {
    fseek(fp, vmap[curr_rows], SEEK_SET);
    v = vector_read_bin_fp(fp);
    i = 0;
    if (v) {
      blck->newXy = matr_make_size(n_new, v->dim, v->type, v->compact, v->size);
      if (!blck->newXy) {
        nonfatalerror("Attempt to map from bad svm file.", "");
        crm_svm_block_free_data(*blck);
        crm_svm_block_init(blck);
        free(vmap);
        return 0;
      }
      matr_shallow_row_copy(blck->newXy, 0, v);
      for (i = 1; i < n_new; i++) {
        fseek(fp, vmap[curr_rows+i], SEEK_SET);
        v = vector_read_bin_fp(fp);
        if (!v) {
          break;
        }
        matr_shallow_row_copy(blck->newXy, i, v);
      }
    }
    if (i != n_new) {
      nonfatalerror("Attempt to read from bad SVM file.", "");
      crm_svm_block_free_data(*blck);
      crm_svm_block_init(blck);
      free(vmap);
      return 0;
    }
  }
  //read in new vectors
  //(appended after the last save; they live past new_offset)
  fseek(fp, new_offset, SEEK_SET);
  if (!feof(fp) && ftell(fp) < size) {
    v = vector_read_bin_fp(fp);
    if (v && v->dim) {
      if (!(blck->newXy)) {
        blck->newXy = matr_make_size(0, v->dim, v->type, v->compact, v->size);
      }
      if (!blck->newXy) {
        nonfatalerror("Attempt to map from bad svm file.", "");
        crm_svm_block_free_data(*blck);
        crm_svm_block_init(blck);
        free(vmap);
        return 0;
      }
      matr_shallow_row_copy(blck->newXy, blck->newXy->rows, v);
      while (!feof(fp) && ftell(fp) < size) {
        v = vector_read_bin_fp(fp);
        if (v && v->dim) {
          matr_shallow_row_copy(blck->newXy, blck->newXy->rows, v);
        } else {
          //zero-dim vector = start of the padding hole; stop
          if (v && !v->dim) {
            vector_free(v);
          }
          break;
        }
      }
    } else if (v) {
      vector_free(v);
    }
  }
  free(vmap);
  return 1;
}

//fwrite functions. used by both read and mmap modes since under mmap it
//is sometimes necessary to grow the file

//writes an svm block to a file in binary format
//returns the number of bytes written
//WARNING: this function creates (and removes) a temporary file to avoid
//map/fwrite issues.
static size_t write_svm_file(crm_svm_block *blck, char *filename) {
  //this is tricky because the file may be mmapped in
  //and we may want to be writing some of that back out
  //so we write it to a temporary file
  //then unmap the file
  //then rename the temporary file
  char tmpfilename[MAX_PATTERN];
  FILE *fp;
  size_t size;
  int i, lim;
  //figure out what directory filename is in
  for (i = strlen(filename); i > 0; i--) {
    if (filename[i-1] == '/') {
      break;
    }
  }
  if (!i) {
    //no directory component: use the current directory
    tmpfilename[0] = '.';
    tmpfilename[1] = '/';
    i = 2;
  } else {
    //copy just the directory prefix (terminator added below after the Xs)
    strncpy(tmpfilename, filename, i);
  }
  //append the XXXXXX template mkstemp requires
  lim = i+6;
  for ( ; i < lim; i++) {
    tmpfilename[i] = 'X';
  }
  tmpfilename[lim] = '\0';
  //create a temporary file in that directory
  lim = mkstemp(tmpfilename);
  if (lim < 0) {
    if (svm_trace) {
      perror("Error opening temporary file");
    }
    fatalerror5("Error opening a temporary file. Your directory may be too full or some other problem, but this will really mess things up.\n", "", CRM_ENGINE_HERE);
    return 0;
  } else {
    close(lim);
  }
  fp = fopen(tmpfilename, "wb");
  if (!fp) {
    fatalerror5("Error opening a temporary file. Your directory may be too full or some other problem, but this will really mess things up.\n", "", CRM_ENGINE_HERE);
    return 0;
  }
  size = write_svm_file_fp(blck, fp);
  fclose(fp);
#ifdef SVM_USE_MMAP
  //do the unmap AFTER since blck probably has memory somewhere in that mmap
  crm_force_munmap_filename(filename);
#endif
  //delete the old file
  if (unlink(filename)) {
    if (svm_trace) {
      perror("Error deleting out-of-date svm file");
    }
    unlink(tmpfilename);
    return 0;
  }
  //now rename our temporary file to be the old file
  if (rename(tmpfilename, filename)) {
    if (svm_trace) {
      perror("Error renaming temporary file");
    }
    unlink(tmpfilename);
    fatalerror5("Could not copy from the temporary file to the new svm file. Perhaps you don't have write permissions? Whatever is going on, we are unlikely to be able to recover from it.", "", CRM_ENGINE_HERE);
    return 0;
  }
  return size;
}

//writes an svm block to a file in binary format
//returns the number of bytes written
//doesn't munmap the file since it doesn't have a file name!!
//frees blck
//NOTE(review): 'size' is accumulated as (item size)*(items written), which
//assumes every fwrite writes everything it was asked to; short writes are
//not detected here.
static size_t write_svm_file_fp(crm_svm_block *blck, FILE *fp) {
  size_t size = MAX_INT_VAL, unused;
  int i, curr_rows = 0, nv = 0, n_new;
  int *vmap;
  //empty placeholder matrix used to reserve space when a section is absent
  Matrix *M = matr_make(0, 0, SPARSE_ARRAY, MATR_COMPACT);
#ifndef SVM_USE_MMAP
  size_t tmp;
#endif
  if (!blck) {
    fatalerror5("write_svm_file: attempt to write NULL block.", "", CRM_ENGINE_HERE);
    return 0;
  }
  if (!fp) {
    nonfatalerror("Trying to write an svm file to a null file pointer.", "");
    return 0;
  }
  if (blck->sol) {
    blck->has_solution = 1;
  } else {
    blck->has_solution = 0;
  }
  //nv = total number of vectors that need a slot in the vector map
  if (blck->sol && blck->sol->SV) {
    nv += blck->sol->SV->rows;
  }
  if (blck->oldXy) {
    nv += blck->oldXy->rows;
  }
  if (blck->newXy) {
    nv += blck->newXy->rows;
    n_new = blck->newXy->rows;
  } else {
    n_new = 0;
  }
  while (nv > blck->map_size) {
    //grow the map if we need to
    if (!(blck->map_size)) {
      blck->map_size = 1;
    }
    blck->map_size *= 2;
  }
  vmap = (int *)malloc(sizeof(int)*blck->map_size);
  //header: magic, then three size_t slots (total size, oldXy offset,
  //appended-vector offset - patched up later via fseek), then constants
  size = sizeof(char)*fwrite(SVM_FIRST_BITS, 1, SVM_FIRST_NBIT, fp);
  size += sizeof(size_t)*fwrite(&size, sizeof(size_t), 1, fp);
  size += sizeof(size_t)*fwrite(&size, sizeof(size_t), 1, fp);
  size += sizeof(size_t)*fwrite(&size, sizeof(size_t), 1, fp);
  size += sizeof(int)*fwrite(&n_new, sizeof(int), 1, fp);
  size += sizeof(int)*fwrite(&(blck->n_old), sizeof(int), 1, fp);
  size += sizeof(int)*fwrite(&(blck->has_solution), sizeof(int), 1, fp);
  size += sizeof(int)*fwrite(&(blck->n0), sizeof(int), 1, fp);
  size += sizeof(int)*fwrite(&(blck->n1), sizeof(int), 1, fp);
  size += sizeof(int)*fwrite(&(blck->n0f), sizeof(int), 1, fp);
  size += sizeof(int)*fwrite(&(blck->n1f), sizeof(int), 1, fp);
  size += sizeof(int)*fwrite(&(blck->map_size), sizeof(int), 1, fp);
  //vector map
  //(placeholder contents; the real offsets are seeked back and written
  //once every vector's position is known)
  size += sizeof(int)*fwrite(vmap, sizeof(int), blck->map_size, fp);
  if (blck->sol) {
    //write theta
    size += svm_write_theta(blck->sol->theta, fp);
    //write the constants
    size += sizeof(int)*fwrite(&(blck->sol->num_examples), sizeof(int), 1, fp);
    size += sizeof(int)*fwrite(&(blck->sol->max_train_val), sizeof(int), 1, fp);
    //write out the matrix
    size += sizeof(Matrix)*fwrite(blck->sol->SV, sizeof(Matrix), 1, fp);
    for (i = 0; i < blck->sol->SV->rows; i++) {
      vmap[i + curr_rows] = size;
      size += vector_write_bin_fp(blck->sol->SV->data[i], fp);
    }
    curr_rows += blck->sol->SV->rows;
  } else {
    //leave room for the solution
    size += svm_write_theta(NULL, fp);
    size += sizeof(int)*fwrite(&curr_rows, sizeof(int), 1, fp);
    i = SVM_MAX_X_VAL;
    size += sizeof(int)*fwrite(&i, sizeof(int), 1, fp);
    size += sizeof(Matrix)*fwrite(M, sizeof(Matrix), 1, fp);
  }
  //this is where the oldXy matrix is stored
  fseek(fp, SVM_FIRST_NBIT+sizeof(size_t), SEEK_SET);
  unused = fwrite(&size, sizeof(size_t), 1, fp);
  fseek(fp, size, SEEK_SET);
  if (blck->oldXy) {
    size += sizeof(Matrix)*fwrite(blck->oldXy, sizeof(Matrix), 1, fp);
    for (i = 0; i < blck->oldXy->rows; i++) {
      vmap[i+curr_rows] = size;
      size += vector_write_bin_fp(blck->oldXy->data[i], fp);
    }
    curr_rows += blck->oldXy->rows;
  } else {
    size += sizeof(Matrix)*fwrite(M, sizeof(Matrix), 1, fp);
  }
  if (blck->newXy && blck->newXy->data) {
    for (i = 0; i < blck->newXy->rows; i++) {
      if (blck->newXy->data[i]) {
        vmap[i+curr_rows] = size;
        size += vector_write_bin_fp(blck->newXy->data[i], fp);
      }
    }
    curr_rows += blck->newXy->rows;
  }
  //this tells you where the data in the file ends
  fseek(fp, SVM_FIRST_NBIT, SEEK_SET);
#ifdef SVM_USE_MMAP
  unused = fwrite(&size, sizeof(size_t), 1, fp);
#else
  //fread mode never appends in place, so mark the end as "huge"
  tmp = MAX_INT_VAL;
  unused = fwrite(&tmp, sizeof(size_t), 1, fp);
#endif
  fseek(fp, sizeof(size_t), SEEK_CUR);
  //this tells you the offset to appended vectors
  //so you can check if there *are* new vectors quickly
  unused = fwrite(&size, sizeof(size_t), 1, fp);
  //now we actually have vmap
  //so write it out
  fseek(fp, SVM_FIRST_NBIT + N_OFFSETS_IN_SVM_FILE*sizeof(size_t) + N_CONSTANTS_NOT_IN_BLOCK*sizeof(int) + N_CONSTANTS_IN_SVM_BLOCK*sizeof(int), SEEK_SET);
  unused = fwrite(vmap, sizeof(int), curr_rows, fp);
  free(vmap);
#ifdef SVM_USE_MMAP
  //now leave a nice big hole
  //so we can add lots of nice vectors
  //without changing the file size
  if (SVM_HOLE_FRAC > 0) {
    fseek(fp, 0, SEEK_END);
    //NOTE(review): hole contents are uninitialized malloc'd bytes;
    //presumably fine since everything past 'size' is treated as garbage.
    vmap = malloc((int)(SVM_HOLE_FRAC*size));
    size += fwrite(vmap, 1, (int)(SVM_HOLE_FRAC*size), fp);
    free(vmap);
  }
#endif
  matr_free(M);
  crm_svm_block_free_data(*blck);
  crm_svm_block_init(blck);
  return size;
}

//writes theta to a file, leaving it room to grow
//The padding size (an int) is written after theta, followed by that many
//filler bytes, so theta can later be rewritten in place.
static size_t svm_write_theta(Vector *theta, FILE *fp) {
  int dec_size = MATR_DEFAULT_VECTOR_SIZE*sizeof(double);
  size_t size = 0, theta_written, theta_size;
  void *filler = NULL;
  if (!fp) {
    if (svm_trace) {
      fprintf(stderr, "svm_write_theta: null file pointer.\n");
    }
    return 0;
  }
  if (theta) {
    theta_size = vector_size(theta);
    //round the reserved area up to the next power of two above theta's size
    while (theta_size >= dec_size) {
      if (!(dec_size)) {
        dec_size = 1;
      }
      dec_size *= 2;
    }
    theta_written = vector_write_bin_fp(theta, fp);
  } else {
    theta_written = 0;
  }
  size += theta_written;
  dec_size -= theta_written;
  if (dec_size > 0) {
    //NOTE(review): filler is uninitialized and the malloc is unchecked;
    //the if (filler) below skips the write on allocation failure.
    filler = malloc(dec_size);
  } else {
    dec_size = 0;
  }
  size += sizeof(int)*fwrite(&dec_size, sizeof(int), 1, fp);
  if (filler) {
    size += fwrite(filler, 1, dec_size, fp);
    free(filler);
  }
  return size;
}

//appends a vector to the svm file to be learned on later without
//reading in the whole file
//frees the vector
static size_t append_vector_to_svm_file(Vector *v, char *filename) {
  FILE *fp;
  crm_svm_block blck;
  int exists = 0;
  long size;
#ifdef SVM_USE_MMAP
  size_t data_ends, vsize;
  int ret;
  void *addr, *last_addr, *new_addr, *st_addr;
  struct stat statbuf;
  if (!v) {
    nonfatalerror("Something is wrong with the new input. I think it is NULL. I am not trying to append it.", "");
    return 0;
  }
  //do we have space to write this vector without forcing an unmap?
  if (!stat(filename, &statbuf)) {
    if (statbuf.st_size > 0) {
      exists = 1;
      addr = crm_mmap_file(filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, &size);
      if (addr == MAP_FAILED || size < sizeof(size_t) + SVM_FIRST_NBIT) {
        vector_free(v);
        fatalerror5("Unable to map SVM file in order to append a vector. Something is very wrong and we are unlikely to be able to recover. The file is", filename, CRM_ENGINE_HERE);
        return 0;
      }
      st_addr = addr;
      last_addr = st_addr+size;
      if (strncmp(SVM_FIRST_BITS, (char *)addr, strlen(SVM_FIRST_BITS))) {
        nonfatalerror("I think this SVM file is corrupted. You may want to stop now and rerun this test with an uncorrupted file. For now, I'm not going to touch it. The file is", filename);
        crm_munmap_file(st_addr);
        vector_free(v);
        return 0;
      }
      addr += SVM_FIRST_NBIT;
      data_ends = *((size_t *)addr);
      vsize = vector_size(v);
      //no matter what, the data now ends here
      //it's important to mark that
      if (data_ends <= size) {
        *((size_t *)addr) = data_ends + vsize;
      } else {
        *((size_t *)addr) = size + vsize;
      }
      if (data_ends < size && st_addr + data_ends + vsize <= last_addr) {
        //we have room to write the vector
        //so add it
        //(return value of vector_memmove is not needed here)
        new_addr = vector_memmove(st_addr + data_ends, v);
        vector_free(v);
        crm_munmap_file(st_addr);
        return vsize;
      }
      //we don't have room to write the vector
      //get rid of the hole
      crm_munmap_file(st_addr);
      if (data_ends < size) {
        //NOTE(review): truncate's return value is captured but ignored
        ret = truncate(filename, data_ends);
      } else if (data_ends > size) {
        nonfatalerror("I think this SVM file is corrupted. You may want to stop now and rerun this test with an uncorrupted file. For now, I'm not going to touch it. The file is", filename);
        vector_free(v);
        return 0;
      }
    }
  }
#else
  fp = fopen(filename, "rb");
  if (fp) {
    exists = 1;
    fclose(fp);
  }
#endif
  if (!exists) {
    if (svm_trace) {
      fprintf(stderr, "Creating new stat file.\n");
    }
    //the file doesn't exist yet
    //we'll create it!
    //note that leaving this as open for appending instead
    //of writing creates problems. i'm not sure why.
fp = fopen(filename, "wb"); crm_svm_block_init(&blck); blck.newXy = matr_make_size(1, v->dim, v->type, v->compact, v->size); if (!blck.newXy) { nonfatalerror("Attempt to append bad vector to SVM file.", ""); fclose(fp); return 0; } matr_shallow_row_copy(blck.newXy, 0, v); size = write_svm_file_fp(&blck, fp); fclose(fp); return size; } #ifdef SVM_USE_MMAP //force an unmap if it is mapped //append this vector to the file crm_force_munmap_filename(filename); #endif fp = fopen(filename, "ab"); size = vector_write_bin_fp(v, fp); vector_free(v); #ifdef SVM_USE_MMAP if (SVM_HOLE_FRAC > 0) { if (svm_trace) { fprintf(stderr, "Appending hole of size %d to file.\n", (int)(SVM_HOLE_FRAC*statbuf.st_size)); } new_addr = malloc((int)(SVM_HOLE_FRAC*statbuf.st_size)); size += fwrite(new_addr, 1, (int)(SVM_HOLE_FRAC*statbuf.st_size), fp); free(new_addr); } #endif fclose(fp); return size; } //this function writes the changes that have been made to blck //to disk //if addr is NULL, it will fwrite blck to filename //if blck was mapped in, it will attempt to write things back into //memory and //if this isn't possible it will force a fwrite the file //this frees all data associated with blck static size_t crm_svm_save_changes(crm_svm_block *blck, void *addr, char *filename) { size_t old_offset, theta_room, theta_req, size; void *curr = addr, *prev, *last_addr; crm_svm_block old_block; struct stat statbuf; int nv = 0, i, *vmap, curr_rows = 0; #ifndef SVM_USE_MMAP return write_svm_file(blck, filename); #endif if (!addr) { nonfatalerror("Attempting to save a file to a NULL address. Probably the original file was corrupted and couldn't be read. The file is", filename); return 0; } if (stat(filename, &statbuf)) { //ok this is really wrong fatalerror5("svm save changes: the file you are trying to save to doesn't exist. This is unrecoverable. 
The file is", filename, CRM_ENGINE_HERE); return 0; } if (statbuf.st_size < sizeof(size_t) + SVM_FIRST_NBIT) { if (svm_trace) { fprintf(stderr, "Writing file because it is waaaay too small.\n"); } return write_svm_file(blck, filename); } if (strncmp(SVM_FIRST_BITS, (char *)addr, strlen(SVM_FIRST_BITS))) { nonfatalerror("The magic string of the file I am trying to save isn't what I think it should be. This probably indicates that the file is corrupted and I shouldn't touch it so I won't. The file is", filename); return 0; } curr += SVM_FIRST_NBIT; size = *((size_t *)curr); curr += sizeof(size_t); if (size + sizeof(double)*MATR_DEFAULT_VECTOR_SIZE >= statbuf.st_size) { //we have no more room to append vectors to this file //so write it out now //otherwise size won't change if (svm_trace) { fprintf(stderr, "Writing file to leave a hole at the end.\n"); } return write_svm_file(blck, filename); } last_addr = addr + size; //if we are going to unmap the file, old_offset won't change //since oldXy will be in the same place old_offset = *((size_t *)curr); curr += sizeof(size_t); //new_offset, however, will go away because we now have locations //for all of the "new vectors". 
that we need to do a learn is //marked with a non_zero n_new *((size_t *)curr) = size; curr += sizeof(size_t); //make all of the constants correct if (blck->sol) { blck->has_solution = 1; } else { blck->has_solution = 0; } if (blck->sol && blck->sol->SV) { nv += blck->sol->SV->rows; } if (blck->oldXy) { nv += blck->oldXy->rows; } if (blck->newXy) { nv += blck->newXy->rows; } while (nv > blck->map_size) { if (!(blck->map_size)) { blck->map_size = 1; } blck->map_size *= 2; } if (blck->newXy) { *((int *)curr) = blck->newXy->rows; } else { *((int *)curr) = 0; } curr += sizeof(int); old_block.n_old = *((int *)curr); *((int *)curr) = blck->n_old; curr += sizeof(int); old_block.has_solution = *((int *)curr); *((int *)curr) = blck->has_solution; curr += sizeof(int); old_block.n0 = *((int *)curr); *((int *)curr) = blck->n0; curr += sizeof(int); old_block.n1 = *((int *)curr); *((int *)curr) = blck->n1; curr += sizeof(int); old_block.n0f = *((int *)curr); *((int *)curr) = blck->n0f; curr += sizeof(int); old_block.n1f = *((int *)curr); *((int *)curr) = blck->n1f; curr += sizeof(int); old_block.map_size = *((int *)curr); *((int *)curr) = blck->map_size; curr += sizeof(int); if (blck->map_size > old_block.map_size) { //we don't have enough room to do a vector map //we need to write out the file if (svm_trace) { fprintf(stderr, "Writing svm file to grow map size from %d to %d.\n", old_block.map_size, blck->map_size); } return write_svm_file(blck, filename); } //this is the map we will fill in vmap = curr; //do we have room to write theta? curr += sizeof(int)*blck->map_size; //keep where theta starts prev = curr; //this is how much room for theta if (old_block.has_solution) { theta_room = vector_size((Vector *)curr); } else { theta_room = 0; } curr += theta_room; theta_room += *((int *)curr); curr = prev; //how much room will theta actually take? 
if (blck->has_solution && blck->sol && blck->sol->theta) { theta_req = vector_size(blck->sol->theta); } else { theta_req = 0; } if (curr + theta_room > last_addr || theta_room < theta_req) { //we don't have enough room in the file to write //the decision boundary //so we need to use fwrite if (svm_trace) { fprintf (stderr, "Writing file to grow decision boundary size from %lu to %lu.\n", theta_room, theta_req); } return write_svm_file(blck, filename); } //we have enough room to unmap the solution to this file //let's do it! //write the new solution boundary if (blck->has_solution && blck->sol) { if (blck->sol->theta) { //copy over the decision boundary //it is possible that curr and blck->sol->theta //overlap if we didn't actually do a learn //so use memmove NOT memcpy prev = vector_memmove(curr, blck->sol->theta); } //leave a marker to let us know how much filler space we have *((int *)prev) = theta_room-theta_req; //keep the filler! curr += theta_room + sizeof(int); //write in the solution constants if (blck->has_solution && blck->sol) { *((int *)curr) = blck->sol->num_examples; } curr += sizeof(int); if (blck->has_solution && blck->sol) { *((int *)curr) = blck->sol->max_train_val; } curr += sizeof(int); if (blck->sol->SV) { //copy the matrix header *((Matrix *)curr) = *(blck->sol->SV); //now use the map (remember back where we stored it in vmap?) 
//to record which of the vectors (already somewhere in this chunk //of memory) belong to this matrix for (i = 0; i < blck->sol->SV->rows; i++) { //vmap stores offsets from the beginning of the file if (((void *)blck->sol->SV->data[i]) < addr || ((void *)blck->sol->SV->data[i]) > last_addr) { //oh oh, something is very wrong //give up and write the file if (svm_trace) { fprintf(stderr, "save_changes: somehow a vector is outside the mapped memory.\n"); } return write_svm_file(blck, filename); } vmap[i + curr_rows] = ((void *)blck->sol->SV->data[i]) - addr; } curr_rows += blck->sol->SV->rows; } } if (blck->n_old && blck->oldXy && blck->oldXy->data) { curr = addr + old_offset; //note that this shouldn't change! *((Matrix *)curr) = *(blck->oldXy); for (i = 0; i < blck->oldXy->rows; i++) { if (((void *)blck->oldXy->data[i]) < addr || ((void *)blck->oldXy->data[i]) > last_addr) { //whoops if (svm_trace) { fprintf(stderr, "save_changes: somehow a vector is outside the mapped memory.\n"); } return write_svm_file(blck, filename); } vmap[i + curr_rows] = ((void *)blck->oldXy->data[i]) - addr; } curr_rows += blck->oldXy->rows; } if (blck->newXy) { //newXy isn't saved as a matrix //since new vectors come and go all the time for (i = 0; i < blck->newXy->rows; i++) { if (((void *)blck->newXy->data[i]) < addr || ((void *)blck->newXy->data[i]) > last_addr) { if (svm_trace) { fprintf(stderr, "save_changes: somehow a vector is outside the mapped memory.\n"); } return write_svm_file(blck, filename); } vmap[i + curr_rows] = ((void *)blck->newXy->data[i]) - addr; } } //whew! 
// we made it
crm_svm_block_free_data(*blck);
crm_svm_block_init(blck);
crm_munmap_file(addr);
return size;
}

/***************************SVM BLOCK FUNCTIONS*******************************/

//initializes an svm block
// Resets every field to its "empty" state: all matrix/solution pointers
// become NULL, all counters zero, and the vector-map size returns to the
// compile-time default.  Safe to call on an uninitialized block.
static void crm_svm_block_init(crm_svm_block *blck) {
  blck->sol = NULL;
  blck->newXy = NULL;
  blck->oldXy = NULL;
  blck->n_old = 0;
  blck->has_solution = 0;
  blck->n0 = 0;
  blck->n1 = 0;
  blck->n0f = 0;
  blck->n1f = 0;
  blck->map_size = SVM_DEFAULT_MAP_SIZE;
}

//frees all data associated with a block
// NOTE(review): takes the block BY VALUE, so the caller's pointer fields
// are left dangling after this returns - callers in this file pair it with
// crm_svm_block_init() to re-NULL everything.
static void crm_svm_block_free_data(crm_svm_block blck) {
  if (blck.sol) {
    svm_free_solution(blck.sol);
  }
  if (blck.oldXy) {
    matr_free(blck.oldXy);
  }
  if (blck.newXy) {
    matr_free(blck.newXy);
  }
}

/***************************LEARNING FUNCTIONS********************************/

//does the actual work of learning new examples
// Folds the pending examples in blck->newXy into the SVM solution:
//  1) updates the per-class document/feature counters from newXy,
//  2) calls svm_solve() to (re)compute the decision boundary,
//  3) reclassifies every old example, promoting low-margin rows into the
//     support-vector set (and optionally microgrooming the rest),
//  4) moves the surviving newXy rows into oldXy and marks the block solved.
// On unrecoverable conditions (no examples at all, solver failure) the block
// is freed and reinitialized via nonfatalerror, leaving it empty.
static void crm_svm_learn_new_examples(crm_svm_block *blck, int microgroom) {
  int i;
  int inc = 0, offset = 0, n_ex = 0, lim;
  double d;
  PreciseSparseElement *thetaval = NULL;
  VectorIterator vit;
  Vector *row;
  if (!blck->newXy && !blck->sol) {
    nonfatalerror ("There are no examples for an SVM to learn on in the file you have supplied. Note that supplying a non-empty but incorrectly formatted file can cause this warning.", "");
    //reset the block
    crm_svm_block_free_data(*blck);
    crm_svm_block_init(blck);
    return;
  }
  //update n0, n1, n0f, n1f
  // The first element of each row encodes the label: negative => class 1
  // (the CRM_REFUTE / -1 class), otherwise class 0.  Feature counts exclude
  // the constant column when SVM_ADD_CONSTANT is set.
  if (blck->newXy) {
    for (i = 0; i < blck->newXy->rows; i++) {
      row = matr_get_row(blck->newXy, i);
      if (!row) {
        //this would be weird
        continue;
      }
      vectorit_set_at_beg(&vit, row);
      if (!vectorit_past_end(vit, row)) {
        if (vectorit_curr_val(vit, row) < 0) {
          //a new example for class 1
          blck->n1++;
          blck->n1f += row->nz;
          if (SVM_ADD_CONSTANT) {
            blck->n1f--;
          }
        } else {
          blck->n0++;
          blck->n0f += row->nz;
          if (SVM_ADD_CONSTANT) {
            blck->n0f--;
          }
        }
      }
    }
  }
  //actually learn something!
  if (svm_trace) {
    fprintf(stderr, "Calling SVM solve.\n");
  }
  svm_solve(&(blck->newXy), &(blck->sol));
  if (!blck->sol || !blck->sol->theta) {
    nonfatalerror("Unable to solve SVM. This is likely due to a corrupted SVM statistics file.", "");
    crm_svm_block_free_data(*blck);
    crm_svm_block_init(blck);
    return;
  }
  if (svm_trace) {
    fprintf(stderr, "Reclassifying all old examples to find extra support vectors.\n");
  }
  if (blck->oldXy) {
    n_ex += blck->oldXy->rows;
    if (microgroom && blck->oldXy->rows >= SVM_GROOM_OLD) {
      // scratch array of (row index, margin) pairs for the groom sort below
      thetaval = (PreciseSparseElement *) malloc(sizeof(PreciseSparseElement)*blck->oldXy->rows);
    }
    //check the classification of everything in oldXy
    //put anything not classified with high enough margin into sol->SV
    // `offset` counts rows already erased from oldXy, so `i - offset` is the
    // current index of the i-th original row as the matrix shrinks in place.
    lim = blck->oldXy->rows;
    for (i = 0; i < lim; i++) {
      row = matr_get_row(blck->oldXy, i - offset);
      if (!row) {
        continue;
      }
      d = dot(blck->sol->theta, row);
      if (d <= 0) {
        // wrong side of the boundary => misclassified old example
        inc++;
      }
      if (d <= 1+SV_TOLERANCE) {
        // inside the margin: promote to support vector (shallow copy - the
        // row's storage is shared, hence the erase rather than free)
        matr_shallow_row_copy(blck->sol->SV, blck->sol->SV->rows, row);
        matr_erase_row(blck->oldXy, i - offset);
        offset++;
      } else if (thetaval) {
        // comfortably classified: remember its margin for microgrooming
        thetaval[i-offset].col = i - offset;
        thetaval[i-offset].data = d;
      }
    }
    if (thetaval && blck->oldXy->rows >= SVM_GROOM_OLD) {
      //microgroom
      if (svm_trace) {
        fprintf(stderr, "Microgrooming...\n");
      }
      // sort by margin, keep the SVM_GROOM_FRAC closest to the boundary,
      // then re-sort the doomed tail by row index so removals are stable
      qsort(thetaval, blck->oldXy->rows, sizeof(PreciseSparseElement), precise_sparse_element_val_compare);
      //take the top SVM_GROOM_FRAC of this
      qsort(&(thetaval[(int)(blck->oldXy->rows*SVM_GROOM_FRAC)]), blck->oldXy->rows - (int)(blck->oldXy->rows*SVM_GROOM_FRAC), sizeof(PreciseSparseElement), precise_sparse_element_col_compare);
      lim = blck->oldXy->rows;
      for (i = (int)(blck->oldXy->rows*SVM_GROOM_FRAC); i < lim; i++) {
        matr_remove_row(blck->oldXy, thetaval[i].col);
      }
    }
    if (thetaval) {
      free(thetaval);
    }
    if (!blck->oldXy->rows) {
      matr_free(blck->oldXy);
      blck->oldXy = NULL;
    }
  }
  if (svm_trace) {
    fprintf(stderr, "Of %d old training examples, we got %d incorrect. There are now %d support vectors (we added %d).\n", n_ex, inc, blck->sol->SV->rows, offset);
  }
  //if we have any vectors that weren't support vectors
  //they are now stored in newXy.
  //so copy newXy into oldXy
  if (blck->newXy) {
    matr_append_matr(&(blck->oldXy), blck->newXy);
    matr_free(blck->newXy);
    blck->newXy = NULL;
  }
  //update the counts we keep of the number of rows
  //of oldXy (mostly so we know whether it exists)
  if (blck->oldXy) {
    blck->n_old = blck->oldXy->rows;
  } else {
    blck->n_old = 0;
  }
  //we've solved it! so we have a solution
  blck->has_solution = 1;
}

/******************************************************************************
 *Use an SVM to learn a classification task.
 *This expects two classes: a class with a +1 label and a class with
 *a -1 label.  These are denoted by the presence or absence of the
 *CRM_REFUTE label (see the FLAGS section of the comment).
 *For an overview of how the algorithm works, look at the comments in
 *crm_svm_lib_fncts.c.
 *
 *INPUT: This function is for use with CRM 114 so it takes the
 * canonical arguments:
 * csl: The control block.  Never actually used.
 * apb: The argparse block.  This is passed to vector_tokenize_selector
 *  and I use the flags (see the FLAG section).
 * txtptr: A pointer to the text to classify.
 * txtstart: The text to classify starts at txtptr+txtstart
 * txtlen: number of characters to classify
 *
 *OUTPUT: 0 on success
 *
 *FLAGS: The SVM calls crm_vector_tokenize_selector so uses any flags
 * that that function uses.  For learning, it interprets flags as
 * follows:
 *
 * CRM_REFUTE: If present, this indicates that this text has a -1
 *  label and should be classified as such.  If absent, indicates
 *  that this text has a +1 label.
 *
 * CRM_UNIQUE: If present, CRM_UNIQUE indicates that we should ignore
 *  the number of times we see a feature.  With CRM_UNIQUE, feature
 *  vectors are binary - a 1 in a column indicates that a feature
 *  with that column number was seen once or more.
Without it, features * are integer valued - a number in a column indicates the number of * times that feature was seen in the document. * * CRM_MICROGROOM: If there are more than SVM_GROOM_OLD (defined in * crm114_config.h) examples that we have learned on but are * not support vectors, CRM_MICROGROOM will remove the SVM_GROOM_FRAC * (defined in crm114_config.h) of them furthest from the decision * boundary. CRM_MICROGROOM ONLY runs AFTER an actual learn - ie * we will never microgroom during an APPEND. In fact, PASSING IN * MICROGROOM WITH APPEND DOES NOTHING. Also note that the effects * of microgrooming are not obvious until the next time the file is * written using fwrite. This will actually happen the next time enough * vectors are added * * CRM_APPEND: The example will be added to the set of examples but * not yet learned on. We will learn on this example the next time * a learn without APPEND or ERASE is called or if classify is called. * If you call learn with CRM_APPEND an actual learn will NEVER happen. * All calls to learn with CRM_APPEND will execute very quickly. * * CRM_FROMSTART: Relearn on every seen (and not microgroomed away) example * instead of using an incremental method. If CRM_FROMSTART and * CRM_APPEND are both flagged, the FROMSTART learn will be done the * next time there is a learn without APPEND or ERASE or a classify. If * examples are passed in using CRM_APPEND after CRM_FROMSTART, we will * also learn those examples whenever we do the FROMSTART learn. * * CRM_ERASE: Erases the example from the example set. If this * example was just appended and never learned on or if it is not * in the support vector set of the last solution, this simply erases * the example from the set of examples. If the example is a support * vector, we relearn everything from the start including any new examples * that were passed in using CRM_APPEND and haven't been learned on.
If * CRM_ERASE and CRM_APPEND are passed in together and a relearn is required, * the relearn is done the next time learn is called without APPEND or ERASE * or a classify is called. * * ALL FLAGS NOT LISTED HERE OR USED IN THE VECTOR_TOKENIZER ARE IGNORED. * *WHEN WE LEARN: * * The various flags can seem to interact bizarrely to govern whether a * learn actually happens, but, in fact, everything follows three basic rules: * * 1) WE NEVER LEARN ON CRM_APPEND. * 2) IF WE LEARN, WE LEARN ON ALL EXAMPLES PRESENT. * 3) WHEN ERASING, WE DO EXACTLY AS MUCH WORK IS REQUIRED TO ERASE THE * EXAMPLE AND NO MORE EXCEPT WHERE THIS CONFLICTS WITH THE FIRST 2 RULES. * * Therefore, rule 2 says that a FROMSTART, for example, will learn on both * old and new examples. Likewise rule 2 states that an ERASE that requires * a relearn, will learn on both old and new examples. An ERASE that DOESN'T * require a relearn, however, is governed by rule 3 and therefore * will NOT run a learn on new examples because that is NOT necessary to * erase the example. Rule 1 ensures that passing in CRM_MICROGROOM with * CRM_APPEND does nothing because we only MICROGROOM after a learn and we * NEVER learn on CRM_APPEND. Etc. * *FORCING A LEARN: * * You can force a learn by passing in a NULL txtptr or a txtlen of 0. * This will call the svm learn functions EVEN IF there are no new * examples. If the SVM is incorrectly classifying examples it has * already seen, forcing a relearn will fix that problem. 
 *****************************************************************************/
// Entry point for the LEARN statement with the <svm> classifier.
// Parses the statistics-file name from the first pattern argument, then,
// driven by apb->sflags, either erases an example (CRM_ERASE), appends it
// for later (CRM_APPEND), forces a full relearn (CRM_FROMSTART), or runs an
// incremental learn via crm_svm_learn_new_examples().  A NULL txtptr or
// txtlen of 0 forces a learn with no new example.  Returns 0 on success.
int crm_svm_learn(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) {
  char htext[MAX_PATTERN], filename[MAX_PATTERN];
  long i, j;
  unsigned int features[MAX_SVM_FEATURES];
  crm_svm_block blck;
  size_t unused;           // return of save_changes, deliberately ignored
  Vector *nex, *row;
  int read_file = 0,       // nonzero once the stat file is mapped in
      do_learn = 1,        // cleared by APPEND / ERASE-only / smart mode
      lim = 0;
  void *addr = NULL;       // mmap address of the stat file, if mapped
  if (user_trace) {
    svm_trace = 1;
  }
  if (internal_trace) {
    //this is a "mediumly verbose" setting
    svm_trace = SVM_INTERNAL_TRACE_LEVEL + 1;
  }
  SVM_DEBUG_MODE = svm_trace - 1;
  if (SVM_DEBUG_MODE < 0) {
    SVM_DEBUG_MODE = 0;
  }
  if (svm_trace) {
    fprintf(stderr, "Doing an SVM learn.\n");
  }
  //Get the filename
  //crm_stmt_parser.c
  crm_get_pgm_arg(htext, MAX_PATTERN, apb->p1start, apb->p1len);
  crm_nexpandvar(htext, apb->p1len, MAX_PATTERN);
  // trim: skip chars below 0x21 (space/control), then cut at the first one
  i = 0;
  while (htext[i] < 0x021) i++;
  j = i;
  while (htext[j] >= 0x021) j++;
  htext[j] = '\000';
  strcpy (filename, &htext[i]);
  //set things to NULL that should be null
  crm_svm_block_init(&blck);
  if (txtptr && txtlen > 0) {
    //get the new example
    nex = convert_document(txtptr+txtstart, txtlen, features, apb);
    if (apb->sflags & CRM_ERASE) {
      //find this example and remove all instances of it
      //then do a FROMSTART unless we haven't learned on this
      //example yet
      //requires reading in the whole file
      //load our stat file in
      if (!(addr = map_svm_file(&blck, filename))) {
        nonfatalerror("An error occurred trying to map in the file. Likely it is corrupted. Your vector will not be erased. The file is", filename);
      } else {
        read_file = 1;
      }
      do_learn = 0; //we are erasing, not learning
      // Pass 1: the support-vector set.  Removing a support vector
      // invalidates the solution, so force a FROMSTART relearn.
      // `j` counts removals so `i - j` tracks the shrinking matrix.
      if (blck.sol && blck.sol->SV) {
        j = 0;
        lim = blck.sol->SV->rows;
        for (i = 0; i < lim; i++) {
          row = matr_get_row(blck.sol->SV, i-j);
          if (!row) {
            continue;
          }
          if (vector_equals(nex, row)) {
            //support vector
            //have to start over
            do_learn = 1;
            if (!(apb->sflags & CRM_FROMSTART)) {
              apb->sflags = apb->sflags | CRM_FROMSTART;
            }
            matr_remove_row(blck.sol->SV, i-j);
            // roll back this example's contribution to the class counters
            if (vector_get(nex, 0) < 0) {
              blck.n1--;
              blck.n1f -= nex->nz;
              if (SVM_ADD_CONSTANT) {
                blck.n1f++;
              }
            } else {
              blck.n0--;
              blck.n0f -= nex->nz;
              if (SVM_ADD_CONSTANT) {
                blck.n0f++;
              }
            }
            j++;
          }
        }
      }
      // Pass 2: previously-learned non-support examples - plain removal.
      if (blck.oldXy) {
        j = 0;
        lim = blck.oldXy->rows;
        for (i = 0; i < lim; i++) {
          row = matr_get_row(blck.oldXy, i-j);
          if (!row) {
            continue;
          }
          if (vector_equals(nex, row)) {
            matr_remove_row(blck.oldXy, i-j);
            j++;
            if (vector_get(nex, 0) < 0) {
              blck.n1--;
              blck.n1f -= nex->nz;
              if (SVM_ADD_CONSTANT) {
                blck.n1f++;
              }
            } else {
              blck.n0--;
              blck.n0f -= nex->nz;
              if (SVM_ADD_CONSTANT) {
                blck.n0f++;
              }
            }
          }
        }
      }
      // Pass 3: appended-but-never-learned examples (no counters to fix -
      // counters are only updated when an example is actually learned on).
      if (blck.newXy) {
        j = 0;
        lim = blck.newXy->rows;
        for (i = 0; i < lim; i++) {
          row = matr_get_row(blck.newXy, i-j);
          if (!row) {
            continue;
          }
          if (vector_equals(nex, row)) {
            matr_remove_row(blck.newXy, i-j);
            j++;
          }
        }
      }
      vector_free(nex);
    } else {
      //add the vector to the new matrix
      append_vector_to_svm_file(nex, filename);
    }
  }
  if (apb->sflags & CRM_FROMSTART) {
    do_learn = 1;
    if (!read_file) {
      if (!(addr = map_svm_file(&blck, filename))) {
        nonfatalerror("An error occurred trying to map in the file. Likely it is corrupted. The fromstart learn will have no effect. The file is", filename);
      } else {
        read_file = 1;
      }
    }
    //copy oldXy into newXy
    if (blck.oldXy) {
      matr_append_matr(&(blck.newXy), blck.oldXy);
      matr_free(blck.oldXy);
      blck.oldXy = NULL;
      blck.n_old = 0;
    }
    //copy the support vectors into newXy
    if (blck.sol) {
      matr_append_matr(&(blck.newXy), blck.sol->SV);
      svm_free_solution(blck.sol);
      blck.sol = NULL;
    }
    // counters will be recomputed from scratch during the relearn
    blck.n0 = 0;
    blck.n1 = 0;
    blck.n0f = 0;
    blck.n1f = 0;
  }
  if (!(apb->sflags & CRM_APPEND) && do_learn) {
    if (!read_file) {
      if (!(addr = map_svm_file(&blck, filename))) {
        nonfatalerror("An error occurred trying to map in the file. Either it is corrupted or the only string you have learned on so far is the empty string. Note that the SVM needs at least one non-empty example to initialize its file. Whatever is going on, your learn will have no effect. The file is", filename);
        do_learn = 0;
      } else {
        read_file = 1;
      }
    }
    //do we actually want to do this learn?
    //let's consult smart mode
    if (read_file && svm_smart_mode) {
      //wait until we have a good base of examples to learn
      if (!blck.has_solution && (!blck.newXy || blck.newXy->rows < SVM_BASE_EXAMPLES)) {
        if (svm_trace) {
          fprintf(stderr, "Running under smart_mode: postponing learn until we have enough examples.\n");
        }
        do_learn = 0;
      }
      //if we have more than SVM_INCR_FRAC examples we haven't yet
      //learned on, do a fromstart
      if (blck.sol && blck.sol->SV && blck.oldXy && blck.newXy && blck.newXy->rows >= SVM_INCR_FRAC*(blck.oldXy->rows + blck.sol->SV->rows)) {
        if (svm_trace) {
          fprintf(stderr, "Running under smart_mode: Doing a fromstart to incorporate new examples.\n");
        }
        matr_append_matr(&(blck.newXy), blck.oldXy);
        matr_free(blck.oldXy);
        blck.oldXy = NULL;
        blck.n_old = 0;
      }
    }
    if (do_learn) {
      crm_svm_learn_new_examples(&blck, apb->sflags & CRM_MICROGROOM);
    }
  }
  if (read_file) {
    //we did something to it!
    //save it
    unused = crm_svm_save_changes(&blck, addr, filename);
  }
  //free everything
  crm_svm_block_free_data(blck);
  return 0;
}

/****************************CLASSIFICATION FUNCTIONS*************************/

/******************************************************************************
 *Use an SVM for a classification task.
 *This expects two classes: a class with a +1 label and a class with
 *a -1 label.  The class with the +1 label is class 0 and the class
 *with the -1 label is class 1.  When learning, class 1 is denoted by
 *passing in the CRM_REFUTE flag.  The classify is considered to FAIL
 *if the example classifies as class 1 (-1 label).  The SVM requires
 *at least one example to do any classification, although really you should
 *give it at least one from each class.  If classify is called without any
 *examples to learn on at all, it will classify the example as class 0, but
 *it will also print out an error.
 *
 *If classify is called and there are new examples that haven't been learned
 *on or a FROMSTART learn that hasn't been done, this function will do that
 *BEFORE classifying.  In other words:
 *
 *CLASSIFY WILL DO A LEARN BEFORE CLASSIFYING IF NECESSARY.  IT WILL NOT STORE
 *THAT LEARN BECAUSE IT HAS NO WRITE PRIVILEGES TO THE FILE.
 *
 *INPUT: This function is for use with CRM 114 so it takes the
 * canonical arguments:
 * csl: The control block.  Used to skip if classify fails.
 * apb: The argparse block.  This is passed to vector_tokenize_selector
 *  and I use the flags (see the FLAG section).
 * txtptr: A pointer to the text to classify.
* txtstart: The text to classify starts at txtptr+txtstart * txtlen: number of characters to classify * *OUTPUT: return is 0 on success * The text output (stored in out_var) is formatted as follows: * * LINE 1: CLASSIFY succeeds/fails success probability: # pR: # * (note that success probability is high for success and low for failure) * LINE 2: Best match to class #0/1 probability: # pR: # * (probability >= 0.5 since this is the best matching class.) * LINE 3: Total features in input file: # * LINE 4: #0 (label +1): documents: #, features: #, prob: #, pR # * LINE 5: #1 (label -1): documents: #, features: #, prob: #, pR # * (prob is high for match class, low else. pR is positive for match class.) * * I've picked a random method for calculating probability and pR. Thinking * about it, there may be literature for figuring out the probability at * least. Anyone who wants to do that, be my guest. For now, I've found * a function that stays between 0 and 1 and called it good. Specifically, * if theta is the decision boundary and x is the example to classify: * * prob(class = 0) = 0.5 + 0.5*tanh(theta dot x) * pR = sgn(theta dot x)*(pow(11, fabs(theta dot x)) - 1) * *FLAGS: The SVM calls crm_vector_tokenize_selector so uses any flags * that that function uses. For classifying, it interprets flags as * follows: * * CRM_REFUTE: Returns the OPPOSITE CLASS. In other words, if this should * classify as class 1, it now classifies as class 0. I don't know why * you would want to do this, but you should be aware it happens. * * CRM_UNIQUE: If present, CRM_UNIQUE indicates that we should ignore * the number of times we see a feature. With CRM_UNIQUE, feature * vectors are binary - a 1 in a column indicates that a feature * with that column number was seen once or more. Without it, features * are integer valued - a number in a column indicates the number of * times that feature was seen in the document. If you used CRM_UNIQUE * to learn, use CRM_UNIQUE to classify! 
 * (duh)
 *
 * CRM_MICROGROOM: If classify does a learn, it will MICROGROOM.  See the
 *  comment to learn for how microgroom works.
 *
 * ALL FLAGS NOT LISTED HERE OR USED IN THE VECTOR_TOKENIZER ARE IGNORED.
 * INCLUDING FLAGS USED FOR LEARN!
 *****************************************************************************/
// Entry point for the CLASSIFY statement with the <svm> classifier.
// Loads (and, if needed, re-solves in memory) the decision boundary, dots it
// with the tokenized input, and writes the formatted result into the p2
// output variable.  A negative dot product means class 1 (-1 label), which
// triggers the statement's FAIL branch via csl.  Returns 0 always.
int crm_svm_classify(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) {
  char htext[MAX_PATTERN], filename[MAX_PATTERN], out_var[MAX_PATTERN];
  long i, j, out_var_len = 0;
  unsigned int features[MAX_SVM_FEATURES], out_pos = 0;
  Vector *nex, *theta;        // nex = tokenized input, theta = boundary
  double dottheta = 0;        // theta . nex: sign picks the class
  int class, sgn, nz;
  crm_svm_block blck;
  void *addr = NULL;          // mmap address when theta is read file-direct
  if (user_trace) {
    svm_trace = 1;
  }
  if (internal_trace) {
    //this is a "mediumly verbose" setting
    svm_trace = SVM_INTERNAL_TRACE_LEVEL + 1;
  }
  SVM_DEBUG_MODE = svm_trace - 1;
  if (SVM_DEBUG_MODE < 0) {
    SVM_DEBUG_MODE = 0;
  }
  if (svm_trace) {
    fprintf(stderr, "Doing an SVM classify.\n");
  }
  crm_svm_block_init(&blck);
  //Get the filename (we only have one)
  //crm_stmt_parser.c
  crm_get_pgm_arg(htext, MAX_PATTERN, apb->p1start, apb->p1len);
  crm_nexpandvar(htext, apb->p1len, MAX_PATTERN);
  // trim: skip chars below 0x21 (space/control), then cut at the first one
  i = 0;
  while (htext[i] < 0x021) i++;
  j = i;
  while (htext[j] >= 0x021) j++;
  htext[j] = '\000';
  strcpy (filename, &htext[i]);
  //Get the output variable name
  if (apb->p2start) {
    crm_get_pgm_arg(out_var, MAX_PATTERN, apb->p2start, apb->p2len);
    out_var_len = crm_nexpandvar(out_var, apb->p2len, MAX_PATTERN);
  }
  //do we have new vectors to learn on?
  if (has_new_vectors(filename)) {
    //we use read so that we don't make changes to the file
    //also doing a learn when you can't benefit from it is stupid
    //so we don't do that in smart mode
    if (!svm_smart_mode && read_svm_file(&blck, filename)) {
      crm_svm_learn_new_examples(&blck, 0);
    }
    if (blck.sol) {
      theta = blck.sol->theta;
    } else {
      crm_svm_block_free_data(blck);
      crm_svm_block_init(&blck);
      theta = NULL;
    }
  } else {
    // no pending examples: pull just the counters and theta from the file
    svm_get_meta_data(filename, &blck);
    theta = get_theta_from_svm_file(filename, &addr);
  }
  //get the new example
  nex = convert_document(txtptr+txtstart, txtlen, features, apb);
  //classify it
  if (theta) {
    dottheta = dot(nex, theta);
    // theta is owned either by blck.sol (free the block) or by us (free it)
    if (blck.sol) {
      crm_svm_block_free_data(blck);
    } else {
      vector_free(theta);
    }
  } else {
    if (!svm_smart_mode) {
      nonfatalerror ("Nothing was learned before asking SVM for a classification. I am trying to classify from the file", filename);
    }
    dottheta = 0;
  }
  if (addr) {
    crm_munmap_file(addr);
  }
  if (svm_trace) {
    fprintf(stderr, "The dot product of the example and decision boundary is %lf\n", dottheta);
  }
  if (dottheta < 0) {
    class = 1;
    sgn = -1;
  } else {
    class = 0;
    sgn = 1;
  }
  // clamp |dottheta| so pR = sgn*(11^|dottheta| - 1) stays within +/- 10^6
  if (fabs(dottheta) > 6/log10(11)) {
    nonfatalerror("The pR values here are HUGE. One fix for this is to redo things with the unique flag set. This is especially true if you are also using the string flag.", "");
    dottheta = sgn*6/log10(11);
  }
  if (apb->p2start) {
    //annnnnnd... write it all back out
    if (!class) {
      //these are very arbitrary units of measurement
      //i picked tanh because... it's a function with a middle at 0
      //and nice asymptotic properties near 1
      //yay!
      out_pos += sprintf (outbuf + out_pos, "CLASSIFY succeeds");
    } else {
      out_pos += sprintf(outbuf + out_pos, "CLASSIFY fails");
    }
    out_pos += sprintf(outbuf + out_pos, " success probability: %f pR: %6.4f\n", 0.5 + 0.5*tanh(dottheta), sgn*(pow(11, fabs(dottheta))-1));
    out_pos += sprintf(outbuf + out_pos, "Best match to class #%d prob: %6.4f pR: %6.4f \n", class, 0.5 + 0.5*tanh(fabs(dottheta)), pow(11, fabs(dottheta))-1);
    // feature count excludes the constant column when SVM_ADD_CONSTANT is set
    nz = nex->nz;
    if (SVM_ADD_CONSTANT) {
      nz--;
    }
    out_pos += sprintf(outbuf + out_pos, "Total features in input file: %d\n", nz);
    out_pos += sprintf (outbuf + out_pos, "#0 (label +1): documents: %d, features: %d, prob: %3.2e, pR: %6.2f\n", blck.n0, blck.n0f, 0.5 + 0.5*tanh(dottheta), sgn*(pow(11, fabs(dottheta)) - 1));
    out_pos += sprintf (outbuf + out_pos, "#1 (label -1): documents: %d, features: %d, prob: %3.2e, pR: %6.2f\n", blck.n1, blck.n1f, 0.5 - 0.5*tanh(dottheta), -1*sgn*(pow(11, fabs(dottheta))-1));
    if (out_var_len) {
      crm_destructive_alter_nvariable(out_var, out_var_len, outbuf, out_pos);
    }
  }
  vector_free(nex);
  if (class) {
    //classifies out-of-class
    // jump to the statement's FAIL target, as the LEARN/CLASSIFY
    // statement convention requires
    csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
    csl->aliusstk[csl->mct[csl->cstmt]->nest_level] = -1;
    return 0;
  }
  return 0;
}

/****************************SAMPLE MAINS*************************************/

//#define MAKE_PREC_RECALL_GRAPHS
#ifdef MAKE_PREC_RECALL_GRAPHS

// the command line argv
char **prog_argv;

// the auxiliary input buffer (for WINDOW input)
char *newinputbuf;

// the globals used when we need a big buffer - allocated once, used
// wherever needed.  These are sized to the same size as the data window.
char *inbuf; char *outbuf; char *tempbuf; //classify spam //this generates svm_time.txt and svm_err.txt int main(int argc, char **argv) { //CSL_CELL csl; ARGPARSE_BLOCK apb; char *txtptr = NULL; long txtstart, txtlen; FILE *fp, *slfp, *hlfp; size_t size; Matrix *X; char *gdir, *glist, *bdir, buf[256]; crm_svm_block blck; int i = 0, start, read_good = 0, ng = 0, nb = 0, label, curr_pt = 0; unsigned int features[MAX_SVM_FEATURES]; Vector *v; double d; int errpts[20], deth = 0, deta = 0, ah = 0; FILE *err_file = fopen("svm_err.txt", "w"); csl = (CSL_CELL *)malloc(sizeof(CSL_CELL)); apb.p1start = (char *)malloc(sizeof(char)*MAX_PATTERN); strcpy(apb.p1start, ""); apb.p1len = strlen(apb.p1start); apb.a1start = buf; apb.a1len = 0; apb.p2start = NULL; apb.p2len = 0; apb.p3start = buf; apb.p3len = 0; apb.b1start = buf; apb.b1len = 0; apb.s1start = buf; apb.s1len = 0; apb.s2start = buf; apb.s2len = 0; gdir = argv[1]; bdir = argv[2]; unlink(apb.p1start); data_window_size = DEFAULT_DATA_WINDOW; printf("data_window_size = %d\n", data_window_size); outbuf = (char *)malloc(sizeof(char)*data_window_size); prog_argv = (char **)malloc(sizeof(char *)); prog_argv[0] = (char *)malloc(sizeof(char)*MAX_PATTERN); //list of files in ham folder strcpy(buf, gdir); start = strlen(buf); strcpy(&(buf[start]), "/list.txt\0"); start = strlen(buf); printf("start = %d\n", start); hlfp = fopen(buf, "r"); //list of files in spam folder strcpy(buf, bdir); start = strlen(buf); strcpy(&(buf[start]), "/list.txt\0"); start = strlen(buf); slfp = fopen(buf, "r"); crm_svm_block_init(&blck); i = 0; while (fscanf(hlfp, "%s", buf) != EOF) { ng++; } while (fscanf(slfp, "%s", buf) != EOF) { nb++; } printf("ng = %d, nb = %d\n", ng, nb); errpts[0] = 125; curr_pt = 0; while (errpts[curr_pt] < nb + ng) { errpts[curr_pt+1] = 2*errpts[curr_pt]; curr_pt++; } errpts[curr_pt-1] = nb + ng; curr_pt = 0; rewind(hlfp); rewind(slfp); while (!feof(hlfp) || !feof(slfp)) { v = NULL; if ((read_good && !feof(hlfp)) || feof(slfp)) { 
ah++; strcpy(buf, gdir); start = strlen(buf); strcpy(&buf[start], "/"); start = strlen(buf); if (fscanf(hlfp, "%s", &(buf[start])) == EOF) { continue; } read_good++; if (read_good >= ng/nb + 1) { read_good = 0; } apb.sflags = CRM_UNIQUE; label = 1; } else if (!feof(slfp)) { strcpy(buf, bdir); start = strlen(buf); strcpy(&buf[start], "/"); start = strlen(buf); if (fscanf(slfp, "%s", &(buf[start])) == EOF) { continue; } start = strlen(buf); apb.sflags = CRM_REFUTE | CRM_UNIQUE; read_good = 1; label = -1; } printf("Reading %s i = %d\n", buf, i); fp = fopen(buf, "r"); fseek(fp, 0, SEEK_END); size = ftell(fp); rewind(fp); txtptr = (char *)realloc(txtptr, size); size = fread(txtptr, 1, size, fp); fclose(fp); //do a classify //d = crm_svm_classify(csl, &apb, txtptr, 0, size); v = convert_document(txtptr, size, features, &apb); if (blck.sol) { d = dot(blck.sol->theta, v); } else { d = 0; } printf("d = %f\n", d); if (d < 1) { //do a learn //crm_svm_learn(csl, &apb, txtptr, 0, size); if (!blck.newXy) { blck.newXy = matr_make_size(1, v->dim, v->type, v->compact, v->size); } matr_shallow_row_copy(blck.newXy, blck.newXy->rows-1, v); crm_svm_learn_new_examples(&blck, 0); } if (d > 0 && label > 0) { //a correct ham detection! 
deth++; deta++; } //could be less than or equal if d is actual dot //right now it is return from classify if (d < 0 && label < 0) { //an incorrect ham detection deta++; } i++; if (i == errpts[curr_pt]) { //record this fprintf(err_file, "%d %d %d %d\n", i, deth, deta, ah); deth = 0; deta = 0; ah = 0; curr_pt++; } } fclose(hlfp); fclose(slfp); fclose(err_file); free(outbuf); free(csl); free(apb.p1start); free(txtptr); crm_svm_block_free_data(blck); return 0; } #endif //#define SVM_NON_TEXT #ifdef SVM_NON_TEXT //and yet another main to test taking non-text data int main(int argc, char **argv) { Vector *theta; int currarg = 1, i; char *opt; FILE *thout = NULL; SVM_Solution *sol = NULL; Matrix *Xy; if (argc < 2) { fprintf(stderr, "Usage: linear_svm [options] example_file [solution_file].\n"); exit(1); } opt = argv[currarg]; DEBUG_MODE = 0; while (currarg < argc && opt[0] == '-') { switch(opt[1]) { case 'v': DEBUG_MODE = atoi(&(opt[2])); break; case 't': currarg++; thout = fopen(argv[currarg], "w"); if (!thout) { fprintf(stderr, "Bad theta output file name: %s. 
Writing to stdout.\n", argv[currarg]); thout = stdout; } break; case 'p': thout = stdout; break; case 's': currarg++; sol = read_solution(argv[currarg]); break; default: fprintf(stderr, "Options are:\n"); fprintf(stderr, "\t-v#: Verbosity level.\n"); fprintf(stderr, "\t-t filename: Theta ascii output file.\n"); fprintf(stderr, "\t-p filename: Print theta to screen.\n"); fprintf(stderr, "\t-s filename: Starting solution file.\n"); break; } currarg++; opt = argv[currarg]; } printf("DEBUG_MODE = %d\n", DEBUG_MODE); if (currarg >= argc) { fprintf(stderr, "Error: No input file or no output file.\n"); fprintf(stderr, "Usage: linear_svm [options] example_file [solution_file].\n"); if (thout != stdout) { fclose(thout); } exit(1); } Xy = matr_read_bin(argv[currarg]); currarg++; //if (sol) { //solve(NULL, &sol); //} else { solve(&Xy, &sol); //} matr_free(Xy); theta = sol->theta; if (thout == stdout) { //otherwise this just gets in the way fprintf(thout, "There are %d SVs\n", sol->SV->rows); fprintf(thout, "Solution using Cutting Planes is\n"); } if (thout) { vector_write_sp_fp(theta,thout); } if (currarg < argc) { //write out the solution write_solution(sol, argv[currarg]); } free_solution(sol); return 0; } #endif crm114-20100106-BlameMichelson.src/crm_compiler.c0000644000000000017500000005233011321154266017617 0ustar rootwsy// crm114_compiler.c - CRM114 microcompiler // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file // (but not the stmt table ) #define BASE_COMPILER_TABLE_HERE #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // Here's the real statement description table. // STMT_TABLE_TYPE stmt_table[] = { // // text internal nlen exec? 
min max min max min max flags // rep code slashargs parens boxes // {"\n", CRM_NOOP, 0, 0, 0, 0, 0, 0, 0, 0, 0}, { "#", CRM_NOOP, 1, 0, 0, 0, 0, 0, 0, 0, 0}, { "insert=", CRM_NOOP, 7, 0, 0, 0, 0, 0, 0, 0, 0}, { "noop", CRM_NOOP, 0, 0, 0, 0, 0, 0, 0, 0, 0}, { "exit", CRM_EXIT, 0, 1, 0, 0, 0, 1, 0, 0, 0}, { "{", CRM_OPENBRACKET, 0, 0, 0, 0, 0, 0, 0, 0, 0}, { "}", CRM_CLOSEBRACKET, 0, 0, 0, 0, 0, 0, 0, 0, 0}, { "goto", CRM_GOTO, 0, 0, 1, 1, 0, 0, 0, 0, 0}, { "match", CRM_MATCH, 0, 1, 1, 1, 0, 1, 0, 1, CRM_ABSENT | CRM_NOCASE | CRM_LITERAL | CRM_FROMSTART | CRM_FROMCURRENT | CRM_FROMNEXT | CRM_FROMEND | CRM_NEWEND | CRM_BACKWARDS | CRM_NOMULTILINE }, { "fail", CRM_FAIL, 0, 1, 0, 0, 0, 0, 0, 0, 0}, { "liaf", CRM_LIAF, 0, 1, 0, 0, 0, 0, 0, 0, 0}, { "accept", CRM_ACCEPT, 0, 1, 0, 0, 0, 0, 0, 0, 0}, { "trap", CRM_TRAP, 0, 1, 1, 1, 0, 1, 0, 0, 0}, { "fault", CRM_FAULT, 0, 1, 0, 1, 0, 0, 0, 0, 0}, { "output", CRM_OUTPUT, 0, 1, 0, 1, 0, 0, 0, 1, CRM_APPEND }, { "window", CRM_WINDOW, 0, 1, 0, 2, 0, 2, 0, 0, CRM_NOCASE | CRM_BYCHAR | CRM_BYEOF | CRM_EOFACCEPTS | CRM_EOFRETRY }, { "alter", CRM_ALTER, 0, 1, 1, 1, 1, 1, 0, 0, 0}, { "learn", CRM_LEARN, 0, 1, 1, 1, 1, 1, 0, 1, CRM_NOCASE | CRM_REFUTE | CRM_MICROGROOM | CRM_ERASE | CRM_APPEND }, { "classify",CRM_CLASSIFY, 0, 1, 1, 1, 1, 2, 0, 1, CRM_NOCASE }, { "isolate", CRM_ISOLATE, 0, 1, 0, 1, 1, 1, 0, 0, 0}, { "input", CRM_INPUT, 0, 1, 0, 0, 1, 1, 0, 1, CRM_BYLINE }, { "syscall", CRM_SYSCALL, 0, 1, 1, 1, 0, 3, 0, 0, CRM_KEEP | CRM_ASYNC }, { "hash", CRM_HASH, 0, 1, 1, 1, 1, 1, 0, 0, 0}, { "translate",CRM_TRANSLATE,0, 1, 0, 2, 0, 1, 0, 1, CRM_UNIQUE | CRM_LITERAL }, { "intersect",CRM_INTERSECT,0, 1, 0, 0, 1, 1, 1, 1, 0}, { "union", CRM_UNION, 0, 1, 0, 0, 1, 1, 1, 1, 0}, { "eval", CRM_EVAL, 0, 1, 1, 1, 1, 1, 0, 0, 0}, { "alius", CRM_ALIUS, 0, 1, 0, 0, 0, 0, 0, 0, 0}, { "call", CRM_CALL, 0, 1, 0, 0, 0, 0, 0, 0, 0}, { "routine", CRM_ROUTINE, 0, 1, 0, 0, 0, 0, 0, 0, 0}, { "return", CRM_RETURN, 0, 1, 0, 1, 0, 0, 0, 0, 0}, { "debug", 
CRM_DEBUG , 0, 0, 0, 0, 0, 0, 0, 0, 0}, { "clump", CRM_CLUMP, 0, 1, 0, 1, 1, 1, 0, 1, 0}, { "pmulc", CRM_PMULC, 0, 1, 0, 1, 0, 0, 0, 1, 0}, { "NoMoreStmts",CRM_UNIMPLEMENTED,0,0, 0, 0, 0, 0, 0, 0, 0} }; // Get a file into a memory buffer. We can either prep to execute // it, or use it as read-only data, or as read-write data. // int crm_load_csl (CSL_CELL *csl) { struct stat statbuf; // status buffer - for statting files int i; // open it first csl->filedes = -1; if (csl->rdwr) { csl->filedes = open (csl->filename, O_RDWR); } else { csl->filedes = open (csl->filename, O_RDONLY); }; if (csl->filedes < 0) { if (csl->filedes == ENAMETOOLONG) untrappableerror5 ("Couldn't open the source code file because the ", "filename is too long.", CRM_ENGINE_HERE); else untrappableerror5 ("Couldn't open the file: ", csl->filename, CRM_ENGINE_HERE ); }; if (internal_trace > 0) fprintf (stderr, "file open on file descriptor %ld \n", csl->filedes); // and stat the file descriptor fstat (csl->filedes, &statbuf); csl->nchars = statbuf.st_size; if (internal_trace > 0) fprintf (stderr, "file is %ld bytes\n", csl->nchars); if (csl->nchars + 2048 > max_pgmsize) untrappableerror5 ("Your program is too big. ", " You need to use smaller programs or the -P flag, ", CRM_ENGINE_HERE); // and read in the source file csl->filetext = (char *) malloc ( max_pgmsize * sizeof (char)); if (csl->filetext == NULL) untrappableerror5 ("malloc of the file text failed","", CRM_ENGINE_HERE ); if (internal_trace > 0) fprintf (stderr, "File text malloc'd at %lX \n", (long int) csl->filetext); // put in a newline at the beginning csl->filetext[0] = '\n'; // read the file in... i = read (csl->filedes, &(csl->filetext[1]), csl->nchars); // and put a cr and then a null at the end. 
i++; csl->filetext[i] = '\n'; i++; csl->filetext[i] = '\n'; i++; csl->filetext[i] = '\000'; csl->nchars = i; csl->hash = strnhash (csl->filetext, csl->nchars); if (user_trace) fprintf (stderr, "Hash of program: %X, length %ld bytes \n", csl->hash, csl->nchars ); return 0; } // The CRM114 microcompiler. It takes in a paritally completed // csl struct which is the program, the program length, and // returns a completed microcompile table for that particular // program. Side effect: it also sets some variables in the // variable hash table (a.k.a. the VHT) int crm_microcompiler ( CSL_CELL *csl, VHT_CELL ** vht ) { // *** This is the CRM114 microcompiler. It's a 5-pass // compiler, but each pass is really simple. // pass 1) count lines and allocate microcompile table // pass 2) run thru file, matching on first word in statement, setting // statement type code. If it's a label statement, also add // an entry to the variables hash table. If the statement // assigns a value, we _don't_ put the value itself into the // variable hash table until we actually execute the statement // pass 3) run thru file, setting bracket nesting level. If // bracket level ever goes below 0 or is nonzero at the end // of the file, issue a warning. // pass 4) run thru file, setting FAIL and LIAF targets // // HACK ALERT HACK ALERT HACK ALERT // // THIS WHOLE PARSER IS A PIECE OF JUNK. IT REALLY NEEDS TO BE // REDONE IN BISON. MAYBE FOR V 2.0? // // NOTE: this is redone to be table-driven; it's still a piece of // junk but it's _good_ junk. And it will allow us to do JITting // of programs (1 pass to get labels, then JIT each statement as we // encounter it the first time. This should make programs run // significantly faster as we never parse twice, and we only parse // the full statement if we are about to execute it. 
// // HACK ALERT HACK ALERT HACK ALERT // i, j, and k are random beat-upon-me longs long i, j, k; // a counter to use when iterating thru statements long stmtnum; // number of statements actually used long numstmts; // how many chars in this program long pgmlength; // pointer to the chars char *pgmtext; // how deep a nesting of brackets does this file have? long bracketlevel; // index of first char of this statement long sindex; // index of first nonblank character in the line? long nbindex; // length of the first nonblank string on the line? long nblength; // index of first character in the arguments to any particular statement long aindex; // length of this statement long slength; // have we seen an action statement yet? long seenaction ; // counters for looking through the statemt archetype table. long stab_idx; long stab_max; if (internal_trace > 0) fprintf (stderr, "Starting phase 1 of microcompile.\n"); seenaction = 0; pgmlength = csl->nchars; pgmtext = csl->filetext; // *** Microcompile phase 1 **** // *** Allocation of the microcompile table // get a line count j = 0; // j will be our line counter here. // preprocessor has already run; all statements have been // properly line-broken and we just count the '\n's. for (i = 0; i < pgmlength; i++) { if ( pgmtext[i] == '\n' ) j++; }; csl->nstmts = j; // now, allocate the microcompile table if (user_trace > 0) fprintf (stderr, "Program statements: %ld, program length %ld\n", j, pgmlength); csl->mct = (MCT_CELL **) malloc (sizeof (MCT_CELL * ) * (csl->nstmts + 10) ); if (!csl->mct) untrappableerror5 ("Couldn't malloc MCT table.\n" "This is where your compilation results go, " "so if we can't compile, we can't run. Sorry.","", CRM_ENGINE_HERE); if (internal_trace > 0) fprintf (stderr, "microcompile table at %lX\n", (long) csl->mct); // malloc all of the statement cells. 
for (i = 0; i < csl->nstmts + 10; i++) { csl->mct[i] = (MCT_CELL *) malloc (sizeof (MCT_CELL)); if (!csl->mct[i]) untrappableerror5 ( "Couldn't malloc MCT cell. This is very bad.\n","", CRM_ENGINE_HERE); }; // *** Microcompile phase 2 - set statement types // iterate through the statements, setting types. // i is our character counter // // HACK ALERT HACK ALERT HACK ALERT // // THIS WHOLE PARSER IS A PIECE OF JUNK. IT REALLY NEEDS TO BE // REDONE IN BISON. MAYBE FOR V 2.0? // // HACK ALERT HACK ALERT HACK ALERT if (internal_trace > 0) fprintf (stderr, "Starting phase 2 of microcompile.\n"); stmtnum = 0; sindex = 0; bracketlevel = 0; // #ifdef STAB_TEST // Since we don't know how big the stmt_table actually is, // we go through it once, looking for the "NoMoreStmts" statement, // with operation code of CRM_BOGUS. This tells us how many // entries there are; we also set up the namelens for the // statement types. // stab_idx = 0; while ( strncmp (stmt_table[stab_idx].stmt_name, "NoMoreStmts", strlen ("NoMoreStmts")) != 0) { if (stmt_table[stab_idx].namelen == 0) stmt_table[stab_idx].namelen = strlen (stmt_table[stab_idx].stmt_name); stab_idx++; }; stab_max = stab_idx; // // now the statement table should be set up. // #endif // STAB_TEST while (stmtnum <= csl->nstmts && sindex < pgmlength) { long stab_stmtcode; long stab_done; // the strcspan below will fail if there's an unescaped // semicolon embedded in a string (or, for that matter, an // explicit newline). Fortunately, the preprocessor fixes the // former and the latter is explicitly prohibited by the language. 
// slength = strcspn (&pgmtext[sindex], "\n"); // allocate and fill in the mct table entry for this line csl->mct[stmtnum]->hosttxt = pgmtext; csl->mct[stmtnum]->apb = NULL; csl->mct[stmtnum]->start = sindex; csl->mct[stmtnum+1]->start = sindex + slength + 1; csl->mct[stmtnum]->stmt_utime = 0; csl->mct[stmtnum]->stmt_stime = 0; csl->mct[stmtnum]->stmt_type = CRM_BOGUS; csl->mct[stmtnum]->nest_level = bracketlevel; csl->mct[stmtnum]->fail_index = 0; csl->mct[stmtnum]->liaf_index = 0; csl->mct[stmtnum]->stmt_break = 0; csl->cstmt = stmtnum; // skip nbindex to the first nonblank // GROT GROT GROT here we define nonblank as values > 0x21 // GROT GROT GROT which absolutely _sucks_ in terms of coding // GROT GROT GROT portability, but it's what we have. nbindex = sindex; while (pgmtext[nbindex] < 0x021 && nbindex < slength + sindex) nbindex++; // save up the first nonblank char: csl->mct[stmtnum]->fchar = nbindex; // and set up the start of arguments as well, they start at the first // nonblank after the first blank after the command... aindex = nbindex; while (pgmtext[aindex] > 0x021 && aindex < slength + sindex ) aindex++; nblength = aindex - nbindex ; while (pgmtext[aindex] < 0x021 && aindex < slength + sindex ) aindex++; csl->mct[stmtnum]->achar = aindex; // We can now sweep thru the statement archetype table from 0 // to stab_max and compare the strlens and strings themselves. // stab_done = 0; stab_stmtcode = 0; // Empty lines are noops. if (nblength == 0) { stab_done = 1; stab_stmtcode = CRM_NOOP; }; // Comment lines are also NOOPS if ( pgmtext[nbindex] == '#') { stab_done = 1; stab_stmtcode = CRM_NOOP; }; // :LABEL: lines get special treatment if ( pgmtext[nbindex] == ':' && pgmtext[nbindex + nblength - 1] == ':') { stab_done = 1; stab_stmtcode = CRM_LABEL; k = strcspn (&pgmtext[nbindex+1], ":"); crm_setvar ( NULL, -1, pgmtext, nbindex, k+2, NULL, 0, 0, stmtnum, 0); }; // INSERTs get special handling (NOOPed..) 
if ( strncasecmp ( &pgmtext[nbindex], "insert=", 7) == 0) { stab_done = 1; stab_stmtcode = CRM_NOOP; }; i = -1; // Now a last big loop for the rest of the stmts. while (! stab_done) { i++; if ( nblength == stmt_table[i].namelen && strncasecmp (&pgmtext[nbindex], stmt_table[i].stmt_name, nblength) == 0) { stab_done = 1; stab_stmtcode = stmt_table[i].stmt_code; // Deal with executable statements and WINDOW if (stab_stmtcode == CRM_WINDOW && !seenaction) csl->preload_window = 0; // and mark off the executable statements if (stmt_table[i].is_executable) seenaction = 1; }; if (i >= stab_max) stab_done = 1; }; // Fill in the MCT entry with what we've learned. // csl->mct [stmtnum] -> stmt_type = stab_stmtcode; if (stab_stmtcode == CRM_OPENBRACKET) bracketlevel++; if (stab_stmtcode == CRM_CLOSEBRACKET) { bracketlevel--; // hack - reset the bracketlevel here, as a bracket is "outside" // the nestlevel, not inside. csl->mct[stmtnum]->nest_level = bracketlevel; } if (0) // (internal_trace) { fprintf (stderr, "\nStmt %3ld type %2d ", stmtnum, csl->mct[stmtnum]->stmt_type); { long ic; for (ic = csl->mct[stmtnum ]->start; ic < csl->mct[stmtnum + 1]->start-1 ; ic++) fprintf (stderr, "%c", pgmtext[ic]); }; }; #ifdef STAB_TEST if (stab_stmtcode != csl->mct[stmtnum]-> stmt_type) { fprintf (stderr,"Darn! Microcompiler stab error (not your fault!)\n" "Please file a bug report if you can. The data is:\n"); fprintf (stderr, "Stab got %ld, Ifstats got %d, on line %ld with len %ld \n", stab_stmtcode, csl->mct[stmtnum]->stmt_type, stmtnum, nblength ); fprintf (stderr, "String was >>>"); fwrite ( &pgmtext[nbindex], 1, nblength, stderr); fprintf (stderr, "<<<\n\n"); }; #endif // STAB_TEST // check for bracket level underflow.... 
if (bracketlevel < 0) fatalerror5 (" Your program seems to achieve a negative nesting", "level, which is quite likely bogus.", CRM_ENGINE_HERE); // move on to the next statement - +1 to get past the \n sindex = sindex + slength + 1; stmtnum++; } numstmts = stmtnum - 1; // check to be sure that the brackets close! if (bracketlevel != 0) nonfatalerror5 ("\nDang! The curly braces don't match up!\n", "Check your source code. ", CRM_ENGINE_HERE); // Phase 3 of microcompiler- set FAIL and LIAF targets for each line // in the MCT. { long stack[MAX_BRACKETDEPTH]; long sdx ; if (internal_trace > 0) fprintf (stderr, "Starting phase 3 of microcompile.\n"); // set initial stack values sdx = 0; stack[sdx] = 0; // Work downwards first, assigning LIAF targets for (stmtnum = 0; stmtnum < numstmts; stmtnum++) { switch (csl->mct[stmtnum]-> stmt_type) { case CRM_OPENBRACKET: { // if we're open bracket, we're the new LIAF target, // but we ourselves LIAF to the previous open bracket. csl->mct[stmtnum]->liaf_index = stack[sdx]; sdx++; stack[sdx] = stmtnum; }; break; case CRM_CLOSEBRACKET: { // if we're a close bracket, we LIAF not to the current // open bracket, but to the one before it, so pop the // stack and LIAF there. sdx--; csl->mct[stmtnum]->liaf_index = stack [sdx]; }; break; default: { // Most statements use the current liaf csl->mct[stmtnum] -> liaf_index = stack[sdx]; } break; } } // Work upwards next, assigning the fail targets sdx = 0; stack[sdx] = numstmts+1; for (stmtnum = numstmts; stmtnum >= 0; stmtnum--) { switch (csl->mct[stmtnum]-> stmt_type) { case CRM_CLOSEBRACKET: { // if we're close bracket, we're the new FAIL target, // but we ourselves FAIL to the next close bracket csl->mct[stmtnum]->fail_index = stack[sdx]; sdx++; stack[sdx] = stmtnum; }; break; case CRM_OPENBRACKET: { // if we're an open bracket, we FAIL not to the current // CLOSE bracket, but to the one before it, so pop the // stack and FAIL there. 
sdx--; csl->mct[stmtnum]->fail_index = stack [sdx]; }; break; default: { // Most statements use the current liaf csl->mct[stmtnum] -> fail_index = stack[sdx]; } break; } } // Work upwards again, assigning the TRAP targets sdx = 0; stack[sdx] = numstmts+1; for (stmtnum = numstmts; stmtnum >= 0; stmtnum--) { switch (csl->mct[stmtnum]-> stmt_type) { case CRM_TRAP: { // if we're the TRAP statement, we change the TRAP target, // but we ourselves TRAP to the next TRAP statement csl->mct[stmtnum]->trap_index = stack[sdx]; stack[sdx] = stmtnum; }; break; case CRM_OPENBRACKET: { // if we're an open bracket, we trap not to the current // level's TRAP statement, but to the one before it, so pop the // stack and aim TRAP there. sdx--; csl->mct[stmtnum]->trap_index = stack [sdx]; }; break; case CRM_CLOSEBRACKET: { // if we're a close bracket, we keep our current trap target // but move down one level in the stack stack[sdx + 1] = stack [sdx]; sdx++; csl->mct[stmtnum]->trap_index = stack [sdx]; }; break; default: { // Most statements use the current TRAP level csl->mct[stmtnum] -> trap_index = stack[sdx]; } break; } } // print out statement info if desired if ( prettyprint_listing > 0 ) { for (stmtnum = 0; stmtnum <= numstmts; stmtnum++) { fprintf (stderr, "\n"); if (prettyprint_listing > 1) fprintf (stderr, "%4.4ld ", stmtnum); if (prettyprint_listing > 2) fprintf (stderr, "{%2.2d}", csl->mct[stmtnum]->nest_level); if (prettyprint_listing > 3) { fprintf (stderr, " <<%2.2d>>", csl->mct[stmtnum]->stmt_type); fprintf (stderr, " L%4.4d F%4.4d T%4.4d", csl->mct[stmtnum]->liaf_index, csl->mct[stmtnum]->fail_index, csl->mct[stmtnum]->trap_index); } if (prettyprint_listing > 1) fprintf (stderr, " : "); // space over two spaces per indent for (k = 0; k < csl->mct[stmtnum]->nest_level; k++) fprintf (stderr, " "); // print out text of the first statement: if (prettyprint_listing > 4) fprintf (stderr,"-"); k = csl->mct[stmtnum]->fchar; while (pgmtext[k] > 0x021 && k < 
csl->mct[stmtnum]->achar) { fprintf (stderr, "%c", pgmtext[k]); k++; }; if (prettyprint_listing > 4) fprintf (stderr,"-"); fprintf (stderr, " "); // and if there are args, print them out as well. if ( csl->mct[stmtnum]->achar < csl->mct[stmtnum+1]->start-1) { if (prettyprint_listing > 4) fprintf (stderr, "="); for (k = csl->mct[stmtnum]->achar; k< csl->mct[stmtnum+1]->start-1; k++) fprintf (stderr, "%c", pgmtext[k]); if (prettyprint_listing > 4) fprintf (stderr, "="); } } fprintf (stderr, "\n"); } // Finally got to the end. Fill in the last bits of the CSL // with the new information, and return. csl->nstmts = numstmts; if (internal_trace > 0) fprintf (stderr, "microcompile completed\n"); } return (0); } crm114-20100106-BlameMichelson.src/inoc_passwd.txt0000644000000000017500000000013711321154266020050 0ustar rootwsysomebody_you_trust spam their_secret_code somebody_you_trust nonspam their_nonspam_secret_code crm114-20100106-BlameMichelson.src/FAQ.txt0000644000000000017500000007400411321154266016152 0ustar rootwsy# # FAQ.txt - The CRM114 and CRM114 Mailfilter FAQ # # Copyright 2006-2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # The CRM114 and CRM114 Mailfilter FAQ (last update - 2006-06-06 by WSY) *** What does CRM114 stand for? - CRM stands for Controllable Regex Mutilator, concept # 114. It's a mutilating regex engine, designed to slice and dice text with the vigor of a Cuisinart on an overripe zucchini. There is no truth to the rumor that it really means "Crash's Regex Monstrosity". *** Very funny. What does CRM114 _really_ stand for? - CRM114, or more accurately, "the CRM114 Discriminator" is from the Stanley Kubrick movie "Dr. Strangelove". (an _excellent_ movie- you should go buy it and watch it. Really. Some critics have said it is the greatest movie of it's era; others are more accurate and call it the greatest movie _ever_ made. In my opinion, a hundred years from now, Dr. 
Strangelove will be considered the _definitive_ satirical history of the Cold War and perhaps of the second half of the 20th Century, an archetype movie, in the same class as "Metropolis", "Nosferatu", "M", "The Wizard of Oz", "Blade Runner", and "Star Wars"). But I digress... Anyway, the "CRM114 Discriminator" in the movie is a fictional accessory for a radio receiver that's "designed not to receive _at all_", that is, unless the message is properly authenticated. That was the original goal of CRM114 - to discriminate between authentic messages of importance, and get rid of the rest. Note the emphasis on "get rid of". Unlike many other "filters", CRM114's default action is to read all of input, and put NOTHING onto output. The simplest possible CRM114 program does exactly that, read all of stdin, and throw it away. With vigor. *** I tried a "train", and I got an error message like: [X-CRM114-Action: LEARN AS NONSPAM UNNECESSAR\Y- ALREADY CLASSIFIED CORRECTLY - NO ACTION TAKEN] (or the same for a SPAM LEARN) - Ahhh... you've got a mail delivery or mail reading program that changed the headers just a little bit, so what was spam is now nonspam, or vice versa. The fix is to tell the system to override this "safety valve", either by: 1) Switching over to mailreaver.crm :) 2) use the "force" command, either in the inline command like: "command secretpassword spam force" or in the command line, as: mailfilter.crm --spam --force < my_text.txt Either will do it for you. *** I've got a bug! What do I do? - First, read the _whole_ HOWTO, which will tell you a few useful tricks and diagnostics. You can +probably+ fix the bug yourself. Then, read the rest of this document (the FAQ). If it's not clear at this point, and you _really_ have read both the FAQ and the HOWTO, then you have a choice: 1) Smart computer people will then _also_ read the QUICKREF.txt file to understand the CRM114 language, and then the INTRO.txt file to see how it all works. 
Then they might try debugging themselves. 2) Less computer-savvy types at this point might want to put their question onto the crm114 mailing list. Hint: the location of the mailing list is hidden somewhere in the documents available from the CRM114 webpage, which includes the above - the HOWTO, the FAQ, the QUICKREF, and the INTRO. So, there's a good reason to read the docs. :-) *** This is a BIG bug! I got it to SEGFAULT! - Okay, that's bad. It shouldn't do that. You should still read this document, the HOWTO, the QUICKREF, and known-bugs.txt, as the bug may already be known and a workaround developed (or it may be a problem with some external system that's also known). Of course, if you've managed to SEGFAULT the CRM114 system, then try to reduce your program and data right down to the minimum needed (if you're using an unaltered mailfilter.crm, then don't worry about this). Then please let us know on the main CRM114 general list. IF YOU POST A BUG REPORT, _PLEASE_ INCLUDE THE FOLLOWING: * - What version of CRM114 you are running (find out by typing "crm -v", or by turning on "add headers" and looking at the X-CRM114-Version header. If you can, include the headers in your bug report. * - What version (if any) of mailfilter.crm you're using (if you have "add headers" turned on, which it is by default, you will have this as a checksum in the X-CRM114-Version header as "MF-something", where "something" is eight hexadecimal digits. * - Any other details that might be pertinent, like how you invoke CRM114 (via procmail, via .forward, etc). *** I'm training my CRM114 install. When I mail mistakes back to myself to retrain, I was wondering which headers to include and where to place the retrain command? << For Reaver-based installs (which are now recommended) >> You have a Reaver-based system if you are running mailreaver.crm ; this is good because mailreaver.crm uses very little of your text. 
All it really needs are the reaver-cache IDs, which look like sfid-yyyymmdd-hhmmss-uuuuuu-HHHHHHHH (where ymdhmsu are the year, month, day, hour, minute, second, and microsecond when the mail hit CRM114 the first time, and H is a hexadecimal checksum.). There's a cache of files (in sorta-like-maildir format) that contains SMTP-time copies of the email; those copies are "clean" and as long as the training information you send to yourself contains either the intact X-CRM114-CacheID: sfid line, or a Message-Id: containing the above sfid) then mailreaver.crm will use the clean copy of the text and you don't need to worry at all. << for non-Reaver installs - no longer recommended >> The basic rule is to make the stuff after the COMMAND look as much like the original misclassified mail as possible. Since you can configure CRM114 to do things to the subject, the header, and the body, you have to _undo_ that stuff ( design flaw there? ) What you should strive for is an email "forward" below the COMMAND line that looks _exactly_ like the mail looked before it got to CRM114 in the first place, with all the headers there and intact. Expand the headers fully - then remove all of the headers that CRM114 inserted. Then check the text; down a ways, you may find a CRM114 "statistics" section - remove that. You may also find an "extra stuff" section, remove that too. Then put the COMMAND line right before the first of the expanded headers. Then, before you hit , check your work; you can cost yourself considerable accuracy by training in the wrong things! *** I want to change classifiers. Can I just change it in the mailfilter :clf: setup? - Sorry, NO. That's not the way it works. You pick a classifier in mailfilter with the :clf: variable (that stands for CLassifier Flags), and everything that you LEARN in will depend on that initial choice. Once you pick it, you MUST stay with that :clf: setup until you are willing to delete your .css files and retrain from scratch. 
The reason is that all the classifiers have file formats that are all similar enough (except for Hyperspace, which uses a varying bucket length, and Correlator, which uses plain text) that some of the utilities like cssutil *think* they can work on them all. Boy, are they _wrong_. Except for Markovian, OSB, and OSB-Unique, they are NOT interchangeable. Switching between Markovian, OSB, and OSB-Unique classifiers is permitted if you set a flag in the file crm114_config.h. This flag was supposed to be default on, but several releases were made with it off, and now it's basically "stuck" as a historical artifact and thus although you can use the same utility for statistics operations on Markov, OSB, and OSB-Unique, you can't crossover classify with them. We're sorry. Anyway, once you train with a particular classifier, you should stay with it until/unless you remove the .css files and retrain from scratch. Note: if you keep two separate directories of "good" and "spam" email, then mailtrainer.crm will let you build the statistics files automatically. This is how we try out new classifiers. << Gruesome Details Warning -- OK to skip >> Markovian, OSB, and OSB-Unique all use the same .css file format, Each feature bucket has a hash field is a pair of INT32's, and the data is one INT32. The first bucket is reserved for version information, but that code is curently disabled. All other information is stored "in stream"- that is, hashed, with a secondary hash field == 0, indicating "I am a special field, never microgroom me away". The advantage of this is that Markovian, OSB, and OSB-Unique can "grow features compatibly", and the header will always stay the same size. This is also why you can switch between Markovian, OSB, and OSB-unique with relative impunity (assuming you set the compatibility switch). OSBF uses it's own format. 
Hash field is a pair of INT32's and the data field is bit fields within an INT32 (though nominally when the system isn't filtering or learning, it looks like an INT32.) There's an additional 20-byte header struct that has a bunch of stuff in it as well, and as that struct isn't a multiple of the normal bucket and it's at the front, the byte offsets of OSBF buckets are not at the "normal" offsets. That's why OSBF is written via a bunch of macros that make this all look reasonable - and why OSBF is *not* compatible with Winnow, OSB, and OSB-Unique. Winnow uses it's own format very similar to OSB's format. Hash field is still a pair of INT32's, but the data is a FLOAT32. Everything else is the same. That's why cssutil runs on a winnow .css will give such bogus results- the hash fields are in the right places, but you're treating FLOAT32's as INT32s and that never leads to anything but pain. Hyperspace uses it's own format, with varying-length fields. Hash fields are a 0x00000000-terminated set of INT32 fields (typically a few thousand of them). There is no data field as multiple counts of a hyperspace feature are rare and in the event that they actually occur, it uses less space to just repeat the occasional INT32 hash that represents them rather than to use up file space for a value that is almost always 0x00000001. Correlator uses it's own format. It is actually plain text, no headers, no delimiters. *** What do these wierd version names mean? - The version _number_ of a CRM114 release is the year-month-day at which that version went into testing. For example, 20031225 means year 2003, month 12, day 25, or Christmas Day, 2003. This makes it easy to see how old a version of CRM114 is. As open-source software revs very quickly to fix bugs and incorporate improvements, if you have a version more than a month or two old you probably are running obsolete software. 
The -Blame is an easier way to refer to a version; it reflects one or more of: 1) someone who sent in a massive or important patch that fixed a big problem 2) someone who pointed out a big problem, thereby motivating me to fix it 3) someone or something that either motivated me to get some work done, or impeded that work, by means unstated (but you can often guess... I'll give you a hint- the sushi waitresses did _not_ send in a patch). Generally speaking, it is an _honor_ to be blamed for a particular release of CRM114, and recipients of that honor should wear it proudly. :-) *** I've got a _ton_ of spam in my library. Why shouldn't I just load it into CRM114 and get a head start on training? - This used to be a bad idea, and unless you use a special tool like mailtrainer.crm, it's still a bad idea. But, if you do have such a tool (and we include mailtrainer with the kit now) it's actually quite useful. Here's why you shouldn't do massive bulk loadins: CRM114's learning algorithm is predicated on using the TOE strategy (or variations thereof, like DSTTTR which give further accuracy boost) - that is, Train Only Errors. When CRM114's mailfilter makes a mistake, you train in the right result and it will do better next time. But say you bulk-load all of your good and spam email. You will end up with bloated overflowing classifier files and they won't be very accurate, because they contain a lot of extraneous information. I've tested this _exhaustively_, spending a few CPU-weeks in the process; CRM114 really does work best if you train in only errors, and in the order encountered. It's about a factor of two times more accurate, and about a factor of two times faster during the execution. The actual numbers work out something like this: I used a torture-test corpus of 4147 messages, split roughly 60/40 between nonspam and spam. Running TOE, with the 5th order polynomial and entropic correction, the error rate curve showed a nice exponential approach to zero errors. 
Reshuffling the corpus of 4147 messages ten different ways, the final error rate (that is, the error rate in the last 500 messages) was just about 6.9 errors per 500 final messages, or 1.3% (very good on such a difficult corpus. I _personally_, when hand-scoring these messages, get about a 30% error rate). Training _every_ sample yielded about 14.9 errors in the final 500 messages, or an error rate of about 2.9 %. Interestingly, the error curve or training every sample dove more quickly initially, but then _rose_ again as new items were trained. The relative runtimes were 14 minutes (roughly) for TOE and training only errors, versus about 29 minutes (roughly) for training everything, averaged across the 10 runs of 4147 messages each. So, if you don't mind being something more than a factor of 2 less accurate, and twice as slow, you can go ahead and train everything. Seriously- if you want accuracy, start from an empty .css file and train only errors, as you encounter them. *** But WHY does it work better to train only errors ? - Intuitively, here's how you can understand it: If you train in only on an error, that's close to the minimal change necessary to obtain correct behavior from the filter. If you train in something that would have been classified correctly anyway, you have now set up a prejudice (an inappropriately strong reaction) to that particular text. Now, that prejudice will make it _harder_ to re-learn correct behavior on the next piece of text that isn't right. Instead of just learning the correct behavior, we first have to unlearn the prejudice, and _then_ learn the correct behavior. It can be done- but it doesn't converge on the right answer as fast as never getting these unwarranted prejudices in the first place. In filters as in people, prejudices are generalizations that are best avoided. ----- There is a secondary effect as well, due to the limits to growth of the .css files. 
If you train everything, you will typically start seeing CRM114 go into microgrooming at around a megabyte of text. This is because there is a limited amount of space in a growth-limited .css file. When you reach this point, for every new feature added, at least one old feature must be forgotten. This loss of information is a mixed blessing- although useful information is now being lost, old prejudices are also being forgotten. This slow tracking allows even an aging, saturated CRM114 system data base to adapt to an evolving spam stream. Nota Bene: It actually turns out that the above is almost completely true, but not _absolutely_ true. You can get significant accuracy improvement if you train not just errors, but also "almost errors". Typically (for the default OSB classifiers) anything with pR scores between -10 and +10 can be trained in for extra accuracy. This is called "thick threshold training *** What is this "mailtrainer.crm" tool of which you speak? - Mailtrainer.crm is a program that you supply a pair of directorys to, and a few extra parameters, and it goes off and does thick-threshold optimization training on your .css files to give you really good classification. It's not very fast (about 50 messages/second) and makes multiple passes (usually 3 to 5) but will about double your accuracy if allowed a "full grind" of 5 completed passes on a decent-sized corpus. The exact command to invoke mailtrainer.crm will vary depending on your version, so you should look in the HOWTO or README for more information. So- go there now. *** Why are the bucket files called .css files? They aren't cascading style sheets. The .css suffix for SBPH bucket files originally stood for CRM Sparse Spectra, until it was pointed out by a colleague that "sparse spectra" was actually taken by another related but different method. The name stuck, even though it was no longer strictly accurate. *** How accurate is CRM114 for anti-spam filtering? 
- Depending on your spam/nonspam mix, _very_ accurate. I regularly clock over 99%; I've had months where it was over 99.9%. DON'T expect this level of performance without training on your errors for a week or more. Also note: spam _evolves_. A filter that was perfect in June may be making errors by December as spam topics change and attacks vary. Don't feel bad if you have to retrain. That's part of the spammer's attacks; if the spammers stop mutating their attacks it means we're no longer making a difference (and impacting their business model) *** It was working fine, then I trained one thing and it started making mistakes again! Did I break it? - Ah, you've encountered what we've termed an "error shower", (or, depending on topic, a Porn Storm). What's happened is that your filter was just on the verge of accuracy; it made an error, you retrained it, but the retrain went too far. Don't worry. Keep training, and the error shower will damp out and you'll quickly converge on an even more accurate filter. Error showers are most common for me in the third to sixth month of use; and usually they occur in groups of four to six related errors. Then they damp away for a month or so; eventually they stop. So relax... *** Why did you make the CRM114 language so weird? - Because I had some ideas about how I thought a "filter language" should be, and wanted to see how they worked out in practice. I had a bad experience with PERL, so I wanted a language where everything was easy to understand, where the actions of a particular statement could always be determined without referring to ANY other statement, let alone "magic mind reading" and "action-at-a-distance"... I probably would do it differently now that I've done it this way. *** So, is CRM114 a mailfilter, or what? - No, CRM114 is actually a language that makes it easy to write filters of any sort. 
The most useful of these so far is for mail filtering; the CRM114 distribution pack contains a pretty reasonable mail filter for people who want it to "just work". Other people have written Usenet filters, Web content filters, and (in a spree of creative hackery) a "cheater seeker" to find people who were playing multiple users in a competitive email-based roleplay game (and, by violating the one-user-one-player-character rule) gaining an unfair advantage over the other game players. *** What algorithm does the mailfilter use? - There's a whole file that just describes it ("classify_details.txt") in the distribution, but in short, it matches short phrases from the incoming text with phrases you supplied previously as example text. In reality, it does a lot of hashing and polynomials to make the run time acceptable. I call the filtering algorithm Sparse Binary Polynomial Hashing with Bayesian Chain Rule evaluation (SBPH/BCR), which gives you a vague idea of how it might work inside. Note that in CRM114's included Mailfilter.crm, we do NOT do "special tagging", such as creating special tokens saying "This was in the Subject" or "This was in the Received header chain". The short-phrase sliding window is long enough that such tokens aren't necessary. Minor Update- by altering the weightings of different lengths of short phrases, it's possible to change the behavior of SBPH/BCR from a strict Bayesian, to an entropically-corrected Bayesian, to a Markovian matcher. Releases since roughly 20040101 have all had this improved Markovian matcher as the default configuration as this has been tested and demonstrated to provide the best performance. *** So that's it? - Mathematically, yes. But since about 2003-11-xx, the chain rule function has been updated with entropic correction; this puts more weight on longer chains. In effect, this is a Markov model of the data stream with lots of hidden states. 
So, SBPH/BCR is really not SBPH/BCR, it's more of Sparse Binary Polynomial Hashing / Bayesian-Markov Model (SBPH/BMM). The really nice thing about SBPH/BMM is that it's slightly more accurate than the previous SBPH/BCR and it's 100% upward compatible with /BCR data files. All the information was there, it just needed to be used properly. *** Why didn't you just use Bayesian filtering? - I had played with single-word Bayesian filtering from '96 through 2000 and found that it could behave very well on very large input texts (typically, tens to hundreds of megabytes). But my first brutally naive implementation was far too memory-intensive to use for real filtering;
What's probably happened is that your spam/nonspam mix is very different than mine. This causes the words and phrases in your spam/nonspam to not match up with the words/phrases in mine. The fix is to train your spam filter anytime it makes an error. The filter learns very fast; you should see drastic improvement after a single day of error feedback. I usually pass 99% accuracy at two or three days, starting from zero. In extreme cases, delete the pre-generated spam.css and nonspam.css files, and start from scratch with the training. In one day, (and assuming sufficient spam and nonspam) you should be around 97%, two days 98%, and three days > 99%. *** How much data does it take to get that accurate? - Not a lot. At 99.67% accuracy, I only had 84K of nonspam and 185K of spam text. Interestingly, because spam contains a lot of run-on HTML, the total number of hashed datapoint features is roughly equal. *** I tried training in a huge amount of spam or nonspam, and it hung! - Actually, it probably didn't hang, and you shouldn't be doing that anyway. Read up on mailtrainer.crm *** I trained in (some huge amount) of spam and nonspam, and it doesn't work any more!!! - As noted above, you can overflow the buckets in the .css files if you train in too much spam or nonspam. You should get very good results with less than 100K each of spam and nonspam text (roughly equal numbers of messages is good too). Use the most recent spam and nonspam you can get your hands on. Don't use spam more than a few months old for training. And realize, if you're doing any "bulk training", rather than Train Only Errors, that you could be doing 2x _better_ if you trained only errors. So there. :) *** Does CRM114 or the mailfilter work for any language besides English? CRM114 uses 8-bit ASCII characters, and is 8-bit clean except for NULL string terminations (which are forced by the GNU REGEX library, not my decision). 
If you use the included (and defaulted) TRE regex engine instead, it's a NULL-clean system and you should be OK for 8-bit languages. BUT if you use a unicode-based or other wide-character language, you'll need to port up CRM114 to use wchar instead of char, as well as getting unicode-clean regex libraries (there is a version of TRE that does that, nicely enough). This is not a minor undertaking, but if you do it, please let me know and I'd gladly roll your changes back into the standard CRM114 kit. That said, if you get _mail_ in any language other than English, there are two possibilities. If you're lucky, you use a language that fits in 8-bit characters. In that case, you can just delete the spam.css and nonspam.css files, and re-train the mailfilter for your local spam mix. Otherwise, you're stuck with wchars, so see above. (Note: new versions of CRM114 since August 2003 default to use the TRE library, which is both 8-bit-safe and has fewer edge errors than the GNU library. The GNU-based version remains available as a Makefile option for those who depend on the GNU idiosyncrasies.) Note: new versions of CRM114 include the TRANSLATE statement as well, to make it easy to coerce 8-bit languages into ASCII or LATIN-1. Additionally you can use Kakasi (google for it) to transliterate Unicode-style languages like Japanese into ASCII. *** Why is LEARNing or CLASSIFYing so slow? - It's not _that_ slow. In fact, it's really quite fast nowadays. With a (relatively slow) Intel Centrino 1.6 GHz and a slow laptop disk, CRM114 can train a little over 50 messages/second, where each training is at least one classify, and if the message wasn't correct, then a TRAIN and another classify. With the text size limiter set at 16Kbytes, that works out to about 800 Kbytes/second, full training. This compares _very_ favorably with most other algorithms, and totally blows the doors off genetic algorithms or neural nets. 
Of course, that assumes that the .css file is already in the UMB's (a reasonable assumption on a Linux machine); if they're not, add a reasonable amount of time for disk I/O to page in the needed bits. Note that because LEARNing and CLASSIFYing do a lot of very randomized accesses into the bucket files, these two verbs will thrash cache pretty intensely. I've had reports that 16MB bucket files will learn or classify at horrendously slow rates- the results are still correct and accurate, but it's very annoying. We have a workaround plan (to do sorted access, or use a tree structure) in consideration. We're now a comfortable two orders of magnitude faster than SpamAssassin- but in the honorable spirit of Open Source Software, I doubt that the SpamAssassin folks will take this lying down. :) *** Why is CRM114 such a memory pig? - It's not _that piggish_. To keep speed up, the CRM114 engine preallocates five buffers for data; each buffer is the size of a data window (default 8 megabytes each, change it with the -w option). Small buffers are allocated dynamically on the stack; expect to see 50K or so there. LEARN and CLASSIFY use mmap to access the .css files as part of virtual memory, so each .css file will consume a fair amount of virtual memory (by default, 24 megabytes per .css file, but this is released as soon as it's no longer needed, and since it's mmaped rather than malloced, it does not require paging file or swapfile space). Also, since mmap does I/O through the fast paging system rather than the file IO system, it runs VERY fast. *** Aren't you afraid spammers will dissect CRM114 in order to beat it? - Not really. The basis of the LEARN/CLASSIFY filter is to look at significant phrases in human language. At least in English, there are relatively few "natural" phrases one can use to sell Viagra, porn, or low-interest mortgages. 
So, a spammer trying to beat CRM114 would have to avoid those phrases, and instead use phrases used in normal non-sales-pitch discourse. The cool part is that the non-sales-pitch discourse has no way to express the sales pitch! The medium cannot carry the message, there's just no way to say it. So the spammers are simply unable to function. *** That sounds awfully close to 1984 and Newspeak. - Yes, I realize this, and _yes_, it bothers me. CRM114 could provide a uniquely powerful tool for censorship. But from what I can tell from the public literature, the concept of phrase analysis is nothing new to the CIA or the NSA. *** Why can't you give me your sample spam and nonspam files? - I can't give the text out because I don't own the copyright on it! Spam text often has a copyright notice at the bottom, and nonspam text (stuff my friends/cow-orkers/etc send me) is clearly copyright _them_, not _me_. So, it would be a gross breach of confidence at the very least, if not an outright violation of any reasonable copyright law, for me to distribute that text. Fear not, you don't _have_ to trust my "magic files" to not contain a hidden agenda. You can rebuild the .css files with your own spamtext.txt and nonspamtext.txt files easily. Just delete *.css and then create two files of spam and nonspam "spamtext.txt" and "nonspamtext.txt". Run the "make cssfiles" command and new .css files will be built. Even better, delete the .css files, type cssutil -b -s spam.css cssutil -b -s nonspam.css and train only errors for a few days; you'll end up with a highly accurate filter that matches exactly the kind of mail you get, and the kind of spam you get. ------ OLD, OBSOLETE QUESTIONS ------- *** When will CRM114 go to full Bayesian? As of Nov 1 2002, it has. :-) See the file "classify_details.txt" for the full scoop. 
We may change the Bayesian Chain Rule at some point in the future; the reason is that the standard Bayesian Chain Rule (BCR) has an underlying assumption of statistical independence on the input events. Unfortunately, spam features and nonspam features are NOT independent and so BCR is really quite incorrect to use. I'm working on better alternatives and they will appear as they are found, tested, and proven to work better than BCR. crm114-20100106-BlameMichelson.src/windowtest_fromvar.crm0000755000000000017500000000145211321154266021450 0ustar rootwsy#! /usr/bin/crm # # windowtest_fromvar.com - testing windowing on windows from a variable # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. { output /:*:_nl: CRM114: testing windowing on windows from a variable :*:_nl: :*:_nl:/ isolate (:instuff:) /:*:_dw:/ alter (:_dw:) /data window test text / isolate (:t:) output / Input stuff is : :*:instuff: :*:_nl: :*:_nl:/ output /:*:_nl: testing delimited by 'A':*:_nl: / { window (:t:) (:instuff:) /A/ /A/ output / Data Window: :*:_dw: :*:_nl:/ output / Windowed value = :*:t: :*:_nl:/ output / Remaining stuff = :*:instuff: :*:_nl: :*:_nl:/ liaf } alius { output / Bounced out of the WINDOW loop -- no further stuff/ output /:*:_nl:/ } output / End of window-from-variable testing :*:_nl:/ } crm114-20100106-BlameMichelson.src/rewritetest.crm0000755000000000017500000000333411321154266020067 0ustar rootwsy#! /usr/bin/crm # # rewritetest.mfp - rewrite test with rewrites.mfp # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. 
# # do rewriting, based on rewrite rules in rewrites.mfp # # Start with some data in the :_dw: window output /:*:_nl: --- CRM114 testing string rewrites :*:_nl: :*:_nl:/ output /------------------------:*:_nl:/ alter (:_dw:) /abc:*:_nl:frobnitz_singleline this should trigger :*:_nl:def:*:_nl:frobnitz_multiline zebra:*:_nl:giraffe:*:_nl:and so should this.:*:_nl:testpattern:*:_nl:mno:*:_nl:/ accept output /------------------------:*:_nl:/ { isolate (:rewrites:) input (:rewrites:) [test_rewrites.mfp] # reset matching on rewrites to start of string match [:rewrites:] // } # { # Grab the next regexturn the one-per-line patterns into a regex # First, do the line-spanning regexes. match (:ch: :fr: :to:) [:rewrites:] /(.+)>-->(.*)/ # see if the "fr" regex matches anywhere { match (:place:) /:*:fr:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ liaf } # Nope, didn't match... grab the next regex and try again, liaf } # reset back to the start of the rewrites. match [:rewrites:] // # and do it again for non-line-spanners { # Go through and do it again, except this time do it for # the non-line-spanning regexes. match (:ch: :fr: :to:) [:rewrites:] /(.+)>->(.*)/ # see if the "fr" regex matches anywhere { match (:place:) /:*:fr:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ liaf } # Nope, didn't match... grab the next regex and try again, liaf } accept crm114-20100106-BlameMichelson.src/crm_vector_tokenize.c0000644000000000017500000006770311321154266021231 0ustar rootwsy// crm_vector_tokenize.c - vectorized tokening to create 32-bit hash output // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
// include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" /////////////////////////////////////////////////////////////////////////// // // This code section (from this comment block to the one declaring // "end of section dual-licensed to Bill Yerazunis and Joe // Langeway" is copyrighted and dual licensed by and to both Bill // Yerazunis and Joe Langeway; both have full rights to the code in // any way desired, including the right to relicense the code in // any way desired. // //////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// // // Vectorized tokenizing - get a bunch of features in a nice // predigested form (a counted array of chars plus control params // go in, and a nice array of 32-bit ints come out. The idea is to // encapsulate tokenization/hashing into one function that all // CRM114 classifiers can use, and so improved tokenization raises // all boats equally, or something like that. // // If you need two sets of hashes, call this routine twice, with // different pipeline coefficient arrays (the OSB and Markov // classifiers need this) // // If the features_out area becomes close to overflowing, then // vector_stringhash will return with a value of next_offset <= // textlen. If next_offset is > textlen, then there is nothing // more to hash. // // The feature building is controlled via the pipeline coefficient // arrays as described in the paper "A Unified Approach To Spam // Filtration". In short, each row of an array describes one // rendition of an arbitrarily long pipeline of hashed token // values; each row of the array supplies one output value. 
Thus, // the 1x1 array {1} yields unigrams, the 5x6 array // // {{ 1 3 0 0 0 0} // { 1 0 5 0 0 0} // { 1 0 0 11 0 0} // { 1 0 0 0 23 0} // { 1 0 0 0 0 47}} // // yields "Classic CRM114" OSB features. The unit vector // // {{1}} // // yields unigrams (that is, single units of whatever the // the tokenizing regex matched). The 1x2array // // {{1 1}} // // yields bigrams that are not position nor order sensitive, while // // {{1 2}} // // yields bigrams that are order sensitive. // // Because the array elements are used as dot-product multipliers // on the hashed token value pipeline, there is a small advantage to // having the elements of the array being odd (low bit set) and // relatively prime, as it decreases the chance of hash collisions. // // NB: the reason that we have "output stride" is that for some formats, // we want more than 32 bits per feature (Markov, standard OSB, Winnow, // etc.) we need to interleave hashes, and "stride" makes that easy. // /////////////////////////////////////////////////////////////////////////// long crm_vector_tokenize ( char *txtptr, // input string (null-safe!) long txtstart, // start tokenizing at this byte. long txtlen, // how many bytes of input. 
char *regex, // the parsing regex (might be ignored) long regexlen, // length of the parsing regex int my_regex_cflags, // regex flags int *coeff_array, // the pipeline coefficient control array int pipe_len, // how long a pipeline (== coeff_array row length) int pipe_iters, // how many rows are there in coeff_array unsigned *features, // where the output features go long featureslen, // how many output features (max) int features_stride, // Spacing (in words) between features long *features_out, // how many unsigned ints did we actually use up long *next_offset // next invocation should start at this offset ) { unsigned int hashpipe[UNIFIED_WINDOW_LEN]; // the pipeline for hashes unsigned int ihash; long keepgoing; // the loop controller regex_t regcb; // the compiled regex regmatch_t match[5]; // we only care about the outermost match long i, j, k; // some handy index vars int regcomp_status; long text_offset; long max_offset; long irow, icol; char errortext[4096]; // now do the work. *features_out = 0; keepgoing = 1; j = 0; // Compile the regex. if (regexlen) { regcomp_status = crm_regcomp (®cb, regex, regexlen, my_regex_cflags); if (regcomp_status > 0) { crm_regerror (regcomp_status, ®cb, errortext, 4096); nonfatalerror5 ("Regular Expression Compilation Problem: ", errortext, CRM_ENGINE_HERE); return (-1); }; }; // fill the hashpipe with initialization for (i = 0; i < UNIFIED_WINDOW_LEN; i++) hashpipe[i] = 0xDEADBEEF ; // Run the hashpipe, either with regex, or without. // text_offset = txtstart; max_offset = txtstart + txtlen; if (internal_trace) fprintf (stderr, "Text offset: %ld, length: %ld\n", text_offset, txtlen); while (keepgoing) { // If the pattern is empty, assume non-graph-delimited tokens // (supposedly an 8% speed gain over regexec) if (regexlen == 0) { k = 0; // k == 0 means found another token.... same as regexec // skip non-graphical characthers match[0].rm_so = 0; //fprintf (stderr, "'%c'", text[text_offset+match[0].rm_so]); while ( (! 
isgraph (txtptr [text_offset + match[0].rm_so])) && ( text_offset + match[0].rm_so < max_offset)) { //fprintf (stderr, ""%c'", txtptr[text_offset+match[0].rm_so]); match[0].rm_so ++; } match[0].rm_eo = match[0].rm_so; while ( (isgraph (txtptr [text_offset + match[0].rm_eo])) && (text_offset + match[0].rm_eo < max_offset)) { //fprintf (stderr, "'%c'", txtptr[text_offset+match[0].rm_eo]); match[0].rm_eo ++; }; if ( match[0].rm_so == match[0].rm_eo) k = 1; } else { k = crm_regexec (®cb, &txtptr[text_offset], max_offset - text_offset, 5, match, REG_EXTENDED, NULL); }; // Are we done? if ( k == 0 ) { // Not done,we have another token (the text in text[match[0].rm_so, // of length match[0].rm_eo - match[0].rm_so size) // if (user_trace) { fprintf (stderr, "Token; k: %ld T.O: %ld len %d ( %d %d on >", k, text_offset, match[0].rm_eo - match[0].rm_so, match[0].rm_so, match[0].rm_eo); for (k = match[0].rm_so+text_offset; k < match[0].rm_eo+text_offset; k++) fprintf (stderr, "%c", txtptr[k]); fprintf (stderr, "< )\n"); }; // Now slide the hashpipe up one slot, and stuff this new token // into the front of the pipeline // // for (i = UNIFIED_WINDOW_LEN; i > 0; i--) // GerH points out that // hashpipe [i] = hashpipe[i-1]; // this smashes stack memmove (& hashpipe [1], hashpipe, sizeof (hashpipe) - sizeof (hashpipe[0]) ); hashpipe[0] = strnhash( &txtptr[match[0].rm_so+text_offset], match[0].rm_eo - match[0].rm_so); // Now, for each row in the coefficient array, we create a // feature. // for (irow = 0; irow < pipe_iters; irow++) { ihash = 0; for (icol = 0; icol < pipe_len; icol++) ihash = ihash + hashpipe[icol] * coeff_array[ (pipe_len * irow) + icol]; // Stuff the final ihash value into features array features[*features_out] = ihash; if (internal_trace) fprintf (stderr, "New Feature: %x at %ld\n", ihash, *features_out); *features_out = *features_out + features_stride ; }; // And finally move on to the next place in the input. // // Move to end of current token. 
text_offset = text_offset + match[0].rm_eo; } else // Failed to match. This is the end... { keepgoing = 0; }; // Check to see if we have space left to add more // features assuming there are any left to add. if ( *features_out + pipe_iters + 3 > featureslen) { keepgoing = 0; } }; if (next_offset) *next_offset = text_offset + match[0].rm_eo; features[*features_out] = 0; features[*features_out+1] = 0; if (internal_trace) fprintf (stderr, "VT: Total features generated: %ld\n", *features_out); return (0); } /////////////////////////////////////////////////////////////////////////// // // End of code section dual-licensed to Bill Yerazunis and Joe Langeway. // //////////////////////////////////////////////////////////////////////////// static int markov1_coeff [] = { 1, 0, 0, 0, 0, 1, 3, 0, 0, 0, 1, 0, 5, 0, 0, 1, 3, 5, 0, 0, 1, 0, 0, 11, 0, 1, 3, 0, 11, 0, 1, 0, 5, 11, 0, 1, 3, 5, 11, 0, 1, 0, 0, 0, 23, 1, 3, 0, 0, 23, 1, 0, 5, 0, 23, 1, 3, 5, 0, 23, 1, 0, 0, 11, 23, 1, 3, 0, 11, 23, 1, 0, 5, 11, 23, 1, 3, 5, 11, 23 }; static int markov2_coeff [] = { 7, 0, 0, 0, 0, 7, 13, 0, 0, 0, 7, 0, 29, 0, 0, 7, 13, 29, 0, 0, 7, 0, 0, 51, 0, 7, 13, 0, 51, 0, 7, 0, 29, 51, 0, 7, 13, 29, 51, 0, 7, 0, 0, 0, 101, 7, 13, 0, 0, 101, 7, 0, 29, 0, 101, 7, 13, 29, 0, 101, 7, 0, 0, 51, 101, 7, 13, 0, 51, 101, 7, 0, 29, 51, 101, 7, 13, 29, 51, 101 }; #ifdef JUST_FOR_REFERENCE // hctable is where the OSB coeffs came from- this is now just a // historical artifact - DO NOT USE THIS!!! 
static int hctable[] =
  { 1, 7,
    3, 13,
    5, 29,
    11, 51,
    23, 101,
    47, 203,
    97, 407,
    197, 817,
    397, 1637,
    797, 3277 };
#endif	// JUST_FOR_REFERENCE

//   OSB (Orthogonal Sparse Bigram) coefficients, first of the pair.
static int osb1_coeff [] =
  { 1, 3, 0, 0, 0,
    1, 0, 5, 0, 0,
    1, 0, 0, 11, 0,
    1, 0, 0, 0, 23};

//   OSB coefficients, second of the interleaved 64-bit pair.
static int osb2_coeff [] =
  { 7, 13, 0, 0, 0,
    7, 0, 29, 0, 0,
    7, 0, 0, 51, 0,
    7, 0, 0, 0, 101};

//   String-kernel coefficients (single row, up to 16-character window).
static int string1_coeff [] =
  { 1, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 49, 51 };

static int string2_coeff [] =
  { 51, 49, 43, 41, 37, 31, 29, 23, 19, 17, 13, 11, 7, 5, 3, 1 };

//   Unigram "pipeline": one row, one token.
static int unigram_coeff [] =
  { 1 };

//////////////////////////////////////////////////////////////////////////
//
//     Now, some nice, easy-to-use code wrappers for commonly used
//     versions of the vector tokenizer
//////////////////////////////////////////////////////////////////////////

//  crm_vector_tokenize_selector is the "single interface" to get
//  the right vector tokenizer result given an classifier algorithm default,
//  an int64 "flags", and a coeff vector with pipelen and pipe_iters
//
//  Algorithm:  coeff / pipelen / pipe_iters are highest priority; if
//                coeff is non-NULL, use those.
//              A specfication in the FLAGS is next highest priority; if
//                the FLAGS specifies a particular tokenization, use that.
//              Finally, use the default for the particular classifier
//
//  Nota Bene: you'll have to add new defaults here as new classifier
//  algorithms get added.
//
long crm_vector_tokenize_selector
(
 ARGPARSE_BLOCK *apb,      // The args for this line of code
 char *txtptr,             // input string (null-safe!)
 long txtstart,            //     start tokenizing at this byte.
 long txtlen,              //   how many bytes of input.
 char *regex,              // the parsing regex (might be ignored)
 int regexlen,             //   length of the parsing regex
 int *coeff_array,         // the pipeline coefficient control array
 int pipe_len,             //  how long a pipeline (== coeff_array row length)
 int pipe_iters,           //  how many rows are there in coeff_array
 unsigned *features,       // where the output features go
 long featureslen,         //   how many output features (max)
 long *features_out,       // how many unsigned ints did we actually use up
 long *next_offset         // next invocation should start at this offset
)
{
  //    To do the defaulting, we work from the "bottom up", filling
  //    in defaults as we go.
  //
  //    First, we pick the length by what the classifier expects/needs.
  //    Some classifiers (Markov, OSB, and Winnow) use the OSB feature
  //    set, which is 64-bit features (referred to as "hash and key",
  //    where hash and key are each 32-bit).  Others (Hyperspace, SVM)
  //    use only 32-bit features; FSCM uses them as an ersatz entry
  //    to do index speedup.  And finally, Correlate and
  //    Bit Entropy don't use tokenization at all; getting here with those
  //    is an error of the first water.  :-)
  //
  //    Second, the actual hashing vector is chosen.  Because of a
  //    historical accident (well, actually stupidity on Bill's part)
  //    Markov and OSB use slightly different hashing control vectors; they
  //    should have been the same.
  //
  long long classifier_flags;
  //  NOTE(review): featurebits is assigned below (32 vs 64) but never read
  //  in the visible code - looks vestigial, confirm before removing.
  long featurebits;

  int *hash_vec0;
  int hash_len0;
  int hash_iters0;
  int *hash_vec1;
  int hash_len1;
  int hash_iters1;
  int output_stride = 1;
  char *my_regex;
  int my_regex_len;
  int my_regex_cflags;
  char s1text[MAX_PATTERN];
  long s1len;

  //    For slash-embedded pipeline definitions.
  int ca[UNIFIED_WINDOW_LEN * UNIFIED_VECTOR_LIMIT];

  char *string_kern_regex = ".";
  int string_kern_regex_len = 1;
  char *fscm_kern_regex = ".";
  int fscm_kern_regex_len = 1;

  if (user_trace)
    fprintf (stderr, "Vector tokenization summary: start %ld len %ld\n",
	     txtstart, txtlen);

  //    Set up some clean initial values for the important parameters.
  //    Default is always the OSB featureset, 32-bit features.
  //
  classifier_flags = apb->sflags;
  featurebits = 32;
  hash_vec0 = osb1_coeff;
  hash_len0 = OSB_BAYES_WINDOW_LEN;  // was 5
  hash_iters0 = 4; // should be 4
  hash_vec1 = osb2_coeff;
  hash_len1 = OSB_BAYES_WINDOW_LEN; // was 5
  hash_iters1 = 4; // should be 4
  output_stride = 1;

  //     put in the passed-in regex values, if any.
  my_regex = regex;
  my_regex_len = regexlen;
  my_regex_cflags = REG_EXTENDED;

  //    Now we can proceed to set up the work in a fairly linear way.
  //    Each flag test below may override what the previous ones set,
  //    so the order of these tests is load-bearing.

  //    If it's the Markov classifier, then different coeffs and a longer len
  if ( classifier_flags & CRM_MARKOVIAN)
    {
      hash_vec0 = markov1_coeff;
      hash_vec1 = markov2_coeff;
      hash_iters0 = hash_iters1 = 16;
    };

  //     If it's one of the 64-bit-key classifiers, then the featurebits
  //     need to be 64.
  if ( classifier_flags & CRM_MARKOVIAN
       || classifier_flags & CRM_OSB
       || classifier_flags & CRM_WINNOW
       || classifier_flags & CRM_OSBF
       )
    {
      //     We're a 64-bit hash, so build a 64-bit interleaved feature set.
      featurebits = 64;
      output_stride = 2;
    };

  //       The new FSCM does in fact do tokeniation and hashing over
  //       a string kernel, but only for the indexing.
  if (classifier_flags & CRM_FSCM)
    {
      //      fprintf (stderr, "FSCM selector activated.\n");
      hash_vec0 = string1_coeff;
      hash_len0 = FSCM_DEFAULT_CODE_PREFIX_LEN;
      hash_iters0 = 1;
      hash_vec1 = string2_coeff;
      hash_len1 = 1;
      hash_iters1 = 0;
      if (regexlen > 0)
	{
	  my_regex = regex;
	  my_regex_len = regexlen;
	}
      else
	{
	  my_regex = fscm_kern_regex;
	  my_regex_len = fscm_kern_regex_len;
	};
    };

  //     Do we want a string kernel?  If so, then we have to override
  //     a few things.
  if ( classifier_flags & CRM_STRING)
    {
      //      fprintf (stderr, "String Kernel");
      hash_vec0 = string1_coeff;
      hash_len0 = 5;
      hash_iters0 = 1;
      hash_vec1 = string2_coeff;
      hash_len1 = 5;
      hash_iters1 = 1;
      if (regexlen == 0)
	{
	  my_regex = string_kern_regex;
	  my_regex_len = string_kern_regex_len;
	};
    };

  //     Do we want a unigram system?  If so, then we change a few more
  //     things.
  if ( classifier_flags & CRM_UNIGRAM)
    {
      hash_vec0 = unigram_coeff;
      hash_len0 = 1;
      hash_iters0 = 1;
      hash_vec1 = unigram_coeff;
      hash_len1 = 1;
      hash_iters1 = 1;
    };

  //     Now all of the defaults have been filled in; we now see if the
  //     caller has overridden any (or all!) of them.   We assume that the
  //     user who overrides them has pre-sanity-checked them as well.

  //     First check- did the user override the regex?

  //     Did the user program specify a first slash paramter?  (only
  //     override this if a regex was passed in)
  if (regexlen > 0)
    {
      crm_get_pgm_arg (s1text, MAX_PATTERN, apb->s1start, apb->s1len);
      s1len = apb->s1len;
      s1len = crm_nexpandvar (s1text, s1len, MAX_PATTERN);
      my_regex = s1text;
      my_regex_len = s1len;
    };

  //      Did the user specify a pipeline vector set ?   If so, it's
  //      in the second set of slashes.
  {
    char s2text[MAX_PATTERN];
    long s2len;
    long local_pipe_len;
    long local_pipe_iters;
    char *vt_weight_regex = "vector: ([ 0-9]*)";
    regex_t regcb;
    long regex_status;
    regmatch_t match[5];   //  We'll only care about the second match

    local_pipe_len = 0;
    local_pipe_iters = 0;

    //     get the second slash parameter (if used at all)
    crm_get_pgm_arg (s2text, MAX_PATTERN, apb->s2start, apb->s2len);
    s2len = apb->s2len;
    s2len = crm_nexpandvar (s2text, s2len, MAX_PATTERN);

    if (s2len > 0)
      {
	//   Compile up the regex to find the vector tokenizer weights
	crm_regcomp
	  (&regcb, vt_weight_regex, strlen (vt_weight_regex),
	   REG_ICASE | REG_EXTENDED);

	//   Use the regex to find the vector tokenizer weights
	regex_status = crm_regexec (&regcb,
				    s2text,
				    s2len,
				    5,
				    match,
				    REG_EXTENDED,
				    NULL);

	//   Did we actually get a match for the extended parameters?
	if (regex_status == 0)
	  {
	    char *conv_ptr;
	    long i;

	    //  Yes, it matched.  Set up the pipeline coeffs specially.
	    //   The first parameter is the pipe length
	    conv_ptr = & s2text[match[1].rm_so];
	    local_pipe_len = strtol (conv_ptr, &conv_ptr, 0);
	    if (local_pipe_len > UNIFIED_WINDOW_LEN)
	      {
		nonfatalerror5 ("You've specified a tokenizer pipe length "
				"that is too long.", " I'll trim it.",
				CRM_ENGINE_HERE);
		local_pipe_len = UNIFIED_WINDOW_LEN;
	      };
	    //fprintf (stderr, "local_pipe_len = %ld\n", local_pipe_len);

	    //   The second parameter is the number of repeats
	    local_pipe_iters = strtol (conv_ptr, &conv_ptr, 0);
	    if (local_pipe_iters > UNIFIED_VECTOR_LIMIT)
	      {
		nonfatalerror5 ("You've specified too high a tokenizer "
				"iteration count.", " I'll trim it.",
				CRM_ENGINE_HERE);
		local_pipe_iters = UNIFIED_VECTOR_LIMIT;
	      };
	    //fprintf (stderr, "pipe_iters = %ld\n", local_pipe_iters);

	    //    Now, get the coefficients.
	    for (i = 0; i < local_pipe_len * local_pipe_iters; i++)
	      {
		ca[i] = strtol (conv_ptr, &conv_ptr, 0);
		//  fprintf (stderr, "coeff: %ld\n", ca[i]);
	      };

	    //   If there was a numeric coeff array, use that, else
	    //   use our slash coeff array.
	    if (! coeff_array)
	      {
		coeff_array = ca;
		pipe_len = local_pipe_len;
		pipe_iters = local_pipe_iters;
	      };
	  };
	//  free the compiled regex.
	crm_regfree (&regcb);
      };
  };

  //      if any non-default coeff array was given, use that instead.
  if (coeff_array)
    {
      hash_vec0 = coeff_array;
      //                    GROT GROT GROT --2nd array should be different from
      //                    first array- how can we do that nonlinearly?
      //                    This will work for now, but birthday clashes will
      //                    happen more often in 64-bit featuresets
      hash_vec1 = coeff_array;
    };

  if (pipe_len > 0)
    {
      hash_len0 = pipe_len;
      hash_len1 = pipe_len;
    };

  if (pipe_iters > 0)
    {
      hash_iters0 = pipe_iters;
      hash_iters1 = pipe_iters;
    };

  //    Final bit - did the user specify <nocase> or <nomultiline> or
  //    <literal> or any other match flags?  Yes, it's madness to use
char *regex, // the parsing regex (might be ignored) long regexlen, // length of the parsing regex unsigned *features, // where the output features go long featureslen, // how many output features (max) long *features_out, // how many longs did we actually use up long *next_offset // next invocation should start at this offset ) { return crm_vector_tokenize ( txtptr, txtstart, txtlen, regex, regexlen, 0, markov1_coeff, 5, 16, features, featureslen, 2, // stride 2 for 64-bit features features_out, next_offset ); } // crm_vector_markov_2 is the H2 field in the Markov classifier. long crm_vector_markov_2 ( char *txtptr, // input string (null-safe!) long txtstart, // start tokenizing at this byte. long txtlen, // how many bytes of input. char *regex, // the parsing regex (might be ignored) long regexlen, // length of the parsing regex unsigned *features, // where the output features go long featureslen, // how many output features (max) long *features_out, // how many longs did we actually use up long *next_offset // next invocation should start at this offset ) { return crm_vector_tokenize ( txtptr, txtstart, txtlen, regex, regexlen, 0, markov2_coeff, 5, 16, features, featureslen, 2, // Stride 2 for 64-bit features features_out, next_offset ); } // vectorized OSB featureset generator. // long crm_vector_osb1 ( char *txtptr, // input string (null-safe!) long txtstart, // start tokenizing at this byte. long txtlen, // how many bytes of input. 
char *regex, // the parsing regex (might be ignored) long regexlen, // length of the parsing regex unsigned *features, // where the output features go long featureslen, // how many output features (max) long *features_out, // how many longs did we actually use up long *next_offset // next invocation should start at this offset ) { return crm_vector_tokenize ( txtptr, txtstart, txtlen, regex, regexlen, 0, osb1_coeff, OSB_BAYES_WINDOW_LEN, 4, // should be 4 features, featureslen, 2, features_out, next_offset ); } long crm_vector_osb2 ( char *txtptr, // input string (null-safe!) long txtstart, // start tokenizing at this byte. long txtlen, // how many bytes of input. char *regex, // the parsing regex (might be ignored) long regexlen, // length of the parsing regex unsigned *features, // where the output features go long featureslen, // how many output features (max) long *features_out, // how many longs did we actually use up long *next_offset // next invocation should start at this offset ) { return crm_vector_tokenize ( txtptr, txtstart, txtlen, regex, regexlen, 0, osb2_coeff, OSB_BAYES_WINDOW_LEN, 4, // should be 4 features, featureslen, 2, features_out, next_offset ); } // vectorized string kernel featureset generator. // long crm_vector_string_kernel1 ( char *txtptr, // input string (null-safe!) long txtstart, // start tokenizing at this byte. long txtlen, // how many bytes of input. long string_kern_len, // length of the kernel (must be < 16) unsigned *features, // where the output features go long featureslen, // how many output features (max) long *features_out, // how many longs did we actually use up long *next_offset // next invocation should start at this offset ) { // The coeffs should be relatively prime. Relatively... 
if (string_kern_len > 15) string_kern_len = 15; return crm_vector_tokenize ( txtptr, txtstart, txtlen, ".", // regex 1, // regexlen 0, string1_coeff, string_kern_len, // how many coeffs to use 1, // how many variations (just one) features, featureslen, 1, features_out, next_offset ); } long crm_vector_string_kernel2 ( char *txtptr, // input string (null-safe!) long txtstart, // start tokenizing at this byte. long txtlen, // how many bytes of input. long string_kern_len, // length of the kernel (must be < 16) unsigned *features, // where the output features go long featureslen, // how many output features (max) long *features_out, // how many longs did we actually use up long *next_offset // next invocation should start at this offset ) { // The coeffs should be relatively prime. Relatively... if (string_kern_len > 15) string_kern_len = 15; return crm_vector_tokenize ( txtptr, txtstart, txtlen, ".", // regex 1, // regexlen 0, string2_coeff, string_kern_len, // how many coeffs to use 1, // how many variations (just one) features, featureslen, 1, features_out, next_offset ); } //#define DUMMY_MAIN_TEST #ifdef DUMMY_MAIN_TEST // int main2() { char input [1024]; long i, j; unsigned long feavec [2048]; char my_regex [256]; long coeff[]= { 1, 3, 0, 0, 0, 1, 0, 5, 0, 0, 1, 0, 0, 11, 0, 1, 0, 0, 0, 23 } ; strcpy (my_regex, "[[:alpha:]]+"); printf ("Enter a test string: "); scanf ("%128c", &input[0]); crm_vector_stringhash ( input, 0, strlen(input), my_regex, strlen (my_regex), coeff, 5, 4, feavec, 2048, & j, & i); printf ("... and i is %ld\n", i); exit(0); } #endif // DUMMY_MAIN_TEST crm114-20100106-BlameMichelson.src/mailreaver.crm0000755000000000017500000005461511321154266017645 0ustar rootwsy#! 
/usr/bin/crm # -(spam good cache dontstore stats_only outbound undo verbose maxprio minprio delprio fileprefix config) # # mailreaver.crm - 3rd Generation mail filter "standard script" # # Note to SunOS and FreeBSD users - do not place command arguments of # "-([arguments])" format on the first line of this program # or you will not get what you expect. This is due to a kernel # difference in how a bangline should be dealt with. # Copyright 2002-2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # This is MailReaver, the 3rd Generation mail filter "standard script" # for CRM114. The goal is to make a more maintainable, understandable, # and easier-to-customize mail filter. # # 1) we use the consolidated library "maillib.crm" for most shareable # things like parsing the .cf file, munging text, cacheing text, etc. # # 2) we always use the CacheIDs and the Reaver (maildir-like) format # for storing incoming email in unaltered form if there is # any possibility of training it. # # 3) We always train using mailtrainer.crm rather than training # internally. Thus, if you want to change the way things are # trained, you need to look at mailtrainer.crm as well. # # # We support the following commands, on both the command line # in the form "--blahblah --blahblah" and # by using the "command {secretpassword} word word" method # # spam - this is spam. Treat appropriately.[*] # good - this is good. Treat appropriately.[*] # cache - pretend the email starts on the following line. # process it normally and put it in the cache. # dontstore - do NOT store this text into the cache. Use # this for text that you never want to train. # stats_only - output ONLY the status. Nothing else. # Since this means there's no output of the # CacheIDs, we don't store the text for later training. # report_only - output ONLY the headers that would have # been added, not the entire text. # outbound - this is an "outbound" message and hence can # assumed to be nonspam. 
Train if necessary. # undo - the message was mis-trained as type X. Undo that. # verbose - tell me more. # maxprio +/- - add a maximum priority entry # minprio +/- - add a minimum priority entry # delprio - delete a prio list entry. # fileprefix=dir - look in the dir for the .css and .cf files # config=filename - use filename as the .cf file # # [*] - meaning that if it's a command line flag, the entire # standard input is the example text and if it's an inline # command, then the example text follows the inline commandline. # # Overall Design: # # 1) Read in the parameter file # # 2) Check for commands. Set flags as appropriate # # 3) Are any command flags set? # # 3a) run those commands # # 3b) report the results and exit # # ... otherwise... # # 4) Run the priolist # # 5) Run the classifier # # 6) Dispatch on the result # ############################################################## # # Step 1 - Read in the parameter file # ############################################################# # # --- uncomment this if you want to include a "forced" # configuration file --- # # insert mailfilterconfig.crm # # # --- These vars must have a value, or else we'll get errors ---- # isolate (:fileprefix:) // # isolate (:classifier_reason:) /no reason yet/ # isolate (:classify_status:) // # isolate (:our_exit_code:) /0/ # isolate (:stats:) / pR: 0.000000 / # isolate (:pr:) / pR: 0.00000/ # isolate (:subj_text:) / (None) / # isolate (:add_extra_stuff:) // # isolate (:decision_length:) /8192/ # # Isolate these email addresses, and give them values, # in case the user doesn't. 
isolate (:reject_address:) // isolate (:unsure_address:) // isolate (:fail_priority_mail_to:) // isolate (:fail_classify_mail_to:) // isolate (:fail_blacklist_mail_to:) // isolate (:fail_SSM_mail_to:) // isolate (:log_rejections:) // isolate (:report_only:) // # # # load the config file isolate (:config:) /:*:fileprefix:mailfilter.cf/ call /:load_cf_file:/ [:*:config:] ####################################################################### # # Do a quick check- has the password been changed or not? If it's # still the default, put in something that will be well-nigh unguessable # (esp. since it will contain recieved headers that the sender cannot # see nor control.) { match [:spw:] /DEFAULT_PASSWORD/ # yes, it's the same as default. So we scramble it just so # nobody can hack in without major trickery. hash (:spw:) /:*:_env_string::*:_dw:/ } ###################################################################### # # if a particular "fail" category hasn't been assigned, but # the :general_fails_to: category has, then send there instead { match [:fail_priority_mail_to:] /./ isolate (:fail_priority_mail_to:) /:*:general_fails_to:/ } { match [:fail_classify_mail_to:] /./ isolate (:fail_classify_mail_to:) /:*:general_fails_to:/ } ######################################################################### # # START OF ACTUAL MAIL PROCESSING HERE # ######################################################################### # # Does the user want us to log all incoming mail? This is handy for # testing and auditing purposes; by default it is turned off in the # configuration file. { match [:log_to_allmail.txt:] /yes/ output [:*:fileprefix:allmail.txt] /:*:mail_separator:/ output [:*:fileprefix:allmail.txt] /:*:_dw:/ } # allow logging to anywhere... 
{ match [:log_all_mail_to_file:] /./ output [:*:fileprefix::*:log_all_mail_to_file:] /:*:mail_separator:/ output [:*:fileprefix::*:log_all_mail_to_file:] /:*:_dw:/ } ######################################################################## # # 2) Check for a command. Set flags as appropriate. Note that we # can't just dispatch on the presence of a flag, because we need # to merge in anything that might be in an inline command. # ######################################################################## # # Our commands are of the form: # # command password extra_args # # and the extra_args are one or more of # spam # good # cache # dontstore # stats_only # outbound # undo # verbose # # # Start our searching at the start of our input. # If nothing else requires it, we use the current input as the # text to be operated upon. { isolate (:spam: :good: :cache: :dontstore: :stats_only: ) isolate (:outbound: :undo: :verbose:) isolate (:in_text:) /:*:_dw:/ # now find the command, and set the cmdline insert point. match (:: :cmdline:) /.*(.)/ match (:cmdline:) /\n\n/ match (:cmdline: :cmds:) /^command :*:spw: (.*)$/ { # Yep, found a command. Grab remaining text in case # we aren't using the cached version match (:in_text:) /.*/ # # Parse out the command (and in the case of the prio lists, # actually do the work) { # Command to learn spam isolate (:spam:) // match [:cmds:] /spam/ alter (:spam:) /SET/ } { # Command to learn good isolate (:good:) // match [:cmds:] /good/ alter (:good:) /SET/ } { # Command to use the cached text copy isolate (:cache:) // match [:cmds:] /cache/ alter (:cache:) /SET/ } { # Command to NOT store this in the cache for later isolate (:dontstore:) // match [:cmds:] /dontstore/ alter (:dontstore:) /SET/ } { # Command to run stats_only, which implies dont store. 
isolate (:stats_only:) // match [:cmds:] /stats_only/ alter (:stats_only:) /SET/ } { # Command to run outbound mode isolate (:outbound:) // match [:cmds:] /outbound/ alter (:outbound:) /SET/ } { # Command to UNdo isolate (:undo:) // match [:cmds:] /undo/ alter (:undo:) /SET/ } # # The following commands can only be used inline, not # on the command line, and they do NOT use the text. { # Command to set a maxprio entry match [:cmds:] \ /maxprio ([-+][[:graph:]]+) / \ (:: :prio_regex:) input [:*:fileprefix:priolist.mfp] (:priotext:) alter (:priotext:) \ /:*:prio_regex:\n:*:priotext:/ output [:*:fileprefix:priolist.mfp] /:*:priotext:/ call /:mungmail_add:/ [X-CRM114-Success: Added new highest priority entry ":*:prio_regex:" ] accept exit } { # Command to set a minprio entry match [:cmds:] \ /minprio ([-+][[:graph:]]+)/ \ (:: :prio_regex:) input [:*:fileprefix:priolist.mfp] (:priotext:) alter (:priotext:) \ /:*:priotext:\n:*:prio_regex:/ output [:*:fileprefix:priolist.mfp] /:*:priotext:/ call /:mungmail_add:/ [X-CRM114-Success: Added new lowest priority entry ":*:prio_regex:" ] accept exit } { # Command to delete a priolist entry match [:cmds:] \ /delprio ([[:graph:]]+)/ \ (:: :prio_regex:) input [:*:fileprefix:priolist.mfp] (:priotext:) match [:priotext:] \ /^.*:*:prio_regex:.*$/ (:die:) alter (:die:) /\n/ output [:*:fileprefix:priolist.mfp] /:*:priotext:/ call /:mungmail_add:/ [X-CRM114-Success: Deleted priority entry ":*:prio_regex:" ] accept exit } } } ####### Inter-flag dependencies fixed up here. { match [:stats_only:] /SET/ alter (:dontstore:) /SET/ } ################################################################### # # See if there's already a CacheID in the headers- if so, grab it. # isolate (:cacheid:) // { { match [:in_text:] (:: :cacheid:) \ /X-CRM114-CacheID:.*sfid-([[:graph:]]+)/ isolate (:cacheid:) isolate (:long_cacheid:) /:*:text_cache:\/texts\/:*:cacheid:/ } alius # nope, not in the explicit headers. 
Check for it as a # comment in the Message-Id: header. { match [:in_text:] (:: :cacheid:) \ /Message-Id:.*\(sfid-([[:graph:]]+)\)/ isolate (:cacheid:) isolate (:long_cacheid:) /:*:text_cache:\/texts\/:*:cacheid:/ } # ADD OTHER PLACES TO CHECK HERE } #################################################################### # # Do we save the text into the cache, or has that already happened # # Unless "dontstore" is set or we already have a CacheID, we're supposed # to save the text. Saving is the usual case, mind you. Normally this # is the whole text, but if we had an inline "command cache" line then # use whatever text follows the "command password cache" line and stuff # that into the cache rather than the whole text. You don't normally # need to do this except as a prelude to training this new text. # { match [:cacheid:] /./ match [:dontstore:] /SET/ # yes - so the text to be worked follows the command line. # which is already in :in_text: This also calculates the # new cacheids named :cacheid: and :long_cacheid: . call /:reavercache_init:/ call /:reavercache_store:/ [:*:_dw:] } ##################################################################### ##################################################################### # # Command flags are all set; at this point we can run strictly from # the :var: values and the text held in :in_text: # ##################################################################### ##################################################################### # # We still need to cope with the following possibilities: # learn as spam, learn as nonspam, classify, and stats_only. 
# # But at least now we can run the preprocessing on :in_text: # #output /\nPREPROCESS INPUT:\n :*:in_text:\n/ { # note - this work gets thrown away if we are training from cache match (:chopped:) [:in_text: 0 :*:decision_length:] /.*/ alter (:in_text:) /:*:chopped:/ call /:mail_preprocess:/ [:*:in_text:] (:in_text:) } #output /\nPREPROCESS RESULT:\n :*:in_text:\n/ # # Are we supposed to learn this as spam? { match [:spam:] /SET/ { isolate (:tmp:) // syscall () (:tmp:) /\/bin\/ls :*:long_cacheid:/ match [:tmp:] /:*:cacheid:/ # # remember this file on a permanent basis by linking it into # the known-spam directory. syscall /:*:cache_dupe_command: :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/known_spam\/:*:cacheid: / # # Now run mailtrainer.crm on the new copy isolate (:mailtrainer_output:) syscall /:*:fileprefix::*:trainer_invoke_command: --fileprefix=:*:fileprefix: --spam=:*:text_cache:\/known_spam\/:*:cacheid: --good=:*:text_cache:\/empty\/ / () (:mailtrainer_output:) #output /mailtrainer output: ':*:mailtrainer_output:'\n/ # and remove it from the prob_* directories, as # now it's known syscall /\/bin\/rm -rf :*:text_cache:\/prob_spam\/:*:cacheid:/ syscall /\/bin\/rm -rf :*:text_cache:\/prob_good\/:*:cacheid:/ # # now it's trained; put in a marker in the headers call /:mungmail_delete:/ [X-CRM114-Status: ] call /:mungmail_delete:/ [X-CRM114-Unsure: ] call /:mungmail_add:/ [X-CRM114-Action: LEARNED AND CACHED SPAM] # Insert the training report in the msgbody, if desired { match [:add_mailtrainer_report:] /yes/ match (:: :firstline:) /.*(.)/ match (:: :firstline:) /\n\n()/ alter (:firstline:) / -------\n :*:mailtrainer_output:\n ------ \n/ } { match [:confirm_flag_subject_string:] /./ call /:mungmail_mung_subject:/ [:*:confirm_flag_subject_string:] } accept exit /:*:accepted_mail_exit_code:/ } alius { # GROT GROT GROT We should make a better attempt at finding # the file, like looking in known_spam and known_good. 
# match (:: :firstline:) /.*(.)/ match (:: :firstline:) /\n\n()/ alter (:firstline:) /\n\n-----\n Problem: couldn't find the cached text.\n Perhaps you've already trained it? \n No action taken.\n\n/ accept exit /:*:accepted_mail_exit_code:/ } } # # Are we supposed to learn this as good? { match [:good:] /SET/ { isolate (:tmp:) // syscall () (:tmp:) /\/bin\/ls :*:long_cacheid:/ match [:tmp:] /:*:cacheid:/ # # remember this file on a permanent basis by linking it into # the known-good directory. syscall /:*:cache_dupe_command: :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/known_good\/:*:cacheid: / # # Now run mailtrainer.crm on the new copy isolate (:mailtrainer_output:) syscall /:*:fileprefix::*:trainer_invoke_command: --fileprefix=:*:fileprefix: --good=:*:text_cache:\/known_good\/:*:cacheid: --spam=:*:text_cache:\/empty\/ / () (:mailtrainer_output:) #output /mailtrainer output: ':*:mailtrainer_output:'\n/ # and remove it from the prob_* directories, as # now it's known syscall /\/bin\/rm -rf :*:text_cache:\/prob_spam\/:*:cacheid:/ syscall /\/bin\/rm -rf :*:text_cache:\/prob_good\/:*:cacheid:/ # # now it's trained; put in a marker in the headers call /:mungmail_delete:/ [X-CRM114-Status: ] call /:mungmail_delete:/ [X-CRM114-Unsure: ] call /:mungmail_add:/ [X-CRM114-Action: LEARNED AND CACHED GOOD] # Insert the training report in the msgbody, if desired { match [:add_mailtrainer_report:] /yes/ match (:: :firstline:) /.*(.)/ match (:: :firstline:) /\n\n()/ alter (:firstline:) / -------\n :*:mailtrainer_output:\n ------ \n/ } { match [:confirm_flag_subject_string:] /./ call /:mungmail_mung_subject:/ [:*:confirm_flag_subject_string:] } accept exit /:*:accepted_mail_exit_code:/ } alius { # GROT GROT GROT We should make a better attempt at finding # the file, like looking in known_spam and known_good. 
# match (:: :firstline:) /.*(.)/ match (:: :firstline:) /\n\n()/ alter (:firstline:) /\n\n-----\n Problem: couldn't find the cached text.\n Perhaps you've already trained it? \n No action taken.\n\n/ accept exit /:*:accepted_mail_exit_code:/ } } ########################################################################## # # Not a learn, so it's a CLASSIFY job. Maybe full, maybe stats_only. # We try the Priolist first (eventually) then fall back on the # classifiers. Priolist patterns start in column 1, and are a + or # a - immediately followed by a regex. # isolate (:priolist:) input (:priolist:) [:*:fileprefix:priolist.mfp] # # { #... Grab the next regexturn the one-per-line patterns into a regex match (:w: :pm: :pat:) [:priolist:] /(.)(.+)/ #... see if this regex matches the incoming mail { match (:reason:) /:*:pat:/ # Yep, it matched... branch based on pm # { match [:pm:] /[+]/ # put in a little tag saying why prio-listed alter (:classifier_reason:) /** ACCEPT: CRM114 Priority Whitelisted by: :*:reason: **:*:_nl:/ alter (:stats:) /Match to priority pattern :*:pat:\n Forced pR: 999.99 / match [:stats:] (:: :pr:) /Forced pR: ([[:graph:]]+)/ goto /:looks_good:/ } # No, we didn't have a +, was it a '-'? { match [:pm:] /[-]/ alter (:classifier_reason:) /** REJECT: CRM114 Priority Blacklisted by: :*:reason: **:*:_nl:/ alter (:reject_address:) /:*:fail_priority_mail_to:/ { output [:*:fileprefix:rejected_by_blacklist.txt] /:*:_dw:/ } alter (:stats:) /Match to priority pattern :*:pat:\n Forced pR: -999.99 / match [:stats:] (:: :pr:) /pR: ([[:graph:]]+)/ goto /:looks_spam:/ } } # Nope, didn't match as a priority... grab the next regex until # there are no prio-list regexes left liaf } ####################################################################### # # No joy on the priolist. Use the Classifier. # { isolate (:stats:) // classify [:in_text:] <:*:clf:> /:*:lcr:/ \ (:*:fileprefix:nonspam.css :*:fileprefix:spam.css) (:stats:) match [:stats:] \ /^#0.* pR: ([-. 
0-9]+)/ (:: :pr:) { # Is this a stats_only run? # match [:stats_only:] /SET/ output /:*:pr:/ exit /:*:accepted_mail_return_code:/ } # # It's a pure classify. Three possibilities: { # Case 1 - It's spam. eval /:@: :*:pr: <= :*:spam_threshold: :/ goto /:looks_spam:/ } { # Case 2 - It's good. eval /:@: :*:pr: >= :*:good_threshold: :/ goto /:looks_good:/ } { # Case 3 - Unsure goto /:looks_unsure:/ } } ################################################################## # # Final Dispatch - one of :looks_spam:, :looks_good:, or :looks_unsure: # will be gone-to. # ################################################################# :looks_spam: { # Do we log rejections somewhere? { match [:log_rejections_to_file:] /./ output [:*:fileprefix::*:log_rejections_to_file:] /:*:mail_separator:/ output [:*:fileprefix::*:log_rejections_to_file:] /:*:_dw:/ } # Do we put prob_spams into the prob_spam directory? { match [:text_cache:] /./ match [:cacheid:] /./ syscall /:*:cache_dupe_command: :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/prob_spam\/:*:cacheid: / } # flag the subject line { match [:spam_flag_subject_string:] /./ call /:mungmail_mung_subject:/ [:*:spam_flag_subject_string:] } isolate (:subj_text:) alter (:our_exit_code:) /:*:rejected_mail_exit_code:/ alter (:reject_address:) /:*:fail_classify_mail_to:/ { match [:add_headers:] /yes/ call /:mungmail_unique:/ \ [X-CRM114-Version: :*:_crm_version: MR-:*:_pgm_hash: ] call /:mungmail_add_cache_info:/ [:*:cacheid:] call /:mungmail_unique:/ [X-CRM114-Status: SPAM ( :*:pr: )] call /:mungmail_delete:/ [X-CRM114-Notice: ] } # # Since sending mail needs complicated args, we do it here rather # than in a mungmail routine - and if we send mail, we exit here # rather than in the usual finish-up routine. { match [:reject_address:] /.../ syscall (:*:_dw:) /mail :*:reject_address: -s ' :*:subj_text: '/ exit /:*:our_exit_code:/ } } goto /:finish_up:/ :looks_good: { # Do we put prob_good mail into the prob_good directory? 
{ match [:text_cache:] /./ match [:cacheid:] /./ syscall /:*:cache_dupe_command: :*:text_cache:\/texts\/:*:cacheid: :*:text_cache:\/prob_good\/:*:cacheid: / } alter (:our_exit_code:) /:*:accepted_mail_exit_code:/ { match [:good_flag_subject_string:] /./ call /:mungmail_mung_subject:/ [:*:good_flag_subject_string:] } { match [:add_headers:] /yes/ call /:mungmail_delete:/ [X-CRM114-Notice: ] call /:mungmail_unique:/ \ [X-CRM114-Version: :*:_crm_version: MR-:*:_pgm_hash: ] call /:mungmail_add_cache_info:/ [:*:cacheid:] call /:mungmail_unique:/ [X-CRM114-Status: GOOD ( :*:pr: )] } } goto /:finish_up:/ :looks_unsure: { alter (:our_exit_code:) /:*:unsure_mail_exit_code:/ { match [:unsure_flag_subject_string:] /./ call /:mungmail_mung_subject:/ [:*:unsure_flag_subject_string:] } { match [:add_headers:] /yes/ call /:mungmail_unique:/ \ [X-CRM114-Version: :*:_crm_version: MR-:*:_pgm_hash: ] call /:mungmail_add_cache_info:/ [:*:cacheid:] call /:mungmail_unique:/ [X-CRM114-Status: UNSURE ( :*:pr: )] call /:mungmail_unique:/ [X-CRM114-Notice: Please train this message. ] } } goto /:finish_up:/ ############################################################### # # Finish up - common exit routine :finish_up: { { # If :report_only: is SET then delete everything that's # not a X-CRM114 header match [:report_only:] /SET/ match [:_dw:] // { match [:_dw:] /.*?\n/ (:z:) { match [:z:] /^X-CRM114/ # output /Deleting :*:z:\n/ alter (:z:) // } liaf } } accept } exit /:*:our_exit_code:/ ################################################################ # # Catch failures. trap (:broken_program_message:) /.*/ { accept output /:*:_nl: Aw, crud. mailreaver.crm broke. Here's the error: :*:\_nl:/ output /:*:broken_program_message:/ output [stderr] /:*:_nl: ERROR: mailreaver.crm broke. 
Here's the error\: :*:_nl:/ output [stderr] /ERROR: :*:broken_program_message:/ } exit /:*:program_fault_exit_code:/ ######################################################################3 # # Library insertions start here. insert maillib.crm crm114-20100106-BlameMichelson.src/crm_main.c0000644000000000017500000006040411321154266016732 0ustar rootwsy// crm_main.c - main interface // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // include the routine declarations file #include "crm114.h" // and include OSBF declarations #include "crm114_osbf.h" // the command line argv char **prog_argv; // the auxilliary input buffer (for WINDOW input) char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. char *inbuf; char *outbuf; char *tempbuf; int main (int argc, char **argv) { int i; // some random counters, when we need a loop int status; int openbracket; // if there's a command-line program... int openparen = -1; // if there's a list of acceptable arguments int user_cmd_line_vars = 0; // did the user specify --vars on cmdline? // printf (" args: %d \n", argc); // for (i = 0; i < argc; i++) // fprintf (stderr, " argi: %d, argv: %s \n", i, argv[i]); // copy argv into global statics... 
prog_argv = argv; vht_size = DEFAULT_VHT_SIZE; cstk_limit = DEFAULT_CSTK_LIMIT; max_pgmlines = DEFAULT_MAX_PGMLINES; max_pgmsize = DEFAULT_MAX_PGMLINES * 128; data_window_size = DEFAULT_DATA_WINDOW; user_trace = DEFAULT_USER_TRACE_LEVEL; internal_trace = DEFAULT_INTERNAL_TRACE_LEVEL; sparse_spectrum_file_length = 0; microgroom_chain_length = 0; microgroom_stop_after = 0; min_pmax_pmin_ratio = OSBF_MIN_PMAX_PMIN_RATIO; ignore_environment_vars = 0; debug_countdown = -1; cycle_counter = 0; cmdline_break = -1; profile_execution = 0; prettyprint_listing = 0; engine_exit_base = 0; q_expansion_mode = 0; // allcate and initialize the initial root csl (control stack // level) cell. We do this first, before command-line parsing, // because the command line parse fills in a lot of the first level csl. csl = (CSL_CELL *) malloc (sizeof (CSL_CELL)); if (!csl) untrappableerror5 ("Couldn't malloc the csl. Big problem!\n", "", CRM_ENGINE_HERE); csl -> filename = NULL; csl -> filedes = -1; csl -> rdwr = 0; // 0 means readonly, 1 means read/write csl -> nchars = 0; csl -> mct = 0; csl -> cstmt = 0; csl -> nstmts = 0; csl -> preload_window = 1; csl -> caller = NULL; csl -> calldepth = 0; csl -> aliusstk[0] = 0; // this gets initted later. openbracket = -1; openparen = -1; // and allocate the argparse block apb = (ARGPARSE_BLOCK *) malloc (sizeof (ARGPARSE_BLOCK)); if (!apb) untrappableerror5 ("Couldn't malloc apb. This is very bad.\n", "", CRM_ENGINE_HERE); // Parse the input command arguments // user_trace = 1; //internal_trace = 1; for (i = 1; i < argc; i++) { // fprintf (stderr, "Arg %d = '%s' \n", i, argv[i]); // is this a plea for help? if ( (strncmp (argv[i], "-?", 2) == 0) || (strncmp (argv[i], "-h", 2) == 0) || (argc == 1) ) { fprintf (stderr, " CRM114 version %s (regex engine: %s)\n ", VERSION, crm_regversion()); fprintf (stderr, " Copyright 2001-2009 William S. 
Yerazunis\n"); fprintf (stderr, " This software is licensed under the GPL " "with ABSOLUTELY NO WARRANTY\n"); fprintf (stderr, " For language help, RTFRM. \n"); fprintf (stderr, " Command Line Options:\n"); fprintf (stderr, " -{statements} executes statements\n"); fprintf (stderr, " -b nn sets a breakpoint on stmt nn\n"); fprintf (stderr, " -d nn run nn statements, then drop to debug\n"); fprintf (stderr, " -e ignore environment variables\n"); fprintf (stderr, " -E set base for engine exit values\n"); fprintf (stderr, " -h this help\n"); fprintf (stderr, " -l n listing (detail level 1 through 5)\n"); fprintf (stderr, " -m nn max number of microgroomed buckets in a chain\n"); fprintf (stderr, " -M nn max chain length - triggers microgrooming if enabled\n"); fprintf (stderr, " -p profile statement times \n"); fprintf (stderr, " -P nn max program lines @ 128 chars/line\n"); fprintf (stderr, " -q m mathmode (0,1 alg/RPN in EVAL," "2,3 alg/RPN everywhere)\n"); fprintf (stderr, " -r nn set OSBF min pmax/pmin ratio (default=9)\n"); fprintf (stderr, " -s nn sparse spectra (.css) featureslots \n"); fprintf (stderr, " -S nn round up to 2^N+1 .css featureslots \n"); fprintf (stderr, " -C use env. 
locale (default POSIX)\n"); fprintf (stderr, " -t user trace mode on\n"); fprintf (stderr, " -T implementors trace mode on\n"); fprintf (stderr, " -u dir chdir to directory before starting\n"); fprintf (stderr, " -v print version ID and exit \n"); fprintf (stderr, " -w nn max data window size ( bytes ) \n"); fprintf (stderr, " -- end of CRM114 flags; start of user args\n"); fprintf (stderr, " --foo creates var :foo: with value 'SET'\n"); fprintf (stderr, " --x=y creates var :x: with value 'y'\n"); if (openparen > 0) { fprintf (stderr, "\n This program also claims to accept these command line args:" ); fprintf (stderr, "\n %s\n", &argv[openparen][1] ); }; if (engine_exit_base != 0) { exit (engine_exit_base + 14); } else exit (EXIT_SUCCESS); } // -- means "end of crm114 flags" - remainder of args goes to // the program alone. if (strncmp (argv[i], "--", 2) == 0 && strlen (argv[i]) == 2) { if (user_trace > 0) fprintf (stderr, "system flag processing ended at arg %d .\n", i); i = argc; goto end_command_line_parse_loop; } if (strncmp (argv[i], "--", 2) == 0 && strlen (argv[i]) > 2) { if (user_trace > 0) fprintf (stderr, "Commandline set of user variable at %d '%s'.\n", i, argv[i]); if (user_cmd_line_vars == 0) user_cmd_line_vars = i; goto end_command_line_parse_loop; }; // set debug levels if (strncmp (argv[i], "-t", 2) == 0 && strlen(argv[i]) == 2) { user_trace++; if (user_trace > 0) { fprintf (stderr, "Setting usertrace level to %ld\n", user_trace); }; goto end_command_line_parse_loop; }; if (strncmp (argv[i], "-T", 2) == 0 && strlen(argv[i]) == 2) { internal_trace++; if (user_trace > 0 ) fprintf (stderr, "Setting internaltrace to %ld\n", internal_trace); goto end_command_line_parse_loop; }; if (strncmp (argv[i], "-p", 2) == 0 && strlen(argv[i]) == 2) { profile_execution = 1; if (user_trace > 0 ) fprintf (stderr, "Setting profile_execution to 1" ); goto end_command_line_parse_loop; }; // is this a change to the maximum number of program lines? 
if (strncmp (argv[i], "-P", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc) { sscanf (argv[i], "%ld", &max_pgmlines); max_pgmsize = 128 * max_pgmlines; } if (user_trace > 0) fprintf (stderr, "Setting max prog lines to %ld (%ld bytes)\n", max_pgmlines, max_pgmsize); goto end_command_line_parse_loop; }; // is this a "gimme a listing" flag? if (strncmp (argv[i], "-l", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc) { sscanf (argv[i], "%ld", &prettyprint_listing); } if (user_trace > 0) fprintf (stderr, "Setting listing level to %ld \n", prettyprint_listing); goto end_command_line_parse_loop; }; // is this a "Use Local Country Code" flag? if (strncmp (argv[i], "-C", 2) == 0 && strlen(argv[i]) == 2) { if (user_trace > 0) fprintf (stderr, "Setting locale to local\n"); setlocale (LC_ALL, ""); goto end_command_line_parse_loop; }; // is this a change to the math mode (0,1 for alg/RPN but only in EVAL, // 2,3 for alg/RPN everywhere. if (strncmp (argv[i], "-q", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc) sscanf (argv[i], "%ld", &q_expansion_mode); if (user_trace > 0) { fprintf (stderr, "Setting math mode to %ld ", q_expansion_mode); if (q_expansion_mode == 0) fprintf (stderr, "(algebraic, only in EVAL\n"); if (q_expansion_mode == 1) fprintf (stderr, "(RPN, only in EVAL\n"); if (q_expansion_mode == 2) fprintf (stderr, "(algebraic, in all expressions)\n"); if (q_expansion_mode == 3) fprintf (stderr, "(RPN, in all expressions)\n"); }; goto end_command_line_parse_loop; }; // change the size of the maximum data window we'll allow if (strncmp (argv[i], "-w", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc) sscanf (argv[i], "%ld", &data_window_size); if (data_window_size < 8192) { fprintf (stderr, "Sorry, but the min data window is 8192 bytes"); data_window_size = 8192; }; if (user_trace > 0) fprintf (stderr, "Setting max data window to %ld chars\n", 
data_window_size); goto end_command_line_parse_loop; }; // change the size of the sparse spectrum file default. if (strncasecmp (argv[i], "-s", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc && sscanf (argv[i], "%ld", &sparse_spectrum_file_length)) { if (strcmp (argv[i-1], "-S") == 0) { long k; k=(long) floor(log10(sparse_spectrum_file_length-1) / log10(2)); while ( (2< 0) fprintf (stderr, "Setting sparse spectrum length to %ld bins\n", sparse_spectrum_file_length ); goto end_command_line_parse_loop; }; // set a break from the command line if (strncmp (argv[i], "-b", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc) sscanf (argv[i], "%ld", &cmdline_break); if (user_trace > 0) fprintf (stderr, "Setting the command-line break to line %ld\n", cmdline_break); goto end_command_line_parse_loop; }; // set base value for detailed engine exit values if (strncmp (argv[i], "-E", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc) sscanf (argv[i], "%ld", &engine_exit_base); if (user_trace > 0) fprintf (stderr, "Setting the engine exit base value to %ld\n", engine_exit_base); goto end_command_line_parse_loop; }; // set countdown cycles before dropping to debugger if (strncmp (argv[i], "-d", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg debug_countdown = 0; if (i < argc) sscanf (argv[i], "%ld", &debug_countdown); if (user_trace > 0) fprintf (stderr, "Setting debug countdown to %ld statements\n", debug_countdown); if (debug_countdown == 0) // if next arg wasn't numeric, back up i-- ; goto end_command_line_parse_loop; }; // ignore environment variables? if (strncmp (argv[i], "-e", 2) == 0 && strlen(argv[i]) == 2) { ignore_environment_vars++; if (user_trace > 0) fprintf (stderr, "Ignoring environment variables\n"); goto end_command_line_parse_loop; }; // is this to set the cwd? 
if (strncmp (argv[i], "-u", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (user_trace) fprintf (stderr, "Setting WD to %s\n",argv[i]); if ( i >= argc ) { fprintf (stderr, "The -u working-directory change needs an arg"); goto end_command_line_parse_loop; }; if ( chdir(argv[i] )) { fprintf (stderr, "Sorry, couldn't chdir to %s \n", argv[i]); }; goto end_command_line_parse_loop; }; if (strncmp (argv[i], "-v", 2) == 0 && strlen(argv[i]) == 2) { // NOTE - version info goes to stdout, not stderr, just like GCC does fprintf (stdout, " This is CRM114, version %s (%s)\n", VERSION, crm_regversion()); fprintf (stdout, " Copyright 2001-2009 William S. Yerazunis\n"); fprintf (stdout, " This software is licensed under the GPL with ABSOLUTELY NO WARRANTY\n"); if (engine_exit_base != 0) { exit (engine_exit_base + 16); } else exit( EXIT_SUCCESS ); }; if (strncmp (argv[i], "-{", 2) == 0) // don't care about the "}" { if (user_trace) fprintf (stderr, "Command line program at arg %d\n", i); openbracket = i; goto end_command_line_parse_loop; }; // // What about -( var var var ) cmdline var restrictions? if (strncmp (argv[i], "-(", 2) == 0 ) { if (user_trace) fprintf (stderr, "Allowed command line arg list at arg %d\n", i); openparen = i; // // If there's a -- at the end of the arg, lock out system // flags as though we hit a '--' flag. // (i.e. no debugger. Minimal security. No doubt this is // circumventable by a sufficiently skilled user, but // at least it's a start.) 
if ( strncmp ("--", &argv[i][strlen(argv[i])-2], 2) == 0) { if (user_trace) fprintf (stderr, "cmdline arglist also locks out sysflags.\n"); i = argc; }; goto end_command_line_parse_loop; }; // set microgroom_stop_after if (strncmp (argv[i], "-m", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc) sscanf (argv[i], "%ld", µgroom_stop_after); if (user_trace > 0) fprintf (stderr, "Setting microgroom_stop_after to %ld\n", microgroom_stop_after); if (microgroom_stop_after <= 0) // if value <= 0 set it to default microgroom_stop_after = MICROGROOM_STOP_AFTER; goto end_command_line_parse_loop; }; // set microgroom_chain_length length if (strncmp (argv[i], "-M", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc) sscanf (argv[i], "%ld", µgroom_chain_length); if (user_trace > 0) fprintf (stderr, "Setting microgroom_chain_length to %ld\n", microgroom_chain_length); if (microgroom_chain_length < 5) // if value <= 5 set it to default microgroom_chain_length = MICROGROOM_CHAIN_LENGTH; goto end_command_line_parse_loop; }; // set min_pmax_pmin_ratio if (strncmp (argv[i], "-r", 2) == 0 && strlen(argv[i]) == 2) { i++; // move to the next arg if (i < argc) sscanf (argv[i], "%f", &min_pmax_pmin_ratio); if (user_trace > 0) fprintf (stderr, "Setting min pmax/pmin of a feature to %f\n", min_pmax_pmin_ratio); if (min_pmax_pmin_ratio < 0) // if value < 0 set it to 0 min_pmax_pmin_ratio = OSBF_MIN_PMAX_PMIN_RATIO ; goto end_command_line_parse_loop; }; // that's all of the flags. Anything left must be // the name of the file we want to use as a program // BOGOSITUDE - only the FIRST such thing is the name of the // file we want to use as a program. 
The rest of the args // should just be passed along if (csl->filename == NULL) { if (strlen(argv[i]) > MAX_FILE_NAME_LEN) untrappableerror5 ("Invalid filename, ", "filename too long.", CRM_ENGINE_HERE); csl->filename = argv[i]; if (user_trace > 0) fprintf (stderr, "Using program file %s\n", csl->filename); }; end_command_line_parse_loop: if (internal_trace) fprintf (stderr, "End of pass %d through cmdline parse loop\n", i); } // main2 (); // // Did we get a program filename? If not, look for one. // At this point, accept any arg that doesn't start with a - sign // if (csl->filename == NULL && openbracket < 1) { if (internal_trace) fprintf (stderr, "Looking for _some_ program to run...\n"); for (i = 1; i < argc; i++) if (argv[i][0] != '-') { if (strlen(argv[i]) > MAX_FILE_NAME_LEN) untrappableerror5 ("Couldn't open the file, ", "filename too long.", CRM_ENGINE_HERE); csl->filename = argv[i]; i = argc; } if (user_trace > 0) fprintf (stderr, "Using program file %s\n", csl->filename); }; // If we still don't have a program, we're done. Squalk an // error. if (csl->filename == NULL && openbracket < 0) { fprintf (stderr, "\nCan't find a file to run," "or a command-line to execute. \n" "I give up... (exiting)\n"); if (engine_exit_base != 0) { exit (engine_exit_base + 17); } else exit (EXIT_SUCCESS); }; // open, stat and load the program file if (openbracket < 0 ) { { if (argc <= 1) { fprintf (stderr, "CRM114 version %s \n", VERSION); fprintf (stderr, "Try 'crm ', or 'crm -h' for help\n"); if (engine_exit_base != 0) { exit (engine_exit_base + 18); } else exit (EXIT_SUCCESS); } else { if (user_trace) fprintf (stderr, "Loading program from file %s\n", csl->filename); crm_load_csl (csl); }; }; } else { // if we got here, then it's a command-line program, and // we should just assemble the proggie from the argv [openbracket] if (strlen (&(argv[openbracket][1])) + 2048 > max_pgmsize) untrappableerror5 ("The command line program is too big. 
\n", "Try increasing the max program size with -P. \n", CRM_ENGINE_HERE); csl->filename = "(from command line)"; csl->filetext = (char *) malloc (sizeof (char) * max_pgmsize); if (!csl->filetext) untrappableerror5 ("Couldn't malloc csl->filetext space (where I was going to put your program.\nWithout program space, we can't run. Sorry.", "", CRM_ENGINE_HERE); strcpy (csl->filetext, "\n"); // the [1] below gets rid of the leading - sign strcat (csl->filetext, &(argv[openbracket][1])); strcat (csl->filetext, "\n"); strcat (csl->filetext, "\n"); csl->nchars = strlen (csl->filetext); csl->hash = strnhash (csl->filetext, csl->nchars); if (user_trace) fprintf (stderr, "Hash of program: %X, length is %ld bytes\n", csl->hash, csl->nchars); }; // We get another csl-like data structure, // which we'll call the cdw, which has all the fields we need, and // simply allocate the data window of "adequate size" and read // stuff in on stdin. cdw = malloc (sizeof (CSL_CELL)); if (!cdw) untrappableerror5 ("Couldn't malloc cdw.\nThis is very bad.","", CRM_ENGINE_HERE); cdw->filename = NULL; cdw->rdwr = 1; cdw->filedes = -1; cdw->filetext = malloc (sizeof (char) * data_window_size); if (!cdw->filetext) untrappableerror5 ("Couldn't malloc cdw->filetext.\nWithout this space, you have no place for data. Thus, we cannot run.","", CRM_ENGINE_HERE); // also allocate storage for the windowed data input newinputbuf = malloc (sizeof (char) * data_window_size); // and our three big work buffers - these are used ONLY inside // of a single statement's execution and do NOT ever contain state // that has to exist across statements. inbuf = malloc (sizeof (char) * data_window_size); outbuf = malloc (sizeof (char) * data_window_size); tempbuf = malloc (sizeof (char) * data_window_size); if (!tempbuf || !outbuf || !inbuf || !newinputbuf) untrappableerror5 ( "Couldn't malloc one or more of" "newinputbuf,inbuf,outbuf,tempbuf.\n" "These are all necessary for operation." 
"We can't run.","", CRM_ENGINE_HERE); // Initialize the VHT, add in a few predefined variables // crm_vht_init(argc, argv); // Call the pre-processor on the program // status = crm_preprocessor (csl, 0); // Now, call the microcompiler on the program file. status = crm_microcompiler ( csl, vht); // Great - program file is now mapped via csl->mct // Put a copy of the preprocessor-result text into // the isolated variable ":_pgm_text:" crm_set_temp_var (":_pgm_text:", csl->filetext); // If the windowflag == 0, we should preload the data window. Now, // let's get some data in. // and preload the data window with stdin until we hit EOF i = 0; if (csl->preload_window) { // GROT GROT GROT This is slow // //while (!feof (stdin) && i < data_window_size - 1) // { // cdw->filetext[i] = fgetc (stdin); // i++; // }; //i-- ; // get rid of the extra ++ on i from the loop; this is the // EOF "character" which prints like an umlauted-Y. // // // This is the much faster way. // // i = fread (cdw->filetext, 1, data_window_size -1, stdin); // // JesusFreke suggests this instead- retry with successively // smaller readsizes on systems that can't handle full // POSIX-style massive block transfers. int readsize = data_window_size - 1; while (! feof (stdin) && i < data_window_size - 1) { //i += fread (cdw->filetext + i, 1, readsize-1, stdin); int rs; rs = i + readsize < data_window_size - 1 ? readsize : data_window_size - i - 1; i+= fread (cdw->filetext + i, 1, rs, stdin); if (feof (stdin)) { break; }; if (ferror (stdin)) { if (errno == ENOMEM && readsize > 1) // insufficient memory? { readsize = readsize / 2; // try a smaller block clearerr (stdin); } else { fprintf (stderr, "Error while trying to get startup input. " "This is usually pretty much hopeless, but " "I'll try to keep running anyway. "); break; }; }; }; }; // data window is now preloaded (we hope), set the cdwo up. 
cdw->filetext[i] = '\000'; cdw->nchars = i; cdw->hash = strnhash (cdw->filetext, cdw->nchars); cdw->mct = NULL; cdw->nstmts = -1; cdw->cstmt = -1; cdw->caller = NULL; // and put the initial data window suck-in contents into the vht // with the special name :_dw: // // GROT GROT GROT will have to change this when we get rid of separate // areas for the data window and the temporary area. In particular, the // "start" will no longer be zero. Note to self: get rid of this comment // when it gets fixed. Second note to self - since most of the insert // and delete action happens in :_dw:, for efficiency reasons perhaps // we don't want to merge these areas. // { long dwname; long dwlen; tdw->filetext[tdw->nchars] = '\n'; tdw->nchars++; dwlen = strlen (":_dw:"); dwname = tdw->nchars; //strcat (tdw->filetext, ":_dw:"); memmove (&tdw->filetext[dwname], ":_dw:", dwlen); tdw->nchars = tdw->nchars + dwlen; // strcat (tdw->filetext, "\n"); memmove (&tdw->filetext[tdw->nchars], "\n", strlen ("\n")); tdw->nchars++; crm_setvar ( NULL, 0, tdw->filetext, dwname, dwlen, cdw->filetext, 0, cdw->nchars, -1, 0); }; // // We also set up the :_iso: to hold the isolated variables. // Note that we must specifically NOT use this var during reclamation // or GCing the isolated var storage area. // // HACK ALERT HACK ALERT - note that :_iso: starts out with a zero // length and must be updated // #define USE_COLON_ISO_COLON #ifdef USE_COLON_ISO_COLON { long isoname; long isolen; isolen = strlen (":_iso:"); isoname = tdw->nchars; //strcat (tdw->filetext, ":_dw:"); memmove (&tdw->filetext[isoname], ":_iso:", isolen); tdw->nchars = tdw->nchars + isolen; // strcat (tdw->filetext, "\n"); memmove (&tdw->filetext[tdw->nchars], "\n", strlen ("\n")); tdw->nchars++; crm_setvar ( NULL, 0, tdw->filetext, isoname, isolen, tdw->filetext, 0, 0, -1, 0); }; #endif // USE_COLON_ISO_COLON // Now we're here, we can actually run! 
// set up to start at the 0'th statement (the start) csl->cstmt = 0; status = crm_invoke (); // This is the *real* exit from the engine, so we do not override // the engine's exit status with an engine_exit_base value. exit ( (char) status); } crm114-20100106-BlameMichelson.src/paolo_ov5.crm0000755000000000017500000000133011321154266017403 0ustar rootwsy#! /usr/bin/crm # # paolo_ov5.crm - paolo written test script # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # window alter (:_dw:) /HHHH\nC-T: mpt; by="--0123"\nhhhh\n\nMIME.\n----0123\nC-T: txt\nC-T-E: 7\n\nTTTT\n\n----0123\nC-T: img\n name="clonic.GIF"\nC-T-E: b64\niiii\n\noooo\noooo\n\n----0123--\n\n\n/ { match (:: :headers: :body:) /(.*?)\n\n(.*)/ } isolate (:headers:) { match [:headers:] /^C-T: .* by="(.+)"/ \ (:x: :boundary:) output /:boundary:=:*:boundary:\n/ } isolate (:c:) /0/ { eval (:c:) /:@::*:c:+1:/ match [:body:] (:x: :headers:) /\n--:*:boundary:\n(.+?)\n\n/ output /:c:=:*:c:\n:*:headers:\n\n/ liaf } crm114-20100106-BlameMichelson.src/randomiotest.crm0000755000000000017500000000330511321154266020214 0ustar rootwsy#! /usr/bin/crm # # reandomiotest.crm - random I/O test # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # # Random I/O test - test lseeking I/O window # first test simple input and output test syscall /rm -f randtst.txt/ isolate (:z:) output [randtst.txt] /The quick brown fox jumped over the lazy dog's back/ input [randtst.txt] (:z:) output /Original: :*:z:\n/ # Redo it with append, to be sure append works with no prior file. 
syscall /rm -f randtst.txt/ isolate (:z:) output [randtst.txt] / alpha bravo charlie / input [randtst.txt] (:z:) output /delete file then append: :*:z:\n/ # and again, no append, to see if that does a delete of old contents isolate (:z:) output [randtst.txt] /The quick brown fox jumped over the lazy dog's back/ input [randtst.txt] (:z:) output /prior file, no append: :*:z:\n/ # and yet again, with append, to see that append does work. isolate (:z:) output [randtst.txt] / alpha bravo charlie / input [randtst.txt] (:z:) output /prior file, and append: :*:z:\n/ # Now, grab a few words out of the middle, say, # starting at 10, for 15 chars isolate (:x:) input [randtst.txt 10 15] (:x:) output /Grab chars 10 thru 25: :*:x:\n/ # And overlap, but with some math thrown in input [randtst.txt :@:10+7: :@:7+5:] (:x:) output /Grab chars 17 and length 12: :*:x:\n/ # And overlap a write in the middle, and then a read, output [randtst.txt 16 3] /cat/ input [randtst.txt 5 30] (:x:) output /Now, change the fox to cat: :*:x:\n/ # And show it really got written: input [ randtst.txt ] (:z:) output /Final check- did it really get written? :*:z:\n/ syscall /rm -f randtst.txt/ crm114-20100106-BlameMichelson.src/mailfilter.cf0000644000000000017500000004331611321154266017446 0ustar rootwsy# mailfilter.cf -- Config file for mailfilter, mailreaver, mailtrainer # # You MUST edit the fileds for "Secret Password", "mime decoder", and # "cache_dupe_command". Just those THREE things. # # Changes to all other values are optional. # # Many of the options here have two or three alternatives; for your # convenience, we have put all of the reasonable alternatives # on sequential lines. Uncomment the one you want, and leave the # others commented out. If you leave more than one uncommented, the # last one is the one that's used. Don't do that; it's ugly. # # After you edit this file, don't forget to edit 'rewrites.mfp' # --------->>> You MUST set the following correctly! 
<<<------- # # If you leave it as "DEFAULT-PASSWORD", you will not be able to # access the mail-to-myself commanding system, as "DEFAULT-PASSWORD" # is specifically _disabled_ as a legal password. Just pick something, eh? # :spw: /DEFAULT_PASSWORD/ # ----- If you want a verbose startup, turn this on. Note that this is # ----- intentionally _after_ the password is set, so a verbose startup # ----- will not reveal your password. # #:verbose_startup: /SET/ :verbose_startup: // # # --------->>> You MUST set the following correctly! <<<------- # # --- Some mail systems do mime decoding with "mimencode -d" or "-u". # --- Others (such as Red Hat 8.0) use "mewdecode" . # --- Yet others (such as Fedora Core 3) use "openssl base64 -d" . # --- Yet Others (i.e. *BSDs) can use "base64" . # --- See which one is on your system and use that one- comment # --- the others out. If you can't figure out what your base64 mime # --- decoder is, or don't want mime decoding, set :do_base64: to /no/ # --- but expect a significant accuracy decrease if you do this. # #:do_base64: /no/ :do_base64: /yes/ # #:mime_decoder: /mewdecode/ #:mime_decoder: /mimencode -d/ #:mime_decoder: /mimencode -u/ #:mime_decoder: /base64 -d/ :mime_decoder: /openssl base64 -d/ #:mime_decoder: /normalizemime/ # --------->>> You MUST set the following correctly! <<<------- # # --- Linux (and Unix) systems use "hardlinks" to make a file # --- appear in more than one place, while not actually using up # --- extra disk space. Sadly, it is the case that most # --- Windows systems have no such feature. So, you must set the # --- following for what kind of system you are actually using. # -- Note to other developers: here's where to put other system-dependent # -- syscall commands. # # --- Use the default /ln/ for LINUX and UNIX systems (does a hard-link, # --- does not use up disk space or inodes). 
Change this to the /copy/ # --- command for WINDOWS systems (95, 98, NT, XP) # # --- Mild security issue: to avoid a theoretical exploit where a user # --- gets their commands re-aliased, make sure you use the fully qualified # --- commandname (that is, starting in the root directory). # :cache_dupe_command: /\/bin\/ln/ #:cache_dupe_command: /copy/ ########################################################################### # # END of things you absolutely MUST set. Feel free # to keep reading though... # ########################################################################### ########################################################################### # # START of things you might likely want to set. These # are probably OK for you, but many users change these things. # ########################################################################## # ----------- define an optional target for where to send spam (that is, # ------------ emails that we want to "fail", or reject to another # ------------ address. Note that this is NOT a "program fault" address, # ------------ but where to send "bad" email to in the general case. # ------------ You can specify tightly controlled conditions too, # ------------ (see the next stanza) # ----------- To NOT forward this to another account, just leave the # ----------- address as the empty string, which is '//'. # ----------- This works fine especially if your mail reader program # ----------- can sort based on the ADV and UNS (or whatever you choose # ----------- to use as flagging strings) in the "Subject:" field. # ------- CAUTION- some systems are buggy and _REQUIRE_ a user@host.domain # ----- in the following to forward spammy mail correctly. WTF??? :-( # #:general_fails_to: /somebody@somewhere.net/ :general_fails_to: // # -------- If you would prefer to send specific kinds of spam to # -------- different mailboxes, here's where to do it. # -------- (be sure to uncomment the line!). 
Again, these are # --------- not "program fault" conditions, just different filter results. # # :fail_priority_mail_to: /where_priority_fails_go/ # :fail_blacklist_mail_to: /where_blacklist_fails_go/ # :fail_SSM_mail_to: /where_Classifier_fails_go_for_mailFILTER/ # :fail_classify_mail_to: /where_classifier_fails_go_for_mailREAVER/ # --------- Do we give nonspam, spam, and unsure an exitcode of 0 # --------- (for most standalone apps) or something else? # --------- Usually we use an exit code of 1 for "program fault", # --------- but change it if you need to use 0/1 for good/spam # --------- Don't use an exit code greater than 128 (it breaks BASH!) # --------- If you use exit codes (procmail doesn't) change it here. :rejected_mail_exit_code: /0/ :accepted_mail_exit_code: /0/ :unsure_mail_exit_code: /0/ :program_fault_exit_code: /1/ ####################################################################### # # END of things you are likely to want to change. # # Anything following is starting to approach true customization. # Feel free to explore and poke around. ###################################################################### # -----------Do we want to add the optional headers to the mail? # -----------If turned on, will add X-CRM114-Whatever: headers on each # -----------incoming email. (note- this does NOT turn off the cache-id header # :add_headers: /yes/ #:add_headers: /no/ # --------- do we add the statistics report? :add_verbose_stats: /yes/ #:add_verbose_stats: /no/ # --------- do we add the mailtrainer report to the top of the message body # --------- after training? :add_mailtrainer_report: /yes/ #:add_mailtrainer_report: /no/ # --------- Do we enable long-form explains (with lots of text)? # -- you can have no extra stuff, add it as text, or add it as an attachment. 
# -- (only available in mailfilter, not mailreaver) # :add_extra_stuff: /no/ # :add_extra_stuff: /text/ # :add_extra_stuff: /attachment/ # --------- Do we want to insert a "flagging" string on the subject line, # --------- perhaps to insert an 'ADV:' ? Whatever string we put here # --------- will be inserted at the front of the subject if we think the # --------- mail is spam. # # :spam_flag_subject_string: // :spam_flag_subject_string: /ADV:/ # --------- Do we want to insert a "flagging" string on the subject line # --------- for good email? Usually we don't.... so we set this to the # --------- null string - that is, // :good_flag_subject_string: // # ------------Similarly, do we want to insert a "flagging" string on # -------------the subject line of an "unsure" email? This way we know # --------------we need to train it even if "headers" is turned off. # :unsure_flag_subject_string: // :unsure_flag_subject_string: /UNS:/ # ------------- Do we want Training ConFirmation flags on the results of # ------------- a message to be learned? Default is "TCF:". :confirm_flag_subject_string: /TCF:/ #:confirm_flag_subject_string: // # --------- Do we want to do any "rewrites" to increase generality and # ---------- (usually) accuracy? IF 'yes', be sure to edit rewrites.mfp! # --------- NOTE: this option is somewhat slow. If your mailserver is # --------- maxed out on CPU, you might want to turn this off. # :rewrites_enabled: /yes/ #:rewrites_enabled: /no/ # --------- Do we copy incoming text into allmail.txt ? default is yes, but # --------- experienced users will probably set this to 'no' after testing # --------- their configuration for functionality. # :log_to_allmail.txt: /yes/ # :log_to_allmail.txt: /no/ # ------- Another logging option - log all mail to somewhere else # ------- entirely. Whatever pathname is given here will be prefixed # ------- by :fileprefix: # ------- To not use this, set it to the null string .. 
// # ------- Remember to backslash-escape path slashes! :log_all_mail_to_file: // #:log_all_mail_to_file: /my_personal_mail_log_file_name.txt/ # --------- When we log messages, should we use a mail separator? # --------- And, if so, what? :mail_separator: /\n-=-=-=-=-=-=- cut here -=-=-=-=-=-=-\n/ # # ---------- Message Cacheing for retraining - do we keep a cache of # ---------- messages we've classified recently "in the wild" as retrain # ---------- texts? This uses up some disk space, but means that we can # ---------- use mailtrainer.crm on these messages to autotune the classifier. # ---------- Default is to cache into the directory reaver_cache ; # ---------- if you don't want this, set it to // . If you don't use this, # ---------- you can't really use mailtrainer.crm, and you must keep your # ---------- headers scrupulously clean in all train messages. Recommended # ---------- to leave this unchanged unless you are VERY short of disk. # :text_cache: /reaver_cache/ # :text_cache: // # ----- How do we invoke the trainer (as in, just the invocation # ------ of CRM114 on mailtrainer.crm. Usually this is just obvious, # ------- but if you don't have CRM114 installed in the search path, here's # -------- where you can set trainer invocation to be via whatever path # --------- you want it (the second example is if you haven't installed # ---------- CRM114 at all, but are running the crm114_tre static binary # ----------- right out of the local directory.) # # -- use this next one if you have installed CRM114 with "make install" # -- (This is preferred and is the default) :trainer_invoke_command: /.\/mailtrainer.crm/ # # -- use this one if you can't do a "make install" and so must run the # --- crm114_tre binary directly out of the current working directory. # :trainer_invoke_command: /.\/crm114_tre mailtrainer.crm / # ------ If we're cacheing for retraining, we're probably using # ------ mailtrainer.crm or some other variant. 
In that case, # ------ you will want a "randomizer" to present the training # ------ examples to the classifier in some random but balanced order. # ------ You have two choices - you can either use the "sort" # ------ command on some random character in the filename (this # ------ is NOT recommended) or use the "shuffle.crm" program. # ------ We _strongly_ recommend using shuffle.crm; the default # ------ options to shuffle.crm will work fine. Alternatively, # ------ you can use the "sort --key 1.2" on date-named files to # ----- achieve chronological training :trainer_randomizer_command: /.\/shuffle.crm/ #:trainer_randomizer_command: /.\/crm114 shuffle.crm/ #:trainer_randomizer_command: /sort --key 1.2/ # --------- Do we log rejected mail to a file? default yes, but most # --------- users should set this to no after testing their # --------- configuration to verify that rejected mail goes to the # --------- reject address. Only works in mailfilter.crm # :log_rejections: /yes/ #:log_rejections: /no/ # ------- alternate rejection logging - set this pathname to non-null # ------ to log rejections elsewhere. Only for mailreaver.crm. # ----- Set to NULL ( // ) to turn this off. :log_rejections_to_file: // #:log_rejections_to_file /this_is_my_rejected_email_log_file.txt/ # ----------Do we want to enable "inoculation by email"? # --------(leave this off unless you want RFC inoculations) # :inoculations_enabled: /no/ #:inoculations_enabled: /yes/ # --------- How many characters of the input do we really trust to be # ---------- worthy of classification? Usually the first few thousand # ----------- bytes of the message tell more than enough (though we've # ------------ been "noticed" by spammers, who are now packing 4K of # ------------- innocuous text into their headers. No problemo... :) ) # #:decision_length: /4096/ #:decision_length: /64000/ :decision_length: /16000/ # ----- for entropy users ONLY - 4K is plenty! 
#:decision_length: /4096/ # ------------ Do we want to expand URLs (that is, fetching the contents # ------------- of a URL and inserting that after the url itself?) # -------------- By default this is off, but turn it on if you want # --------------- to experiment. :expand_urls: /no/ # :expand_urls: /yes/ # # WGET options - 30-second timeout, output to stdout. # HACK - use the proper --user-agent="IEblahblah" for max effect! :url_fetch_cmd: /wget -T 30 -O - / # and trim the URL text to not more than 16bytes of text. :url_trim_cmd: / head -c 16000 / ####################################################################### # # ------------------- YOU REALLY SHOULD STOP HERE ------------------- # --------- values below this line are usually OK for almost all # --------- users to use unchanged - Gurus only beyond this point. # ####################################################################### # # If you want to change things here, go ahead, but realize you are # playing with things that can really hurt accuracy. # # This being open source, if you don't *think* about changing it, # what would be the use of it being open source? That said, this # _is_ open source- you break it, you get to keep _both_ pieces! # # # ------------ CLF - The Classifier Flags ---------- # # --------- Which classifier flags do we use? Default for 20060101 has # --------- been changed to OSB UNIQUE MICROGROOM. # # --------- A null setting gets you straight Markovian, without # --------- microgrooming. OSB uses less memory, is faster, # --------- and is usually more accurate. Correlative matching is # --------- 100x - 1000x slower, but can match anything (binaries, # --------- wide chars, unicode, virii, _anything_. Winnow is a # --------- non-statistical learning classificer with very nice # --------- accuracy (up to 2x SBPH). Hyperspace is a pseudogaussian # --------- KNN (K-nearest-neighbor) matcher. 
# # --------- This is also where we set whether to use microgrooming # --------- or Arne optimization (they're currently mutually exclusive). # --------- If you turn off microgrooming you get Arne optimization # --------- automatically. # # --------- If you _change_ this, you _must_ empty out your .css or # --------- .cow files and build fresh ones, because these # --------- classifiers do NOT use compatible data storage formats! # #:clf: /microgroom/ #:clf: /osb/ #:clf: /osb microgroom/ :clf: /osb unique microgroom/ #:clf: /correlate/ #:clf: /winnow/ #:clf: /osbf/ #:clf: /osbf microgroom/ #:clf: /hyperspace/ #:clf: /hyperspace unique/ # # # # --------Thresholds for GOOD/UNSURE/SPAM thick-threshold training # ------- # ------ A very small thick threshold (or zero!) works for Markovian. # ----- A thick threshold of 5 to 20 seems good for OSB, OSBF, # ---- Hyperspace, Bit-Entropy, and Winnow. If you want an asymmetric # --- threshold system, you can do that by having :good_threshold: # -- be different from :spam_threshold:. The defaults are +/- 10.0 # # # ---- Things rated equal to or better than this are GOOD email #:good_threshold: /0.01/ #:good_threshold: /5.0/ :good_threshold: /10.0/ #:good_threshold: /20.0/ # # ---- Things rated less than or equal to this are SPAM #:spam_threshold: /-0.01/ :spam_threshold: /-5.0/ #:spam_threshold: /-10.0/ #:spam_threshold: /-20.0/ # ---- mailfilter uses a single threshold and operates symmetrically. # --- (this is only to provide backward compatibility) :thick_threshold: /5.0/ # ---- What regex do we use for LEARN/CLASSIFY? the first is the # ---- "old standard". Other ones are handy for different spam # ---- mixes. The last one is for people who get a great deal of # ---- packed HTML spam, which is almost everybody in 2003, so it # ---- used to be the default. But since spammers have shifted away # ---- from this, it isn't the default any longer. 
IF you change # ---- this, you MUST rebuild your .css files with decent # ---- amounts of locally-grown spam and nonspam ( if you've been # ---- following instructions and using the "reaver" cache, this is # ---- easily done! ) # :lcr: /[[:graph:]]+/ #:lcr: /[[:alnum:]]+/ #:lcr: /[-.,:[:alnum:]]+/ #:lcr: /[[:graph:]][-[:alnum:]]*[[:graph:]]?/ #:lcr: /[[:graph:]][-.,:[:alnum:]]*[[:graph:]]?/ # # this next one is pretty incomprehensible, and probably wrong... #:lcr: /[[:print:]][/!?\#]?[-[[:alnum:]][[:punct:]]]*(?:[*'=;]|/?>|:/*)? # # # Expansions for antispamming. You almost _always_ want these on, # unless you're debugging something really bizarre. # --------- Do we enable spammus interruptus undo? :undo_interruptus: /no/ #:undo_interruptus: /yes/ # # # # ------------ HIGHLY EXPERIMENTAL - automatic training! # enable this only if you really want to live VERY dangerously! # "Do you feel lucky today, punk? Well, do ya?" # :automatic_training: /no/ # # ---- if you are living dangerously and have turned on autotraining, # you should also set the following to point to an address that # will get read on a quick basis, becuause this is where autotrain # verifications will go. # #:autotrain_address: /root/ # crm114-20100106-BlameMichelson.src/Hound_of_the_Baskervilles_first_500_lines.txt0000644000000000017500000005550311321154266025642 0ustar rootwsyThe Hound of the Baskervilles by A. Conan Doyle Chapter 1 Mr. Sherlock Holmes Mr. Sherlock Holmes, who was usually very late in the mornings, save upon those not infrequent occasions when he was up all night, was seated at the breakfast table. I stood upon the hearth-rug and picked up the stick which our visitor had left behind him the night before. It was a fine, thick piece of wood, bulbous-headed, of the sort which is known as a "Penang lawyer." Just under the head was a broad silver band nearly an inch across. "To James Mortimer, M.R.C.S., from his friends of the C.C.H.," was engraved upon it, with the date "1884." 
It was just such a stick as the old-fashioned family practitioner used to carry--dignified, solid, and reassuring. "Well, Watson, what do you make of it?" Holmes was sitting with his back to me, and I had given him no sign of my occupation. "How did you know what I was doing? I believe you have eyes in the back of your head." "I have, at least, a well-polished, silver-plated coffee-pot in front of me," said he. "But, tell me, Watson, what do you make of our visitor's stick? Since we have been so unfortunate as to miss him and have no notion of his errand, this accidental souvenir becomes of importance. Let me hear you reconstruct the man by an examination of it." "I think," said I, following as far as I could the methods of my companion, "that Dr. Mortimer is a successful, elderly medical man, well-esteemed since those who know him give him this mark of their appreciation." "Good!" said Holmes. "Excellent!" "I think also that the probability is in favour of his being a country practitioner who does a great deal of his visiting on foot." "Why so?" "Because this stick, though originally a very handsome one has been so knocked about that I can hardly imagine a town practitioner carrying it. The thick-iron ferrule is worn down, so it is evident that he has done a great amount of walking with it." "Perfectly sound!" said Holmes. "And then again, there is the 'friends of the C.C.H.' I should guess that to be the Something Hunt, the local hunt to whose members he has possibly given some surgical assistance, and which has made him a small presentation in return." "Really, Watson, you excel yourself," said Holmes, pushing back his chair and lighting a cigarette. "I am bound to say that in all the accounts which you have been so good as to give of my own small achievements you have habitually underrated your own abilities. It may be that you are not yourself luminous, but you are a conductor of light. 
Some people without possessing genius have a remarkable power of stimulating it. I confess, my dear fellow, that I am very much in your debt." He had never said as much before, and I must admit that his words gave me keen pleasure, for I had often been piqued by his indifference to my admiration and to the attempts which I had made to give publicity to his methods. I was proud, too, to think that I had so far mastered his system as to apply it in a way which earned his approval. He now took the stick from my hands and examined it for a few minutes with his naked eyes. Then with an expression of interest he laid down his cigarette, and carrying the cane to the window, he looked over it again with a convex lens. "Interesting, though elementary," said he as he returned to his favourite corner of the settee. "There are certainly one or two indications upon the stick. It gives us the basis for several deductions." "Has anything escaped me?" I asked with some self-importance. "I trust that there is nothing of consequence which I have overlooked?" "I am afraid, my dear Watson, that most of your conclusions were erroneous. When I said that you stimulated me I meant, to be frank, that in noting your fallacies I was occasionally guided towards the truth. Not that you are entirely wrong in this instance. The man is certainly a country practitioner. And he walks a good deal." "Then I was right." "To that extent." "But that was all." "No, no, my dear Watson, not all--by no means all. I would suggest, for example, that a presentation to a doctor is more likely to come from a hospital than from a hunt, and that when the initials 'C.C.' are placed before that hospital the words 'Charing Cross' very naturally suggest themselves." "You may be right." "The probability lies in that direction. And if we take this as a working hypothesis we have a fresh basis from which to start our construction of this unknown visitor." "Well, then, supposing that 'C.C.H.' 
does stand for 'Charing Cross Hospital,' what further inferences may we draw?" "Do none suggest themselves? You know my methods. Apply them!" "I can only think of the obvious conclusion that the man has practised in town before going to the country." "I think that we might venture a little farther than this. Look at it in this light. On what occasion would it be most probable that such a presentation would be made? When would his friends unite to give him a pledge of their good will? Obviously at the moment when Dr. Mortimer withdrew from the service of the hospital in order to start a practice for himself. We know there has been a presentation. We believe there has been a change from a town hospital to a country practice. Is it, then, stretching our inference too far to say that the presentation was on the occasion of the change?" "It certainly seems probable." "Now, you will observe that he could not have been on the staff of the hospital, since only a man well-established in a London practice could hold such a position, and such a one would not drift into the country. What was he, then? If he was in the hospital and yet not on the staff he could only have been a house-surgeon or a house-physician--little more than a senior student. And he left five years ago--the date is on the stick. So your grave, middle-aged family practitioner vanishes into thin air, my dear Watson, and there emerges a young fellow under thirty, amiable, unambitious, absent-minded, and the possessor of a favourite dog, which I should describe roughly as being larger than a terrier and smaller than a mastiff." I laughed incredulously as Sherlock Holmes leaned back in his settee and blew little wavering rings of smoke up to the ceiling. "As to the latter part, I have no means of checking you," said I, "but at least it is not difficult to find out a few particulars about the man's age and professional career." From my small medical shelf I took down the Medical Directory and turned up the name. 
There were several Mortimers, but only one who could be our visitor. I read his record aloud. "Mortimer, James, M.R.C.S., 1882, Grimpen, Dartmoor, Devon. House-surgeon, from 1882 to 1884, at Charing Cross Hospital. Winner of the Jackson prize for Comparative Pathology, with essay entitled 'Is Disease a Reversion?' Corresponding member of the Swedish Pathological Society. Author of 'Some Freaks of Atavism' (Lancet 1882). 'Do We Progress?' (Journal of Psychology, March, 1883). Medical Officer for the parishes of Grimpen, Thorsley, and High Barrow." "No mention of that local hunt, Watson," said Holmes with a mischievous smile, "but a country doctor, as you very astutely observed. I think that I am fairly justified in my inferences. As to the adjectives, I said, if I remember right, amiable, unambitious, and absent-minded. It is my experience that it is only an amiable man in this world who receives testimonials, only an unambitious one who abandons a London career for the country, and only an absent-minded one who leaves his stick and not his visiting-card after waiting an hour in your room." "And the dog?" "Has been in the habit of carrying this stick behind his master. Being a heavy stick the dog has held it tightly by the middle, and the marks of his teeth are very plainly visible. The dog's jaw, as shown in the space between these marks, is too broad in my opinion for a terrier and not broad enough for a mastiff. It may have been--yes, by Jove, it is a curly-haired spaniel." He had risen and paced the room as he spoke. Now he halted in the recess of the window. There was such a ring of conviction in his voice that I glanced up in surprise. "My dear fellow, how can you possibly be so sure of that?" "For the very simple reason that I see the dog himself on our very door-step, and there is the ring of its owner. Don't move, I beg you, Watson. He is a professional brother of yours, and your presence may be of assistance to me. 
Now is the dramatic moment of fate, Watson, when you hear a step upon the stair which is walking into your life, and you know not whether for good or ill. What does Dr. James Mortimer, the man of science, ask of Sherlock Holmes, the specialist in crime? Come in!" The appearance of our visitor was a surprise to me, since I had expected a typical country practitioner. He was a very tall, thin man, with a long nose like a beak, which jutted out between two keen, gray eyes, set closely together and sparkling brightly from behind a pair of gold-rimmed glasses. He was clad in a professional but rather slovenly fashion, for his frock-coat was dingy and his trousers frayed. Though young, his long back was already bowed, and he walked with a forward thrust of his head and a general air of peering benevolence. As he entered his eyes fell upon the stick in Holmes's hand, and he ran towards it with an exclamation of joy. "I am so very glad," said he. "I was not sure whether I had left it here or in the Shipping Office. I would not lose that stick for the world." "A presentation, I see," said Holmes. "Yes, sir." "From Charing Cross Hospital?" "From one or two friends there on the occasion of my marriage." "Dear, dear, that's bad!" said Holmes, shaking his head. Dr. Mortimer blinked through his glasses in mild astonishment. "Why was it bad?" "Only that you have disarranged our little deductions. Your marriage, you say?" "Yes, sir. I married, and so left the hospital, and with it all hopes of a consulting practice. It was necessary to make a home of my own." "Come, come, we are not so far wrong, after all," said Holmes. "And now, Dr. James Mortimer--" "Mister, sir, Mister--a humble M.R.C.S." "And a man of precise mind, evidently." "A dabbler in science, Mr. Holmes, a picker up of shells on the shores of the great unknown ocean. I presume that it is Mr. Sherlock Holmes whom I am addressing and not--" "No, this is my friend Dr. Watson." "Glad to meet you, sir. 
I have heard your name mentioned in connection with that of your friend. You interest me very much, Mr. Holmes. I had hardly expected so dolichocephalic a skull or such well-marked supra-orbital development. Would you have any objection to my running my finger along your parietal fissure? A cast of your skull, sir, until the original is available, would be an ornament to any anthropological museum. It is not my intention to be fulsome, but I confess that I covet your skull." Sherlock Holmes waved our strange visitor into a chair. "You are an enthusiast in your line of thought, I perceive, sir, as I am in mine," said he. "I observe from your forefinger that you make your own cigarettes. Have no hesitation in lighting one." The man drew out paper and tobacco and twirled the one up in the other with surprising dexterity. He had long, quivering fingers as agile and restless as the antennae of an insect. Holmes was silent, but his little darting glances showed me the interest which he took in our curious companion. "I presume, sir," said he at last, "that it was not merely for the purpose of examining my skull that you have done me the honour to call here last night and again today?" "No, sir, no; though I am happy to have had the opportunity of doing that as well. I came to you, Mr. Holmes, because I recognized that I am myself an unpractical man and because I am suddenly confronted with a most serious and extraordinary problem. Recognizing, as I do, that you are the second highest expert in Europe--" "Indeed, sir! May I inquire who has the honour to be the first?" asked Holmes with some asperity. "To the man of precisely scientific mind the work of Monsieur Bertillon must always appeal strongly." "Then had you not better consult him?" "I said, sir, to the precisely scientific mind. But as a practical man of affairs it is acknowledged that you stand alone. I trust, sir, that I have not inadvertently--" "Just a little," said Holmes. "I think, Dr. 
Mortimer, you would do wisely if without more ado you would kindly tell me plainly what the exact nature of the problem is in which you demand my assistance." Chapter 2 The Curse of the Baskervilles "I have in my pocket a manuscript," said Dr. James Mortimer. "I observed it as you entered the room," said Holmes. "It is an old manuscript." "Early eighteenth century, unless it is a forgery." "How can you say that, sir?" "You have presented an inch or two of it to my examination all the time that you have been talking. It would be a poor expert who could not give the date of a document within a decade or so. You may possibly have read my little monograph upon the subject. I put that at 1730." "The exact date is 1742." Dr. Mortimer drew it from his breast- pocket. "This family paper was committed to my care by Sir Charles Baskerville, whose sudden and tragic death some three months ago created so much excitement in Devonshire. I may say that I was his personal friend as well as his medical attendant. He was a strong-minded man, sir, shrewd, practical, and as unimaginative as I am myself. Yet he took this document very seriously, and his mind was prepared for just such an end as did eventually overtake him." Holmes stretched out his hand for the manuscript and flattened it upon his knee. "You will observe, Watson, the alternative use of the long s and the short. It is one of several indications which enabled me to fix the date." I looked over his shoulder at the yellow paper and the faded script. At the head was written: "Baskerville Hall," and below in large, scrawling figures: "1742." "It appears to be a statement of some sort." "Yes, it is a statement of a certain legend which runs in the Baskerville family." "But I understand that it is something more modern and practical upon which you wish to consult me?" "Most modern. A most practical, pressing matter, which must be decided within twenty-four hours. 
But the manuscript is short and is intimately connected with the affair. With your permission I will read it to you." Holmes leaned back in his chair, placed his finger-tips together, and closed his eyes, with an air of resignation. Dr. Mortimer turned the manuscript to the light and read in a high, cracking voice the following curious, old-world narrative: "Of the origin of the Hound of the Baskervilles there have been many statements, yet as I come in a direct line from Hugo Baskerville, and as I had the story from my father, who also had it from his, I have set it down with all belief that it occurred even as is here set forth. And I would have you believe, my sons, that the same Justice which punishes sin may also most graciously forgive it, and that no ban is so heavy but that by prayer and repentance it may be removed. Learn then from this story not to fear the fruits of the past, but rather to be circumspect in the future, that those foul passions whereby our family has suffered so grievously may not again be loosed to our undoing. "Know then that in the time of the Great Rebellion (the history of which by the learned Lord Clarendon I most earnestly commend to your attention) this Manor of Baskerville was held by Hugo of that name, nor can it be gainsaid that he was a most wild, profane, and godless man. This, in truth, his neighbours might have pardoned, seeing that saints have never flourished in those parts, but there was in him a certain wanton and cruel humour which made his name a by-word through the West. It chanced that this Hugo came to love (if, indeed, so dark a passion may be known under so bright a name) the daughter of a yeoman who held lands near the Baskerville estate. But the young maiden, being discreet and of good repute, would ever avoid him, for she feared his evil name. 
So it came to pass that one Michaelmas this Hugo, with five or six of his idle and wicked companions, stole down upon the farm and carried off the maiden, her father and brothers being from home, as he well knew. When they had brought her to the Hall the maiden was placed in an upper chamber, while Hugo and his friends sat down to a long carouse, as was their nightly custom. Now, the poor lass upstairs was like to have her wits turned at the singing and shouting and terrible oaths which came up to her from below, for they say that the words used by Hugo Baskerville, when he was in wine, were such as might blast the man who said them. At last in the stress of her fear she did that which might have daunted the bravest or most active man, for by the aid of the growth of ivy which covered (and still covers) the south wall she came down from under the eaves, and so homeward across the moor, there being three leagues betwixt the Hall and her father's farm. "It chanced that some little time later Hugo left his guests to carry food and drink--with other worse things, perchance--to his captive, and so found the cage empty and the bird escaped. Then, as it would seem, he became as one that hath a devil, for, rushing down the stairs into the dining-hall, he sprang upon the great table, flagons and trenchers flying before him, and he cried aloud before all the company that he would that very night render his body and soul to the Powers of Evil if he might but overtake the wench. And while the revellers stood aghast at the fury of the man, one more wicked or, it may be, more drunken than the rest, cried out that they should put the hounds upon her. Whereat Hugo ran from the house, crying to his grooms that they should saddle his mare and unkennel the pack, and giving the hounds a kerchief of the maid's, he swung them to the line, and so off full cry in the moonlight over the moor. 
"Now, for some space the revellers stood agape, unable to understand all that had been done in such haste. But anon their bemused wits awoke to the nature of the deed which was like to be done upon the moorlands. Everything was now in an uproar, some calling for their pistols, some for their horses, and some for another flask of wine. But at length some sense came back to their crazed minds, and the whole of them, thirteen in number, took horse and started in pursuit. The moon shone clear above them, and they rode swiftly abreast, taking that course which the maid must needs have taken if she were to reach her own home. "They had gone a mile or two when they passed one of the night shepherds upon the moorlands, and they cried to him to know if he had seen the hunt. And the man, as the story goes, was so crazed with fear that he could scarce speak, but at last he said that he had indeed seen the unhappy maiden, with the hounds upon her track. 'But I have seen more than that,' said he, 'for Hugo Baskerville passed me upon his black mare, and there ran mute behind him such a hound of hell as God forbid should ever be at my heels.' So the drunken squires cursed the shepherd and rode onward. But soon their skins turned cold, for there came a galloping across the moor, and the black mare, dabbled with white froth, went past with trailing bridle and empty saddle. Then the revellers rode close together, for a great fear was on them, but they still followed over the moor, though each, had he been alone, would have been right glad to have turned his horse's head. Riding slowly in this fashion they came at last upon the hounds. These, though known for their valour and their breed, were whimpering in a cluster at the head of a deep dip or goyal, as we call it, upon the moor, some slinking away and some, with starting hackles and staring eyes, gazing down the narrow valley before them. "The company had come to a halt, more sober men, as you may guess, than when they started. 
The most of them would by no means advance, but three of them, the boldest, or it may be the most drunken, rode forward down the goyal. Now, it opened into a broad space in which stood two of those great stones, still to be seen there, which were set by certain forgotten peoples in the days of old. The moon was shining bright upon the clearing, and there in the centre lay the unhappy maid where she had fallen, dead of fear and of fatigue. But it was not the sight of her body, nor yet was it that of the body of Hugo Baskerville lying near her, which raised the hair upon the heads of these three dare-devil roysterers, but it was that, standing over Hugo, and plucking at his throat, there stood a foul thing, a great, black beast, shaped like a hound, yet larger than any hound that ever mortal eye has rested upon. And even as they looked the thing tore the throat out of Hugo Baskerville, on which, as it turned its blazing eyes and dripping jaws upon them, the three shrieked with fear and rode for dear life, still screaming, across the moor. One, it is said, died that very night of what he had seen, and the other twain were but broken men for the rest of their days. "Such is the tale, my sons, of the coming of the hound which is said to have plagued the family so sorely ever since. If I have set it down it is because that which is clearly known hath less terror than that which is but hinted at and guessed. Nor can it be denied that many of the family have been unhappy in their deaths, which have been sudden, bloody, and mysterious. Yet may we shelter ourselves in the infinite goodness of Providence, which would not forever punish the innocent beyond that third or fourth generation which is threatened in Holy Writ. To that Providence, my sons, I hereby commend you, and I counsel you by way of caution to forbear from crossing the moor in those dark hours when the powers of evil are exalted. 
crm114-20100106-BlameMichelson.src/call_return_test.crm0000755000000000017500000001336311321154266021062 0ustar rootwsy#! /usr/bin/crm # # call_return_test.crm - test the call-return statements # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window # alter (:_dw:) /This is the original data window. It shouldn't change/ # output /Starting \n/ output /\n\nTesting local and forking call and returns. (level :*:_cd:) \n/ call /:foo:/ # output /Middle (level :*:_cd:)\n/ isolate (:retval:) call /:bar:/ [ a b c d e ] (:retval:) output /Got back >:*:retval:<\n/ output /End (level :*:_cd:)\n/ # output / Doing factorial with a mutating argument \n/ isolate (:z:) /5/ call /:factorial:/ [ :*:z: 1 ] output / :*:z: factorial is :*:out: (level :*:_cd:)\n\n/ # output / Doing factorial with inplace args. \n/ isolate (:z:) /5/ call /:factorial_inplace:/ [ :*:z: 1 ] output / :*:z: factorial_inplace is :*:out: (level :*:_cd:) \n\n/ # output / Doing factorial with return args \n/ isolate (:z:) /5/ isolate (:out:) // call /:factorial_returnarg:/ [ :*:z: ] (:out:) output / :*:z: factorial_returnarg is :*:out: (level :*:_cd:)\n\n/ # output / \nand now do some fully isolated forking calls \n/ isolate (:out: :status:) syscall /:my_fork:/ ( this string is your input ) (:out:) (:status:) output /Returned output = :*:out: (level :*:_cd:)\n/ output /Returned status: \n:*:status:\n/ output /And the data window is now: :*:_dw:\n/ ####################### # Must... Control... Fist... Of.... Runaway Line Count!!! #################### exit /0/ ######################################## # Start of the callable routines ######################################## ###################################### # # Just Print out something. 
# :foo: output /The foo (level :*:_cd:)\n/ return ###################################### # # Print out our incoming argument # :bar: (:zeta:) output /The bar was >:*:zeta:< (level :*:_cd:)\n/ return /z y x w v/ ######################################### # # Calculate the factorial. Note that # this routine uses a global :out: to capture the # accumulated result # # Yes, I know it's dirtylooking, but most routines don't need # to be recursive, and of those that do, most really are changing # a data structure in-place, and hence don't really need to # do this twitchy magic. # :factorial: (:arg:) { output / arglist return factorial call entry, args = :*:arg: (level :*:_cd:)\n/ isolate (:nm1: :out:) match [:arg:] (:: :n: :out:) /([0-9]+) ([0-9]+)/ { eval /:@: :*:n: > 1 :/ eval (:nm1:) /:@: :*:n: - 1 :/ eval (:out:) /:@: :*:out: * :*:n: :/ call /:factorial:/ [ :*:nm1: :*:out: ] output / Call return, :out: = :*:out: ( level :*:_cd:) \n/ return } alius { output / Recursion bottomed out, returning :out: = :*:out: \n/ return } } ############################################## # # Here's factorial again, but we're treating :arg: as a known data # structure (that we MATCH into) and then we just ALTER fields within it. # # Other than the fact that we mutilate :arg: , this is not a bad way # to write code. :factorial_inplace: (:arg:) { output / downward recurse call entry, args = :*:arg: (level :*:_cd:)\n/ match [:arg:] (:: :n: :out:) /([0-9]+) ([0-9]+)/ { eval /:@: :*:n: > 1 :/ eval (:out:) /:@::*:out: * :*:n::/ eval (:n:) /:@: :*:n: - 1:/ call /:factorial_inplace:/ [:*:arg:] output / Call return, :out: = :*:out: (level :*:_cd:)\n/ return } alius { output / Recursion bottomed out, returning :out: = :*:out: \n/ return } } ################################################## # # Here's factorial yet again, with an incoming transfer arg AND # an output transfer arg. This is also a good way to write code. 
# # Because we don't have local variables (yet- I haven't figured out # a way to reconcile the overlapped-strings principle with local # variables) note that: # # FOR A RECURSIVE ROUTINE, _ALL_ OF THE STATE MUST MOVE DOWN AND UP # IN THE ARGS. NO EXCEPTIONS - OTHERWISE LOWER CALLS WILL OVERWRITE # THE STATE AND YOUR CODE WON'T WORK AS EXPECTED. # # The call arg looks like this going down: # # 5 # 4 5 # 3 4 5 # 2 3 4 5 # 1 2 3 4 5 # # then like this coming back up. # 2 3 4 5 # 6 4 5 # 24 5 # 120 :factorial_returnarg: (:arg:) { output / Call factorial_returnarg entry, arg = :*:arg: (level :*:_cd:)\n/ isolate (:nm1: :out: ) match [:arg:] (:: :n: ) /([0-9]+)/ { eval /:@: :*:n: > 1 :/ eval (:nm1:) /:@: :*:n: - 1 :/ output / N is :*:n: NM1 is :*:nm1: \n/ call /:factorial_returnarg:/ [ :*:nm1: :*:arg:] (:out:) output / Call returned, return value was :*:out: (level :*:_cd:)\n / { # Do we have at least two things to multiply? match [:out:] (:: :p1: :p2: :rest:) \ /([0123456789]+) ([0123456789]+) (.*)/ output / p1: :*:p1:, p2: :*:p2:, rest: :*:rest: \n/ eval (:out:) /:@: :*:p1: * :*:p2:: :*:rest:/ output / multiply p1 and p2, put that on the front, and return :out: = :*:out: (level :*:_cd:)\n/ return /:*:out:/ } # If we got to here, then it was just one thing to return return /:*:out:/ } alius { output / Calling bottomed out with N-1 <= 0.00 (level :*:_cd:)\n/ return /:*:arg:/ } } ################################################### # # Here's the fork code. Note that this code executes # in an entirely separate process, and so _never_ returns. # # All data transfer must happen in pipes, except that the # original data is still there (it's a fork, after all) # :my_fork: output /In the fork... data window is :*:_dw: (level :*:_cd:)\n/ input output /Inputted; now the data window is: :*:_dw:\n/ output /Changing the data window in this fork. 
\n/ alter (:_dw:) /This is a _changed_ data window, local to this SYSCALLed fork./ output /Now the data window is: :*:_dw:\n/ exit /123/ crm114-20100106-BlameMichelson.src/crm_osbf_bayes.c0000644000000000017500000013107411321154266020124 0ustar rootwsy// crm_osbf_bayes.c - OSBF Bayes classifier // Copyright 2004 Fidelis Assis // Copyright 2004-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // This is the OSBF-Bayes classifier. It differs from SBPH-Markovian // and OSB-Bayes in the way P(F|C) is estimated. See function // crm_expr_osbf_bayes_classify, below, for details. // -- Fidelis Assis - 2004/10/20 // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // include the routine declarations file #include "crm114.h" // include OSBF structures #include "crm114_osbf.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *tempbuf; //////////////////////////////////////////////////////////////////// // // the hash coefficient table (hctable) should be full of relatively // prime numbers, and preferably superincreasing, though both of those // are not strict requirements. // static long hctable[] = { 1, 7, 3, 13, 5, 29, 11, 51, 23, 101, 47, 203, 97, 407, 197, 817, 397, 1637, 797, 3277 }; // Where does the nominative data start? 
// Bucket offset (from the header) where the nominative spectrum data
// starts in a mapped .cfc file; refreshed from each file's header before use.
static unsigned long spectra_start;

/* structure for token searching */
// Cursor state for walking a text buffer token by token.
struct token_search
{
  unsigned char *ptok;          // current search cursor / start of last token
  unsigned long toklen;         // length of the token found at ptok
  unsigned long hash;           // hash of the most recently extracted token
  unsigned char *max_ptok;      // one past the end of the searchable text
  const char *pattern;          // token regex source text; "" selects the default rule
  regex_t *regcb;               // compiled regex (used only when pattern is non-empty)
  unsigned max_long_tokens;     // max over-long tokens folded into a single hash
};

/************************************************************/
// Advance pts->ptok to the start of the next token and set pts->toklen.
// When pattern is non-empty the compiled regex finds the token; otherwise
// the next run of isgraph() characters is taken.  End of text is reported
// as a zero-length token with return value 0; a nonzero return is a
// crm_regexec error other than REG_NOMATCH.
static int
get_next_token (struct token_search *pts)
{
  unsigned char *p_end = NULL;	/* points to end of the token */
  int error = 0;		/* default: no error */
  if (pts->pattern[0] != '\0')
    {
      regmatch_t match[5];
      if (pts->ptok < pts->max_ptok)
	{
	  error = crm_regexec (pts->regcb, (char *) pts->ptok,
			       pts->max_ptok - pts->ptok, 5, match, 0, NULL);
	  if (error == REG_NOMATCH)
	    {
	      // no more tokens: report an empty match, not an error
	      match[0].rm_so = 0;
	      match[0].rm_eo = 0;
	      error = 0;
	    }
	  /* fprintf(stderr, "%s %ld %ld\n", pts->pattern, match[0].rm_so, match[0].rm_eo); */
	}
      else
	{
	  // cursor already at or past end of text: empty token
	  match[0].rm_so = 0;
	  match[0].rm_eo = 0;
	}
      if (error == 0)
	{
	  p_end = pts->ptok + match[0].rm_eo;
	  pts->ptok += match[0].rm_so;
	}
    }
  else
    {
      /* find nongraph delimited token */
      p_end = pts->ptok;
      // skip leading non-printable/space bytes
      while ((pts->ptok < pts->max_ptok) && !isgraph ((int) *pts->ptok))
	pts->ptok++;
      p_end = pts->ptok;
      // then take the run of printable bytes as the token
      while ((p_end < pts->max_ptok) && isgraph ((int) *p_end))
	p_end++;
    }
  if (error == 0)
    {
      /* update token length */
      pts->toklen = p_end - pts->ptok;
    }
  /* return error status */
  /* { unsigned long i = 0; while (error == 0 && i < pts->toklen) fputc (pts->ptok[i++], stderr); fprintf (stderr, " %lu", pts->toklen); } */
  return error;
}

/*****************************************************************/
// Produce the hash of the next token in pts->hash.  Consecutive tokens
// longer than OSBF_MAX_TOKEN_SIZE (probably base64 lines) are folded
// together by XOR, up to pts->max_long_tokens of them.  Returns 0 on
// success, nonzero when the text is exhausted.
static unsigned long
get_next_hash (struct token_search *pts)
{
  unsigned long hash_acc = 0;
  unsigned long count_long_tokens = 0;
  int error;

  /* get next token */
  error = get_next_token (pts);

  /* long tokens, probably base64 lines */
  while (error == 0 && pts->toklen > OSBF_MAX_TOKEN_SIZE
	 && count_long_tokens < pts->max_long_tokens)
    {
      count_long_tokens++;
      /* XOR new hash with previous one */
      hash_acc ^= strnhash ((char *) pts->ptok, pts->toklen);
      /* fprintf (stderr, " %0lX +\n ", 
hash_acc); */
      /* advance the pointer and get next token */
      pts->ptok += pts->toklen;
      error = get_next_token (pts);
    }

  if (error == 0)
    {
      if (pts->toklen > 0 || count_long_tokens > 0)
	{
	  // fold the final (or only) token into the accumulated hash
	  hash_acc ^= strnhash ((char *) pts->ptok, pts->toklen);
	  /* fprintf (stderr, " %0lX %lu\n", hash_acc, pts->toklen); */
	  pts->hash = hash_acc;
	}
      else
	{
	  /* no more hashes */
	  /* fprintf (stderr, "End of text %0lX %lu\n", hash_acc, pts->toklen); */
	  error = 1;
	}
    }
  return error;
}

/*****************************************************************/
// How to learn Osb_Bayes style - in this case, we'll include the single
// word terms that may not strictly be necessary.
//
// Tokenizes [txtptr+txtstart, txtptr+txtstart+txtlen), hashes OSB token
// pairs through hctable, and increments (or, with CRM_REFUTE, decrements)
// the matching buckets of the .cfc file named in the first parenthesized
// argument, creating the file if it does not exist.  Returns 0 on success
// or the fatalerror() result on a bad/missing .cfc file.
int
crm_expr_osbf_bayes_learn (CSL_CELL * csl, ARGPARSE_BLOCK * apb,
			   char *txtptr, long txtstart, long txtlen)
{
  //     learn the osb_bayes transform spectrum of this input window as
  //     belonging to a particular type.
  //     learn (classname) /word/
  //
  long i, j, k;
  long h;			// h is our counter in the hashpipe;
  char ptext[MAX_PATTERN];	// the regex pattern
  long plen;
  char htext[MAX_PATTERN];	// the hash name
  long hlen;
  long cflags, eflags;
  struct stat statbuf;		// for statting the hash file
  OSBF_FEATUREBUCKET_STRUCT *hashes;	// the text of the hash file
  OSBF_FEATURE_HEADER_STRUCT *header;	// header of the hash file
  //char *seen_features;
  unsigned int hashpipe[OSB_BAYES_WINDOW_LEN + 1];
  regex_t regcb;
  long textoffset;
  long textmaxoffset;
  long sense;			// +1 to learn, -1 to refute
  long fev;
  char *fname;
  struct token_search ts;

  /* fprintf(stderr, "Starting learning...\n"); */
  if (user_trace)
    fprintf (stderr, "OSBF Learn\n");
  if (internal_trace)
    fprintf (stderr, "executing a LEARN\n");

  // Keep the gcc compiler from complaining about unused variables
  // i = hctable[0];

  // extract the hash file name
  crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
  hlen = apb->p1len;
  hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN);

  // get the "this is a word" regex
  ptext[0] = '\0';		// start with empty regex
  crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
  plen = apb->s1len;
  plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);

  // set our cflags, if needed. The defaults are
  // "case" and "affirm", (both zero valued).
  // and "microgroom" disabled.
  cflags = REG_EXTENDED;
  eflags = 0;
  sense = +1;
  if (apb->sflags & CRM_NOCASE)
    {
      cflags = cflags | REG_ICASE;
      eflags = 1;
      if (user_trace)
	fprintf (stderr, "turning oncase-insensitive match\n");
    };
  if (apb->sflags & CRM_REFUTE)
    {
      // refutation: subtract instead of add when updating buckets
      sense = -sense;
      if (user_trace)
	fprintf (stderr, " refuting learning\n");
    };
  if (apb->sflags & CRM_MICROGROOM)
    {
      // enable microgroom
      crm_osbf_set_microgroom(1);;
      // if not set by command line, use default
      if (microgroom_chain_length == 0)
	microgroom_chain_length = OSBF_MICROGROOM_CHAIN_LENGTH;
      // if not set by command line, use default
      if (microgroom_stop_after == 0)
	microgroom_stop_after = OSBF_MICROGROOM_STOP_AFTER;
      if (user_trace)
	fprintf (stderr, " enabling microgrooming.\n");
    }
  else
    {
      // disable microgroom
      crm_osbf_set_microgroom(0);
    }

  //
  // grab the filename, and stat the file
  // note that neither "stat", "fopen", nor "open" are
  // fully 8-bit or wchar clean...
  i = 0;
  while (htext[i] < 0x021)
    i++;
  j = i;
  while (htext[j] >= 0x021)
    j++;
  // filename starts at i, ends at j. null terminate it.
  htext[j] = '\000';

  // and stat it to get it's length
  k = stat (&htext[i], &statbuf);

  // quick check- does the file even exist?
  if (k != 0)
    {
      // no - create a fresh .cfc with the requested (or default) size
      if (crm_osbf_create_cssfile
	  (&htext[i],
	   (sparse_spectrum_file_length != 0) ?
	   sparse_spectrum_file_length :
	   OSBF_DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH, OSBF_VERSION, 0,
	   OSBF_CSS_SPECTRA_START) != EXIT_SUCCESS)
	{
	  fprintf (stderr, "\n Couldn't create file %s; errno=%d .\n",
		   &htext[i], errno);
	  exit (EXIT_FAILURE);
	}
      // and reset the statbuf to be correct
      k = stat (&htext[i], &statbuf);
    };
  //
  // open the hash file into memory so we can bitwhack it
  //
  fname = strdup (&htext[i]);
  header = crm_mmap_file (fname, 0, statbuf.st_size,
			  PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
  if (header == MAP_FAILED)
    {
      fev = fatalerror ("Couldn't memory-map the .cfc file named: ",
			&htext[i]);
      return (fev);
    };
  //
  if (user_trace)
    fprintf (stderr, "Sparse spectra file %s has length %ld bins\n",
	     &htext[i], header->buckets);
  // bucket array starts buckets_start buckets past the header
  hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;

  // check the version of the file
  //
  if (*((unsigned long *) header->version) != OSBF_VERSION
      || header->flags != 0)
    {
      fprintf (stderr, "Version was: %ld, flags was %ld\n",
	       *((unsigned long *) header->version), header->flags);
      fev = fatalerror ("The .cfc file is the wrong type! We're expecting "
			"a OSBF_Bayes-spectrum file. The filename is: ",
			&htext[i]);
      return (fev);
    };
  //
  //
  spectra_start = header->buckets_start;

  // compile the word regex
  //
  if (internal_trace)
    fprintf (stderr, "\nWordmatch pattern is %s", ptext);

  // compile regex if not empty - empty regex means "plain regex"
  if (ptext[0] != '\0')
    {
      i = crm_regcomp (&regcb, ptext, plen, cflags);
      if (i > 0)
	{
	  crm_regerror (i, &regcb, tempbuf, data_window_size);
	  nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
	  goto regcomp_failed;
	};
    }

  // Start by priming the pipe... we will shift to the left next.
  // sliding, hashing, xoring, moduloing, and incrmenting the
  // hashes till there are no more.
  k = 0;
  j = 0;
  i = 0;

  textoffset = txtstart;
  textmaxoffset = txtstart + txtlen;

  // init the hashpipe with 0xDEADBEEF
  for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++)
    {
      hashpipe[h] = 0xDEADBEEF;
    };

  // and the big loop...
  i = 0;

  // initialize the token search structure
  ts.ptok = (unsigned char *) &(txtptr[textoffset]);
  ts.max_ptok = (unsigned char *) &(txtptr[textmaxoffset]);
  ts.toklen = 0;
  ts.pattern = ptext;
  ts.regcb = &regcb;
  ts.max_long_tokens = OSBF_MAX_LONG_TOKENS;

  while (get_next_hash (&ts) == 0)
    {
      if (internal_trace)
	{
	  memmove (tempbuf, ts.ptok, ts.toklen);
	  tempbuf[ts.toklen] = '\000';
	  fprintf (stderr,
		   " Learn #%ld t.o. %ld strt %ld end %ld len %lu is -%s-\n",
		   i, textoffset,
		   ts.ptok - (unsigned char *) &(txtptr[textoffset]),
		   (ts.ptok + ts.toklen) - (unsigned char *)
		   &(txtptr[textoffset]), ts.toklen, tempbuf);
	};

      // Shift the hash pipe down one
      for (h = OSB_BAYES_WINDOW_LEN - 1; h > 0; h--)
	{
	  hashpipe[h] = hashpipe[h - 1];
	};
      // and put new hash into pipeline
      hashpipe[0] = ts.hash;

      if (internal_trace)
	{
	  fprintf (stderr, " Hashpipe contents: ");
	  for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++)
	    fprintf (stderr, " %u", hashpipe[h]);
	  fprintf (stderr, "\n");
	};

      /* prepare for next token */
      ts.ptok += ts.toklen;
      textoffset += ts.ptok - (unsigned char *) &(txtptr[textoffset]);
      i++;

      {
	unsigned long hindex, bindex;
	unsigned long h1, h2;
	long th = 0;		// a counter used for TSS tokenizing
	long j;
	//
	// old Hash polynomial: h0 + 3h1 + 5h2 +11h3 +23h4
	// (coefficients chosen by requiring superincreasing,
	// as well as prime)
	//
	th = 0;
	//
	// pair the newest hash (hashpipe[0]) with each older pipe slot;
	// h1 indexes the bucket, h2 is the crosscut check value
	for (j = 1; j < OSB_BAYES_WINDOW_LEN; j++)
	  {
	    h1 = hashpipe[0] * hctable[0] + hashpipe[j] * hctable[j << 1];
	    h2 = hashpipe[0] * hctable[1] + hashpipe[j] * hctable[(j << 1) - 1];
	    hindex = h1 % header->buckets;
	    if (internal_trace)
	      fprintf (stderr, "Polynomial %ld has h1:%ld h2: %ld\n",
		       j, h1, h2);
	    //
	    // we now look at both the primary (h1) and
	    // crosscut (h2) indexes to see if we've got
	    // the right bucket or if we need to look further
	    //
	    bindex = crm_osbf_find_bucket (header, h1, h2);
	    if (VALID_BUCKET (header, bindex))
	      {
		if (!EMPTY_BUCKET (hashes[bindex]))
		  {
		    // existing feature: bump its count unless locked
		    if (!BUCKET_IS_LOCKED (hashes[bindex]))
		      {
			crm_osbf_update_bucket (header, bindex, sense);
			if (internal_trace)
			  fprintf (stderr, "Updated feature at %ld\n",
				   hindex);
		      }
		  }
		else if (sense > 0)
		  {
		    // new feature (only inserted when learning, not refuting)
		    crm_osbf_insert_bucket (header, bindex, h1, h2, sense);
		    if (internal_trace)
		      fprintf (stderr, "New feature at %ld\n", hindex);
		  }
	      }
	    else
	      {
		nonfatalerror ("Your program is stuffing too many "
			       "features into this size .cfc file. "
			       "Adding any more features is "
			       "impossible in this file.",
			       "You are advised to build a larger "
			       ".cfc file and merge your data into "
			       "it.");
		goto learn_end_regex_loop;
	      }
	  }
      }
    }				// end the while k==0

learn_end_regex_loop:
  // unlock features locked during learning
  for (i = 0; i < header->buckets; i++)
    UNLOCK_BUCKET (hashes[i]);

  // update the number of learnings
  if (sense > 0)
    {
      header->learnings += sense;
      // rescale: halve everything when the learnings counter nears the
      // maximum representable bucket value
      if (header->learnings >= (OSBF_FEATUREBUCKET_VALUE_MAX - 1))
	{
	  header->learnings >>= 1;
	  for (i = 0; i < header->buckets; i++)
	    BUCKET_RAW_VALUE (hashes[i]) = BUCKET_RAW_VALUE (hashes[i]) >> 1;
	  nonfatalerror ("You have managed to LEARN so many documents that"
			 " you have forced rescaling of the entire database.",
			 " If you are the first person to do this, Fidelis "
			 " owes you a bottle of good singlemalt scotch");
	}
    }
  else if (header->learnings >= (unsigned long) (-sense))
    {
      // refute: only decrement if it cannot underflow the unsigned counter
      header->learnings += sense;
    }

regcomp_failed:

  // and remember to let go of the mmaps and the pattern bufffer
  // (because we may have written it, force a cache flush)
  // crm_munmap_all ();
  crm_munmap_file ((void *) header);

#ifndef CRM_WINDOWS
  // Because mmap/munmap doesn't set atime, nor set the "modified"
  // flag, some network filesystems will fail to mark the file as
  // modified and so their cacheing will make a mistake.
  //
  // The fix is to do a trivial read/write on the .cfc ile, to force
  // the filesystem to repropagate it's caches.
  //
  {
    int hfd;			// hashfile fd
    OSBF_FEATURE_HEADER_STRUCT foo;
    hfd = open (fname, O_RDWR);
    dontcare = read (hfd, &foo, sizeof (foo));
    lseek (hfd, 0, SEEK_SET);
    dontcare = write (hfd, &foo, sizeof (foo));
    close (hfd);
  }
#endif	// !CRM_WINDOWS

  if (ptext[0] != '\0')
    crm_regfree (&regcb);

  return (0);
}

// How to do a Osb_Bayes CLASSIFY some text.
//
int
crm_expr_osbf_bayes_classify (CSL_CELL * csl, ARGPARSE_BLOCK * apb,
			      char *txtptr, long txtstart, long txtlen)
{
  // classify the sparse spectrum of this input window
  // as belonging to a particular type.
  //
  // This code should look very familiar- it's cribbed from
  // the code for LEARN
  //
  long i, j, k;
  long h;			// we use h for our hashpipe counter, as needed.
  char ptext[MAX_PATTERN];	// the regex pattern
  long plen;
  char ostext[MAX_PATTERN];	// optional pR offset
  long oslen;
  double pR_offset;
  // the hash file names
  char htext[MAX_PATTERN + MAX_CLASSIFIERS * MAX_FILE_NAME_LEN];
  long htext_maxlen = MAX_PATTERN + MAX_CLASSIFIERS * MAX_FILE_NAME_LEN;
  long hlen;
  // the match statistics variable
  char stext[MAX_PATTERN + MAX_CLASSIFIERS * (MAX_FILE_NAME_LEN + 100)];
  long stext_maxlen = MAX_PATTERN + MAX_CLASSIFIERS * (MAX_FILE_NAME_LEN + 100);
  long slen;
  char svrbl[MAX_PATTERN];	// the match statistics text buffer
  long svlen;
  long fnameoffset;
  char fname[MAX_FILE_NAME_LEN];
  long eflags;
  long cflags;
  // long vhtindex;
  long not_microgroom = 1;
  struct stat statbuf;		// for statting the hash file
  unsigned int hashpipe[OSB_BAYES_WINDOW_LEN + 1];
  regex_t regcb;
  double hits[MAX_CLASSIFIERS];	// actual hits per feature per classifier
  unsigned long totalhits[MAX_CLASSIFIERS];	// actual total hits per classifier
  unsigned long learnings[MAX_CLASSIFIERS];	// total learnings per classifier
  unsigned long total_learnings = 0;
  unsigned long totalfeatures;	// total features
  unsigned long uniquefeatures[MAX_CLASSIFIERS];	// found features per class
  unsigned long missedfeatures[MAX_CLASSIFIERS];	// missed features per class
  double htf;			// hits this 
feature got. double tprob; // total probability in the "success" domain. double min_success = 0.5; // minimum probability to be considered success // double textlen; // text length - rougly corresponds to // information content of the text to classify double ptc[MAX_CLASSIFIERS]; // current running probability of this class double renorm = 0.0; OSBF_FEATURE_HEADER_STRUCT *header[MAX_CLASSIFIERS]; OSBF_FEATUREBUCKET_STRUCT *hashes[MAX_CLASSIFIERS]; char *seen_features[MAX_CLASSIFIERS]; long hashlens[MAX_CLASSIFIERS]; char *hashname[MAX_CLASSIFIERS]; long succhash; long vbar_seen; // did we see '|' in classify's args? long maxhash; long fnstart, fnlen; long fn_start_here; long textoffset; long textmaxoffset; long bestseen; long thistotal; struct token_search ts; // cubic weights seem to work well with this new code... - Fidelis //float feature_weight[] = { 0, 125, 64, 27, 8, 1, 0 }; // cubic // these empirical weights give better accuracy with // the CF * unique/totalfeatures used in this code - Fidelis float feature_weight[] = { 0, 3125, 256, 27, 4, 1 }; float confidence_factor; int asymmetric = 0; /* for testings */ int voodoo = 1; /* default */ //double top10scores[10]; //long top10polys[10]; //char top10texts[10][MAX_PATTERN]; /* fprintf(stderr, "Starting classification...\n"); */ if (user_trace) fprintf (stderr, "OSBF classify\n"); if (internal_trace) fprintf (stderr, "executing a CLASSIFY\n"); // extract the hash file names crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len); hlen = apb->p1len; hlen = crm_nexpandvar (htext, hlen, htext_maxlen); // extract the "this is a word" regex // ptext[0] = '\0'; // assume empty regex crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len); plen = apb->s1len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); // extract the optional pR offset value // crm_get_pgm_arg (ostext, MAX_PATTERN, apb->s2start, apb->s2len); oslen = apb->s2len; pR_offset = 0; min_success = 0.5; if (oslen > 0) { oslen = 
crm_nexpandvar (ostext, oslen, MAX_PATTERN); pR_offset = strtod (ostext, NULL); min_success = 1.0 - 1.0 / (1 + pow (10, pR_offset)); } // extract the optional "match statistics" variable // crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len); svlen = apb->p2len; svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN); { long vstart, vlen; crm_nextword (svrbl, svlen, 0, &vstart, &vlen); memmove (svrbl, &svrbl[vstart], vlen); svlen = vlen; svrbl[vlen] = '\000'; }; // status variable's text (used for output stats) // stext[0] = '\000'; slen = 0; // set our flags, if needed. The defaults are // "case" cflags = REG_EXTENDED; eflags = 0; if (apb->sflags & CRM_NOCASE) { cflags += REG_ICASE; eflags = 1; }; not_microgroom = 1; if (apb->sflags & CRM_MICROGROOM) { not_microgroom = 0; if (user_trace) fprintf (stderr, " disabling fast-skip optimization.\n"); }; // compile the word regex if not empty if (ptext[0] != '\0') { if (internal_trace) fprintf (stderr, "\nWordmatch pattern is |%s|", ptext); i = crm_regcomp (®cb, ptext, plen, cflags); if (i > 0) { crm_regerror (i, ®cb, tempbuf, data_window_size); nonfatalerror ("Regular Expression Compilation Problem:", tempbuf); goto regcomp_failed; }; } // Now, the loop to open the files. bestseen = 0; thistotal = 0; //for (i = 0; i < 10; i++) // { // top10scores[i] = 0; // top10polys[i] = 0; // strcpy (top10texts[i], ""); // }; // -- probabilistic evaluator --- // S = success; A = a testable attribute of success // ns = not success, na = not attribute // the chain rule we use is: // // P(A|S) P(S) // P (S|A) = ------------------------- // P(A|S) P(S) + P(A|NS) P(NS) // // and we apply it repeatedly to evaluate the final prob. For // the initial a-priori probability, we use 0.5. The output // value (here, P(S|A) ) becomes the new a-priori for the next // iteration. 
// // Extension - we generalize the above to I classes as and feature // F as follows: // // P(F|Ci) P(Ci) // P(Ci|F) = ---------------------------------------- // Sum over all classes Ci of P(F|Ci) P(Ci) // // We also correct for the unequal corpus sizes by multiplying // the probabilities by a renormalization factor. if Tg is the // total number of good features, and Te is the total number of // evil features, and Rg and Re are the raw relative scores, // then the corrected relative scores Cg aqnd Ce are // // Cg = (Rg / Tg) // Ce = (Re / Te) // // or Ci = (Ri / Ti) // // Cg and Ce can now be used as "corrected" relative counts // to calculate the naive Bayesian probabilities. // // Lastly, the issue of "over-certainty" rears it's ugly head. // This is what happens when there's a zero raw count of either // good or evil features at a particular place in the file; the // strict but naive mathematical interpretation of this is that // "feature A never/always occurs when in good/evil, hence this // is conclusive evidence of good/evil and the probabilities go // to 1.0 or 0.0, and are stuck there forevermore. We use the // somewhat ad-hoc interpretation that it is unreasonable to // assume that any finite number of samples can appropriately // represent an infinite continuum of spewage, so we can bound // the certainty of any meausre to be in the range: // // limit: [ 1/featurecount+2 , 1 - 1/featurecount+2]. // // The prior bound is strictly made-up-on-the-spot and has NO // strong theoretical basis. It does have the nice behavior // that for feature counts of 0 the probability is clipped to // [0.5, 0.5], for feature counts of 1 to [0.333, 0.666] // for feature counts of 2 to [0.25, 0.75], for 3 to // [0.2, 0.8], for 4 to [0.166, 0.833] and so on. 
// vbar_seen = 0; maxhash = 0; succhash = 0; fnameoffset = 0; // now, get the file names and mmap each file // get the file name (grody and non-8-bit-safe, but doesn't matter // because the result is used for open() and nothing else. // GROT GROT GROT this isn't NULL-clean on filenames. But then // again, stdio.h itself isn't NULL-clean on filenames. if (user_trace) fprintf (stderr, "Classify list: -%s- \n", htext); fn_start_here = 0; fnlen = 1; while (fnlen > 0 && ((maxhash < MAX_CLASSIFIERS - 1))) { crm_nextword (htext, hlen, fn_start_here, &fnstart, &fnlen); if (fnlen > 0) { strncpy (fname, &htext[fnstart], fnlen); fn_start_here = fnstart + fnlen + 1; fname[fnlen] = '\000'; if (user_trace) fprintf (stderr, "Classifying with file -%s- " "succhash=%ld, maxhash=%ld\n", fname, succhash, maxhash); if (fname[0] == '|' && fname[1] == '\000') { if (vbar_seen) { nonfatalerror ("Only one ' | ' allowed in a CLASSIFY. \n", "We'll ignore it for now."); } else { succhash = maxhash; }; vbar_seen++; } else { // be sure the file exists // stat the file to get it's length k = stat (fname, &statbuf); // quick check- does the file even exist? if (k != 0) { nonfatalerror ("Nonexistent Classify table named: ", fname); } else { // file exists - do the open/process/close // hashlens[maxhash] = statbuf.st_size; // mmap the hash file into memory so we can bitwhack it header[maxhash] = (OSBF_FEATURE_HEADER_STRUCT *) crm_mmap_file ( fname, 0, hashlens[maxhash], PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if (header[maxhash] == MAP_FAILED) { nonfatalerror ("Couldn't memory-map the table file", fname); } else { // // Check to see if this file is the right version // long fev; if (* ((unsigned long *) header[maxhash]->version) != OSBF_VERSION || header[maxhash]->flags != 0) { fev = fatalerror ("The .cfc file is the wrong version! Filename is: ", fname); return (fev); }; // grab the start of the actual spectrum data. 
// hashes[maxhash] = (OSBF_FEATUREBUCKET_STRUCT *) header[maxhash] + header[maxhash]->buckets_start; spectra_start = header[maxhash]->buckets_start; learnings[maxhash] = header[maxhash]->learnings; // // increment learnings to avoid division by 0 if (learnings[maxhash] == 0) learnings[maxhash]++; // update total learnings total_learnings += learnings[maxhash]; // set this hashlens to the length in features instead // of the length in bytes. hashlens[maxhash] = header[maxhash]->buckets; hashname[maxhash] = (char *) malloc (fnlen + 10); if (!hashname[maxhash]) untrappableerror ("Couldn't malloc hashname[maxhash]\n", "We need that part later, so we're stuck. Sorry."); strncpy (hashname[maxhash], fname, fnlen); hashname[maxhash][fnlen] = '\000'; maxhash++; }; }; }; if (maxhash > MAX_CLASSIFIERS - 1) nonfatalerror ("Too many classifier files.", "Some may have been disregarded"); }; }; for (i = 0; i < maxhash; i++) { seen_features[i] = malloc (header[i]->buckets); if (!seen_features[i]) untrappableerror ("Couldn't malloc seen features array\n", "We need that part later, so we're stuck. Sorry."); memset (seen_features[i], 0, header[i]->buckets); // initialize our arrays for N .cfc files hits[i] = 0.0; // absolute hit counts totalhits[i] = 0; // absolute hit counts uniquefeatures[i] = 0; // features counted per class missedfeatures[i] = 0; // missed features per class // a priori probability ptc[i] = (double) learnings[i] / total_learnings; // ptc[i] = 0.5; } // // If there is no '|', then all files are "success" files. if (succhash == 0) succhash = maxhash; // a CLASSIFY with no arguments is always a "success". if (maxhash == 0) return (0); if (user_trace) fprintf (stderr, "Running with %ld files for success out of %ld files\n", succhash, maxhash); // sanity checks... Uncomment for super-strict CLASSIFY. // // do we have at least 1 valid .cfc files? 
if (maxhash == 0) { fatalerror ("Couldn't open at least 2 .cfc files for classify().", ""); }; // do we have at least 1 valid .cfc file at both sides of '|'? //if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2)) // { // nonfatalerror ( // "Couldn't open at least 1 .cfc file per SUCC | FAIL classes " // " for classify().\n","Hope you know what are you doing."); // }; // // now all of the files are mmapped into memory, // and we can do the polynomials and add up points. i = 0; j = 0; k = 0; thistotal = 0; textoffset = txtstart; textmaxoffset = txtstart + txtlen; // init the hashpipe with 0xDEADBEEF for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++) { hashpipe[h] = 0xDEADBEEF; }; totalfeatures = 0; // stop when we no longer get any regex matches // possible edge effect here- last character must be matchable, yet // it's also the "end of buffer". // initialize the token search structure ts.ptok = (unsigned char *) &(txtptr[textoffset]); ts.max_ptok = (unsigned char *) &(txtptr[textmaxoffset]); ts.toklen = 0; ts.pattern = ptext; ts.regcb = ®cb; ts.max_long_tokens = OSBF_MAX_LONG_TOKENS; while (get_next_hash (&ts) == 0) { if (internal_trace) { memmove (tempbuf, ts.ptok, ts.toklen); tempbuf[ts.toklen] = '\000'; fprintf (stderr, " Classify #%ld t.o. 
%ld strt %ld end %ld len %lu is -%s-\n", i, textoffset, ts.ptok - (unsigned char *) &(txtptr[textoffset]), (ts.ptok + ts.toklen) - (unsigned char *) &(txtptr[textoffset]), ts.toklen, tempbuf); }; // slide previous hashes up 1 for (h = OSB_BAYES_WINDOW_LEN - 1; h > 0; h--) { hashpipe[h] = hashpipe[h - 1]; }; // and put new hash into pipeline hashpipe[0] = ts.hash; if (0) { fprintf (stderr, " Hashpipe contents: "); for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++) fprintf (stderr, " %u", hashpipe[h]); fprintf (stderr, "\n"); }; /* prepare for next token */ ts.ptok += ts.toklen; textoffset += ts.ptok - (unsigned char *) &(txtptr[textoffset]); i++; { int j, k; unsigned th = 0; // a counter used only in TSS hashing unsigned long hindex; unsigned long h1, h2; // remember indexes of classes with min and max local probabilities int i_min_p, i_max_p; // remember min and max local probabilities of a feature double min_local_p, max_local_p; int already_seen; // th = 0; // for (j = 1; j < OSB_BAYES_WINDOW_LEN; j++) { h1 = hashpipe[0] * hctable[0] + hashpipe[j] * hctable[j << 1]; h2 = hashpipe[0] * hctable[1] + hashpipe[j] * hctable[(j << 1) - 1]; hindex = h1; if (internal_trace) fprintf (stderr, "Polynomial %d has h1:%ld h2: %ld\n", j, h1, h2); // // Note - a strict interpretation of Bayesian // chain probabilities should use 0 as the initial // state. However, because we rapidly run out of // significant digits, we use a much less strong // initial state. Note also that any nonzero // positive value prevents divide-by-zero // // Zero out "Hits This Feature" htf = 0; totalfeatures++; // // calculate the precursors to the local probabilities; // these are the hits[k] array, and the htf total. 
// min_local_p = 1.0; max_local_p = 0; i_min_p = i_max_p = 0; already_seen = 0; for (k = 0; k < maxhash; k++) { long lh, lh0; float p_feat = 0; lh = hindex % (hashlens[k]); lh0 = lh; hits[k] = 0; lh = crm_osbf_find_bucket (header[k], h1, h2); // if the feature isn't found in the class, the index lh // will point to the first empty bucket after the chain // and its value will be 0. // // the bucket is valid if its index is valid. if the // index "lh" is >= the number of buckets, it means that // the .cfc file is full and the bucket wasn't found if (VALID_BUCKET (header[k], lh) && seen_features[k][lh] == 0) { // only not previously seen features are considered if (GET_BUCKET_VALUE (hashes[k][lh]) != 0) { uniquefeatures[k] += 1; // count unique features used hits[k] = GET_BUCKET_VALUE (hashes[k][lh]); totalhits[k] += hits[k]; // remember totalhits htf += hits[k]; // and hits-this-feature p_feat = hits[k] / learnings[k]; // find class with minimum P(F) if (p_feat <= min_local_p) { i_min_p = k; min_local_p = p_feat; } // find class with maximum P(F) if (p_feat >= max_local_p) { i_max_p = k; max_local_p = p_feat; } // mark the feature as seen seen_features[k][lh] = 1; } else { // a feature that wasn't found can't be marked as // already seen in the doc because the index lh // doesn't refer to it, but to the first empty bucket // after the chain, which is common to all not-found // features in the same chain. This is not a problem // though, because if the feature is found in another // class, it'll be marked as seen on that class, // which is enough to mark it as seen. If it's not // found in any class, it will have zero count on all // classes and will be ignored as well. So, only // found features are marked as seen. i_min_p = k; min_local_p = p_feat = 0; // for statistics only (for now...) 
missedfeatures[k] += 1; } } else { // ignore already seen features if (VALID_BUCKET (header[k], lh)) { min_local_p = max_local_p = 0; already_seen = 1; if (asymmetric != 0) break; } else { /* bucket not valid. treat like feature not found */ i_min_p = k; min_local_p = p_feat = 0; // for statistics only (for now...) missedfeatures[k] += 1; } } } //======================================================= // Update the probabilities using Bayes: // // P(F|S) P(S) // P(S|F) = ------------------------------- // P(F|S) P(S) + P(F|NS) P(NS) // // S = class spam; NS = class nonspam; F = feature // // Here we adopt a different method for estimating // P(F|S). Instead of estimating P(F|S) as (hits[S][F] / // (hits[S][F] + hits[NS][F])), like in the original // code, we use (hits[S][F] / learnings[S]) which is the // ratio between the number of messages of the class S // where the feature F was observed during learnings and // the total number of learnings of that class. Both // values are kept in the respective .cfc file, the // number of learnings in the header and the number of // occurrences of the feature F as the value of its // feature bucket. // // It's worth noting another important difference here: // as we want to estimate the *number of messages* of a // given class where a certain feature F occurs, we // count only the first ocurrence of each feature in a // message (repetitions are ignored), both when learning // and when classifying. // // Advantages of this method, compared to the original: // // - First of all, and the most important: accuracy is // really much better, at about the same speed! 
With // this higher accuracy, it's also possible to increase // the speed, at the cost of a low decrease in accuracy, // using smaller .cfc files; // // - It is not affected by different sized classes // because the numerator and the denominator belong to // the same class; // // - It allows a simple and fast pruning method that // seems to introduce little noise: just zero features // with lower count in a overflowed chain, zeroing first // those in their right places, to increase the chances // of deleting older ones. // // Disadvantages: // // - It breaks compatibility with previous css file // format because of different header structure and // meaning of the counts. // // Confidence factors // // The motivation for confidence factors is to reduce // the noise introduced by features with small counts // and/or low significance. This is an attempt to mimic // what we do when inspecting a message to tell if it is // spam or not. We intuitively consider only a few // tokens, those which carry strong indications, // according to what we've learned and remember, and // discard the ones that may occur (approximately) // equally in both classes. // // Once P(Feature|Class) is estimated as above, the // calculated value is adjusted using the following // formula: // // CP(Feature|Class) = 0.5 + // CF(Feature) * (P(Feature|Class) - 0.5) // // Where CF(Feature) is the confidence factor and // CP(Feature|Class) is the adjusted estimate for the // probability. 
// // CF(Feature) is calculated taking into account the // weight, the max and the min frequency of the feature // over the classes, using the empirical formula: // // (((Hmax - Hmin)^2 + Hmax*Hmin - K1/SH) / SH^2) ^ K2 // CF(Feature) = ------------------------------------------ // 1 + K3 / (SH * Weight) // // Hmax - Number of documents with the feature "F" on // the class with max local probability; // Hmin - Number of documents with the feature "F" on // the class with min local probability; // SH - Sum of Hmax and Hmin // K1, K2, K3 - Empirical constants // // OBS: - Hmax and Hmin are normalized to the max number // of learnings of the 2 classes involved. // - Besides modulating the estimated P(Feature|Class), // reducing the noise, 0 <= CF < 1 is also used to // restrict the probability range, avoiding the // certainty falsely implied by a 0 count for a given // class. // // -- Fidelis Assis //========================================================= // ignore less significant features (confidence factor = 0) if (already_seen != 0 || (max_local_p - min_local_p) < 1.0E-6) continue; // testing speed-up... if (min_local_p > 0 && (max_local_p / min_local_p) < min_pmax_pmin_ratio) continue; // code under testing.... // calculate confidence_factor { // hmmm, unsigned long gives better precision than float... 
//float hits_max_p, hits_min_p, sum_hits, diff_hits; //unsigned long hits_max_p, hits_min_p, sum_hits, diff_hits; unsigned long hits_max_p, hits_min_p, sum_hits; long diff_hits; float K1, K2, K3; hits_min_p = hits[i_min_p]; hits_max_p = hits[i_max_p]; // normalize hits to max learnings if (learnings[i_min_p] < learnings[i_max_p]) hits_min_p *= (float) learnings[i_max_p] / (float) learnings[i_min_p]; else hits_max_p *= (float) learnings[i_min_p] / (float) learnings[i_max_p]; sum_hits = hits_max_p + hits_min_p; diff_hits = hits_max_p - hits_min_p; if (diff_hits < 0) diff_hits = -diff_hits; // constants used in the CF formula above // K1 = 0.25; K2 = 10; K3 = 8; K1 = 0.25; K2 = 10; K3 = 8; // calculate confidence factor (CF) if (voodoo == 0) /* || min_local_p > 0) */ confidence_factor = 1 - DBL_MIN; else confidence_factor = pow ((diff_hits * diff_hits + hits_max_p * hits_min_p - K1 / sum_hits) / (sum_hits * sum_hits), K2) / (1.0 + K3 / (sum_hits * feature_weight[j])); if (internal_trace) printf ("CF: %.4f, max_hits = %3ld, min_hits = %3ld, " "weight: %5.1f\n", confidence_factor, hits_max_p, hits_min_p, feature_weight[j]); } // calculate the numerators P(F|C) * P(C) renorm = 0.0; for (k = 0; k < maxhash; k++) { // P(F|C) = hits[k]/learnings[k], adjusted with a // confidence factor, to reduce the influence // of features common to all classes ptc[k] = ptc[k] * (0.5 + confidence_factor * (hits[k] / learnings[k] - 0.5)); // if we have underflow (any probability == 0.0 ) then // bump the probability back up to 10^-308, or // whatever a small multiple of the minimum double // precision value is on the current platform. 
if (ptc[k] < 10 * DBL_MIN) ptc[k] = 10 * DBL_MIN; renorm += ptc[k]; if (internal_trace) printf ("CF: %.4f, totalhits[k]: %lu, missedfeatures[k]: %lu, " "uniquefeatures[k]: %lu, totalfeatures: %lu, " "weight: %5.1f\n", confidence_factor, totalhits[k], missedfeatures[k], uniquefeatures[k], totalfeatures, feature_weight[j]); } // renormalize probabilities for (k = 0; k < maxhash; k++) ptc[k] = ptc[k] / renorm; if (internal_trace) { for (k = 0; k < maxhash; k++) { fprintf (stderr, " poly: %d filenum: %d, HTF: %7.0f, " "learnings: %7lu, hits: %7.0f, " "Pc: %6.4e\n", j, k, htf, header[k]->learnings, hits[k], ptc[k]); }; }; // // avoid the fencepost error for window=1 if (OSB_BAYES_WINDOW_LEN == 1) { j = 99999; }; }; }; }; // end of repeat-the-regex loop // cleanup time! // remember to let go of the fd's and mmaps for (k = 0; k < maxhash; k++) { // let go of the file, but allow caches to be retained if (header[k]) crm_munmap_file ((void *) header[k]); free (seen_features[k]); }; // and let go of the regex buffery if (ptext[0] != '\0') crm_regfree (®cb); // and one last chance to force probabilities into the non-stuck zone // // if (pic == 0.0 ) pic = DBL_MIN; //if (pnic == 0.0 ) pnic = DBL_MIN; /* for (k = 0; k < maxhash; k++) if (ptc[k] < 10 * DBL_MIN) ptc[k] = 10 * DBL_MIN; */ if (user_trace) { for (k = 0; k < maxhash; k++) fprintf (stderr, "Probability of match for file %ld: %f\n", k, ptc[k]); }; // tprob = 0.0; for (k = 0; k < succhash; k++) tprob = tprob + ptc[k]; if (svlen > 0) { char buf[1024]; double accumulator; double remainder; double overall_pR; long m; buf[0] = '\000'; accumulator = 10 * DBL_MIN; for (m = 0; m < succhash; m++) { accumulator = accumulator + ptc[m]; }; remainder = 10 * DBL_MIN; for (m = succhash; m < maxhash; m++) { remainder = remainder + ptc[m]; }; overall_pR = log10 (accumulator) - log10 (remainder); // note also that strcat _accumulates_ in stext. 
// There would be a possible buffer overflow except that _we_ control // what gets written here. So it's no biggie. if (tprob > min_success) { // if a pR offset was given, print it together with the real pR if (oslen > 0) { sprintf (buf, "CLASSIFY succeeds; success probability: " "%6.4f pR: %6.4f/%6.4f\n", tprob, overall_pR, pR_offset); } else { sprintf (buf, "CLASSIFY succeeds; success probability: " "%6.4f pR: %6.4f\n", tprob, overall_pR); } } else { // if a pR offset was given, print it together with the real pR if (oslen > 0) { sprintf (buf, "CLASSIFY fails; success probability: " "%6.4f pR: %6.4f/%6.4f\n", tprob, overall_pR, pR_offset); } else { sprintf (buf, "CLASSIFY fails; success probability: " "%6.4f pR: %6.4f\n", tprob, overall_pR); } }; if (strlen (stext) + strlen (buf) <= stext_maxlen) strcat (stext, buf); bestseen = 0; for (k = 0; k < maxhash; k++) if (ptc[k] > ptc[bestseen]) bestseen = k; remainder = 10 * DBL_MIN; for (m = 0; m < maxhash; m++) if (bestseen != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "Best match to file #%ld (%s) " "prob: %6.4f pR: %6.4f \n", bestseen, hashname[bestseen], ptc[bestseen], (log10 (ptc[bestseen]) - log10 (remainder))); if (strlen (stext) + strlen (buf) <= stext_maxlen) strcat (stext, buf); sprintf (buf, "Total features in input file: %ld\n", totalfeatures); if (strlen (stext) + strlen (buf) <= stext_maxlen) strcat (stext, buf); for (k = 0; k < maxhash; k++) { long m; remainder = 10 * DBL_MIN; for (m = 0; m < maxhash; m++) if (k != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "#%ld (%s):" " hits: %ld, ufeats: %ld, prob: %3.2e, pR: %6.2f \n", k, hashname[k], totalhits[k], uniquefeatures[k], ptc[k], (log10 (ptc[k]) - log10 (remainder))); // strcat (stext, buf); if (strlen (stext) + strlen (buf) <= stext_maxlen) strcat (stext, buf); } // check here if we got enough room in stext to stuff everything // perhaps we'd better rise a nonfatalerror, instead of just // whining on stderr if (strcmp 
(&(stext[strlen (stext) - strlen (buf)]), buf) != 0) { nonfatalerror ("WARNING: not enough room in the buffer to create " "the statistics text. Perhaps you could try bigger " "values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?", " "); } crm_destructive_alter_nvariable (svrbl, svlen, stext, strlen (stext)); } // // Free the hashnames, to avoid a memory leak. // for (i = 0; i < maxhash; i++) free (hashname[i]); if (tprob <= min_success) { if (user_trace) fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n"); // and do what we do for a FAIL here csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk[csl->mct[csl->cstmt]->nest_level] = -1; return (0); } // // all done... if we got here, we should just continue execution if (user_trace) fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n"); regcomp_failed: return (0); } crm114-20100106-BlameMichelson.src/unionintersecttest.crm0000755000000000017500000000152611321154266021460 0ustar rootwsy#! /usr/bin/crm # # unionintersecttest.crm - Testing union and intersection # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window output /:*:_nl: CRM114: Testing union and intersection :*:_nl: :*:_nl:/ { alter (:_dw:) / a b c d e f g h i j k l m n o p q r s t u v w x y z / output /We start with this: ':*:_dw:' :*:_nl:/ match (:alpha:) /a/ match (:lima:) /l/ match (:sierra:) /s/ match (:zulu:) /z/ match (:abc:) /a b c/ match (:cde:) /c d e/ intersect (:t1:) [:abc: :cde:] output /intersection of abc and cde is t1: ':*:t1:' :*:_nl:/ union (:t2:) [:lima: :sierra:] output /union of l thru s is t2: ':*:t2:' :*:_nl:/ intersect (:t3:) [:abc: :t2:] output /intersection of abc and t2 is t3: ':*:t3:' :*:_nl:/ union (:t4:) [:zulu: :t1:] output /union of zulu and t1 is t4: ':*:t4:' :*:_nl:/ } crm114-20100106-BlameMichelson.src/classifytest.crm0000755000000000017500000000100511321154266020214 0ustar rootwsy#! 
/usr/bin/crm # # classifytest.crm - test classifying between files # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. { match (:data:) /.*/ isolate (:stats:) output /classifying between files :*:_arg2: and :*:_arg3: :*:_nl:/ { classify [:data:] (:*:_arg2: | :*:_arg3:) (:stats:) /[[:graph:]]+/ output / file :*:_arg2: matches better :*:_nl: :*:_nl: :*:stats::*:_nl:/ exit } output / file :*:_arg3: matches better :*:_nl::*:_nl::*:stats::*:_nl:/ } crm114-20100106-BlameMichelson.src/COLOPHON.txt0000644000000000017500000000365711321154266016772 0ustar rootwsy# # COLOPHON.txt - Production notes of CRM114 # # Copyright 2001-2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # The CRM114 Discriminator system was written mostly while going to and from my day job on Boston's MBTA commuter rail trains. The development machine was a Sony Picturebook C1VP running Red Hat Linux 7.2 ( soon upgraded to Red Hat 7.3, then RH 7.3 on a Fujitsu P2120). Editing was with GNU Emacs 21.2.1 , compiling was with GCC 2.96, and debugging with GDB frontended with DDD 3.3.1 . It took about 100 days of commuting to do the initial work, mostly in 1/2 hour stretches. This included design, coding, testing, and documentation. I expect that it shows. The upside of all this is that the code is simple enough to understand because it's all comprehendable in 1/2 hour stretches. The downside is that it probably reads in a somewhat choppy style. If CRM114 is useful code to someone, please use it; if you find a bug or an wierdness, send in an email and we'll create a fix or an update. Like the readme says, this isn't the PERL swiss army knife, this is a razor-sharp katana that can talk. Much of the power of CRM114 versus Perl, awk, et al is due to the linear-time and approximate regex matching engines written by Ville Laurikari, and all the glory for that particular section of the code belongs to Ville, not me. 
I would like to thank Darren Leigh, David Kramer, Reto Lichtensteiger, John Bowker, Ville Laurikari, Eric Johanssen, Adolfo Santiago, Danko Miklos, Dave Corcoran, Ben Livingood, George Burdell, P Oscar Boykin, Corrado Cau, Ruven Gottlieb, Kurt Bigler, Barry Jaspan, Fidelis Assis, Christian Siefkes, Shalendra Chhabra, Paolo Pazolli, and many others for their sharp eyes and analytic skills. I would also like to thank Richard M. Stallman and Linus Torvalds, for leading by example. As Napoleon said: "When all else fails, march toward the sound of the guns." -Bill Yerazunis crm114-20100106-BlameMichelson.src/calc.crm0000755000000000017500000000120311321154266016401 0ustar rootwsy#! /usr/bin/crm # # calc.crm - desk calculator # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window output /Enter expressions. Hit return to evaluate, empty line to exit\n / { isolate (:y:) input (:x:) match [:x:] /./ { # Eval once -- this one does variable subsitutions eval (:x:) /:+:x:/ # Eval again -- this one does the actual math eval (:y:) / :@::*:x:: / output /:*:y:\n / trap (:z:) /.*/ isolate (:y:) /I couldn't evaluate ':*:x:' ./ output /:*:y:\n/ match [:z:] /\*WARNING\* \n(.*)\nI'll try/ (:: :R:) output /:*:R:\n\n / } liaf } crm114-20100106-BlameMichelson.src/crm_expr_syscall.c0000644000000000017500000006321611321154266020522 0ustar rootwsy// crm_expr_syscall.c - system call expression handling // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. 
extern char *inbuf; extern char *outbuf; #ifndef CRM_WINDOWS // Normal options for UNIX/Linux #else // CRM_WINDOWS typedef struct { HANDLE to_minion; char *inbuf; long inlen; long internal_trace; long keep_proc; } pusherparams; typedef struct { HANDLE from_minion; int timeout; } suckerparams; DWORD WINAPI pusher_proc(LPVOID lpParameter) { DWORD bytesWritten; pusherparams *p = (pusherparams *)lpParameter; WriteFile(p->to_minion, p->inbuf, p->inlen, &bytesWritten, NULL); free(p->inbuf); if (p->internal_trace) fprintf (stderr, "pusher: input sent to minion.\n"); // if we don't want to keep this proc, we close it's input, and // wait for it to exit. if (! p->keep_proc) { CloseHandle (p->to_minion); if (internal_trace) fprintf (stderr, "minion input pipe closed\n"); } if (p->internal_trace) fprintf (stderr, "pusher: exiting pusher\n"); return 0; } DWORD WINAPI sucker_proc(LPVOID lpParameter) { DWORD bytesRead; suckerparams *p = (suckerparams *)lpParameter; char *outbuf = malloc(sizeof(char) * 8192); // we're in the sucker process here- just throw away // everything till we get EOF, then exit. while (1) { Sleep (p->timeout); ReadFile(p->from_minion, outbuf, 8192, &bytesRead, NULL); if (bytesRead == 0) break; }; return 0; } #endif // CRM_WINDOWS int crm_expr_syscall ( CSL_CELL *csl, ARGPARSE_BLOCK *apb) { // Go off and fork a process, sending that process // one pattern evaluated as input, and then accepting // all the returns from that process as the new value // for a variable. 
// // syntax is: // exec (:to:) (:from:) (:ctl:) /commandline/ long inlen; long outlen; char from_var [MAX_VARNAME]; char sys_cmd [MAX_PATTERN]; long cmd_len; char keep_buf [MAX_PATTERN]; long keep_len; char exp_keep_buf[MAX_PATTERN]; long exp_keep_len; long vstart; long vlen; long done, charsread; int keep_proc; int async_mode; int to_minion[2]; int from_minion[2]; pid_t minion; int minion_exit_status; pid_t pusher; pid_t sucker; pid_t random_child; int status; long timeout; #ifndef CRM_WINDOWS if (user_trace) fprintf (stderr, "executing an SYSCALL statement"); timeout = MINION_SLEEP_USEC; // clean up any prior processes - note that // we don't keep track of these. For that matter, we have // no context to keep track of 'em. // while ( (random_child = waitpid ( 0, &status, WNOHANG)) > 0 ); #else // CRM_WINDOWS SECURITY_ATTRIBUTES pipeSecAttr; HANDLE hminion; timeout = MINION_SLEEP_USEC / 1000; // need milliseconds for Sleep() if (MINION_SLEEP_USEC > 0 && timeout < 1) { timeout = 1; } #endif // CRM_WINDOWS // get the flags // keep_proc = 0; if (apb->sflags & CRM_KEEP) { if (user_trace) fprintf (stderr, "Keeping the process around if possible\n"); keep_proc = 1; }; async_mode = 0; if (apb->sflags & CRM_ASYNC) { if (user_trace) fprintf (stderr, "Letting the process go off on it's own"); async_mode = 1; }; // Sanity check - is incompatible with // if (keep_proc && async_mode) { nonfatalerror5 ("This syscall uses both async and keep, but async is " "incompatible with keep. Since keep is safer" "we will use that.\n", "You need to fix this program.", CRM_ENGINE_HERE); async_mode = 0; }; // get the input variable(s) // crm_get_pgm_arg (inbuf, data_window_size, apb->p1start, apb->p1len); inlen = crm_nexpandvar (inbuf, apb->p1len, data_window_size); if (user_trace) fprintf (stderr, " command's input wil be: ***%s***\n", inbuf); // now get the name of the variable where the return will be // placed... this is a crock and should be fixed someday. 
// the output goes only into a single var (the first one) // so we extract that // crm_get_pgm_arg (from_var, MAX_PATTERN, apb->p2start, apb->p2len); outlen = crm_nexpandvar (from_var, apb->p2len, MAX_PATTERN); done = 0; vstart = 0; while (from_var[vstart] < 0x021 && from_var[vstart] > 0x0 ) vstart++; vlen = 0; while (from_var[vstart+vlen] >= 0x021) vlen++; memmove (from_var, &from_var[vstart], vlen); from_var[vlen] = '\000'; if (user_trace) fprintf (stderr, " command output will overwrite var ***%s***\n", from_var); // now get the name of the variable (if it exists) where // the kept-around minion process's pipes and pid are stored. crm_get_pgm_arg (keep_buf, MAX_PATTERN, apb->p3start, apb->p3len); keep_len = crm_nexpandvar (keep_buf, apb->p3len, MAX_PATTERN); if (user_trace) fprintf (stderr, " command status kept in var ***%s***\n", keep_buf); // Get the command to execute // // GROT GROT GROT // In retrospect, putting the command to execute in /slashes/ // was a design error. It's not a pattern to match, it's a // source to operate on (in the meta sense, at least). And, // from a practical point of view, it means that pathnames with // embedded slashes are a pain in the neck to write. So- we'll // allow the boxed [string] syntax as well as the slash /string/ // syntax for now. // GROT GROT GROT if (apb->s1len > 0) { crm_get_pgm_arg (sys_cmd, MAX_PATTERN, apb->s1start, apb->s1len); cmd_len = crm_nexpandvar (sys_cmd, apb->s1len, MAX_PATTERN); }; if (apb->b1len > 0) { crm_get_pgm_arg (sys_cmd, MAX_PATTERN, apb->b1start, apb->b1len); cmd_len = crm_nexpandvar (sys_cmd, apb->b1len, MAX_PATTERN); }; if (user_trace) fprintf (stderr, " command will be ***%s***\n", sys_cmd); // Do we reuse an already-existing process? Check to see if the // keeper variable has it... note that we have to :* prefix it // and expand it again. minion = 0; to_minion[0] = 0; from_minion[1] = 0; exp_keep_buf [0] = '\000'; // this is 8-bit-safe because vars are never wchars. 
strcat (exp_keep_buf, ":*"); strncat (exp_keep_buf, keep_buf, keep_len); exp_keep_len = crm_nexpandvar (exp_keep_buf, keep_len+2, MAX_PATTERN); sscanf (exp_keep_buf, "MINION PROC PID: %d from-pipe: %d to-pipe: %d", &minion, &from_minion[0], &to_minion[1]); #ifndef CRM_WINDOWS // if, no minion already existing, we create // communications pipes and launch the subprocess. This // code borrows concepts from both liblaunch and from // netcat (thanks, *Hobbit*!) // if (minion == 0) { long status1, status2; if (user_trace) fprintf (stderr, " Must start a new minion.\n"); status1 = pipe (to_minion); status2 = pipe (from_minion); if (status1 > 0 || status2 > 0) { nonfatalerror5 ("Problem setting up the to/from pipes to a minion. ", "Perhaps the system file descriptor table is full?", CRM_ENGINE_HERE); return (1); }; minion = fork(); if (minion < 0) { nonfatalerror5 ("Tried to fork your minion, but it failed.", "Your system may have run out of process slots", CRM_ENGINE_HERE); return (1); }; if (minion == 0) { // START OF IN THE MINION // // if minion == 0, then We're in the minion here int retcode; long vstart, vlen; long varline; // close the ends of the pipes we don't need. // // NOTE: if this gets messed up, you end up with a race // condition, because both master and minion processes // can both read and write both pipes (effectively a // process could write something out, then read it again // right back out of the pipe)! So, it's REALLY REALLY // IMPORTANT that you use two pipe structures, (one for // each direction) and you keep track of which process // should write to which pipe!!! // close (to_minion[1]); close (from_minion[0]); dup2 (to_minion[0], fileno(stdin)); dup2 (from_minion[1], fileno(stdout)); // Are we a syscall to a :label:, or should we invoke the // shell on an external command? 
// crm_nextword (sys_cmd, strlen (sys_cmd), 0, &vstart, &vlen); varline = crm_lookupvarline (vht, sys_cmd, vstart, vlen); if (varline > 0) { // sys_cmd[vstart+vlen] = '\0'; if (user_trace) fprintf (stderr, "FORK transferring control to line %s\n", &sys_cmd[vstart]); // set the current pid and parent pid. { char pidstr [32]; long pid; pid = (long) getpid(); sprintf (pidstr, "%ld", pid); crm_set_temp_var (":_pid:", pidstr); if (user_trace) fprintf (stderr, "My new PID is %s\n", pidstr); pid = (long) getppid(); sprintf (pidstr, "%ld", pid); crm_set_temp_var (":_ppid:", pidstr); } // See if we have redirection of stdin and stdout while (crm_nextword (sys_cmd, strlen (sys_cmd), vstart+vlen, &vstart, &vlen)) { char filename[MAX_PATTERN]; if (sys_cmd[vstart] == '<') { strncpy (filename, &sys_cmd[vstart+1], vlen); filename[vlen-1] = '\0'; if (user_trace) fprintf (stderr, "Redirecting minion stdin to %s\n", filename); dontcareptr = freopen (filename, "rb", stdin); }; if (sys_cmd[vstart] == '>') { if (sys_cmd[vstart+1] != '>') { strncpy (filename, &sys_cmd[vstart+1], vlen); filename[vlen-1] = '\0'; if (user_trace) fprintf (stderr, "Redirecting minion stdout to %s\n", filename); dontcareptr = freopen (filename, "wb", stdout); } else { strncpy (filename, &sys_cmd[vstart+2], vlen); filename[vlen-2] = '\0'; if (user_trace) fprintf (stderr, "Appending minion stdout to %s\n", filename); dontcareptr = freopen (filename, "a+", stdout); } }; } csl->cstmt = varline; // and note that this isn't a failure. csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = 1; // The minion's real work should now start; get out of // the syscall code and go run something real. :) return (0); } else { if (user_trace) fprintf (stderr, "Systemcalling on shell command %s\n", sys_cmd); retcode = system (sys_cmd); // // This code only ever happens if an error occurs... 
// if (retcode == -1 ) { char errstr [4096]; sprintf (errstr, "The command was >%s< and returned exit code %d .", sys_cmd, WEXITSTATUS (retcode)); nonfatalerror5 ("This program tried a shell command that " "didn't run correctly. ", errstr, CRM_ENGINE_HERE); if (engine_exit_base != 0) { exit (engine_exit_base + 11); } else exit (WEXITSTATUS (retcode )); }; exit ( WEXITSTATUS (retcode) ); }; }; // END OF IN THE MINION } else { if (user_trace) fprintf (stderr, " reusing old minion PID: %d\n", minion); }; // Now, we're out of the minion for sure. // so we close the pipe ends we know we won't be using. if (to_minion[0] != 0) { close (to_minion[0]); close (from_minion[1]); }; // // launch "pusher" process to send the buffer to the minion // (this hint from Dave Soderberg). This avoids the deadly // embrace situation where both processes are waiting to read // (or, equally, both processes have written and filled up // their buffers, and are now held up waiting for the other // process to empty some space in the output buffer) // if (strlen (inbuf) > 0) { pusher = fork (); // we're in the "input pusher" process if we got here. // shove the input buffer out to the minion if (pusher == 0) { dontcare = write (to_minion[1], inbuf, inlen ); if (internal_trace) fprintf (stderr, "pusher: input sent to minion.\n"); close (to_minion[1]); if (internal_trace) fprintf (stderr, "pusher: minion input pipe closed\n"); if (internal_trace) fprintf (stderr, "pusher: exiting pusher\n"); // The pusher always exits with success, so do NOT // do not use the engine_exit_base value exit ( EXIT_SUCCESS ); }; }; // now we're out of the pusher process. // if we don't want to keep this proc, we close it's input, and // wait for it to exit. if (! keep_proc) { close (to_minion[1]); if (internal_trace) fprintf (stderr, "minion input pipe closed\n"); } // and see what is in the pipe for us. 
outbuf[0] = '\000'; done = 0; outlen = 0; // grot grot grot this only works if varnames are not widechars if (strlen (from_var) > 0) { if (async_mode == 0 && keep_proc == 0) { usleep (timeout); // synchronous read- read till we hit EOF, which is read // returning a char count of zero. readloop: if (internal_trace) fprintf (stderr, "SYNCH READ "); usleep (timeout); charsread = read (from_minion[0], &outbuf[done], (data_window_size >> SYSCALL_WINDOW_RATIO) - done - 2); done = done + charsread; if ( charsread > 0 && done + 2 < (data_window_size >> SYSCALL_WINDOW_RATIO)) goto readloop; if (done < 0) done = 0; outbuf [done] = '\000'; outlen = done ; }; if (keep_proc == 1 || async_mode == 1) { // we're in either 'keep' 'async' mode. Set nonblocking mode, then // read it once; then put it back in regular mode. //fcntl (from_minion[0], F_SETFL, O_NONBLOCK); // usleep (timeout); charsread = read (from_minion[0], &outbuf[done], (data_window_size >> SYSCALL_WINDOW_RATIO)); done = charsread; if (done < 0) done = 0; outbuf [done] = '\000'; outlen = done ; //fcntl (from_minion[0], F_SETFL, 0); }; // If the minion process managed to fill our buffer, and we // aren't "keep"ing it around, OR if the process is "async", // then we should also launch a sucker process to // asynchronously eat all of the stuff we couldn't get into // the buffer. The sucker proc just reads stuff and throws it // away asynchronously... and exits when it gets EOF. // if ( async_mode || (outlen >= ((data_window_size >> SYSCALL_WINDOW_RATIO) - 2 ) && keep_proc == 0)) { sucker = fork (); if (sucker == 0) { // we're in the sucker process here- just throw away // everything till we get EOF, then exit. while (1) { usleep (timeout); charsread = read (from_minion[0], &outbuf[0], data_window_size >> SYSCALL_WINDOW_RATIO ); // in the sucker here, don't use engine_exit_base exit if (charsread == 0) exit (EXIT_SUCCESS); }; }; }; // and set the returned value into from_var. 
if (user_trace) fprintf (stderr, "SYSCALL output: %ld chars ---%s---.\n ", outlen, outbuf); if (internal_trace) fprintf (stderr, " storing return str in var %s\n", from_var); crm_destructive_alter_nvariable ( from_var, vlen, outbuf, outlen); }; // Record useful minion data, if possible. if (strlen (keep_buf) > 0) { sprintf (exp_keep_buf, "MINION PROC PID: %d from-pipe: %d to-pipe: %d", minion, from_minion[0], to_minion[1]); if (internal_trace) fprintf (stderr, " saving minion state: %s \n", exp_keep_buf); crm_destructive_alter_nvariable (keep_buf, keep_len, exp_keep_buf, strlen (exp_keep_buf)); }; // If we're keeping this minion process around, record the useful // information, like pid, in and out pipes, etc. if (keep_proc || async_mode) { } else { if (internal_trace) fprintf (stderr, "No keep, no async, so not keeping minion, closing everything.\n"); // de-zombify any dead minions; waitpid ( minion, &minion_exit_status, 0); // we're not keeping it around, so close the pipe. // close (from_minion [0]); if ( crm_vht_lookup (vht, keep_buf, strlen (keep_buf))) { char exit_value_string[MAX_VARNAME]; if (internal_trace) fprintf (stderr, "minion waitpid result :%d; whacking %s\n", minion_exit_status, keep_buf); sprintf (exit_value_string, "DEAD MINION, EXIT CODE: %d", WEXITSTATUS (minion_exit_status)); if (keep_len > 0) crm_destructive_alter_nvariable (keep_buf, keep_len, exit_value_string, strlen (exit_value_string)); }; }; #else // CRM_WINDOWS // if, no minion already existing, we create // communications pipes and launch the subprocess. This // code borrows concepts from both liblaunch and from // netcat (thanks, *Hobbit*!) 
// if (minion == 0) { int retcode; long vstart, vlen; long varline; if (user_trace) fprintf (stderr, " Must start a new minion.\n"); pipeSecAttr.nLength = sizeof(SECURITY_ATTRIBUTES); pipeSecAttr.bInheritHandle = TRUE; pipeSecAttr.lpSecurityDescriptor = NULL; status = CreatePipe(&to_minion[0], &to_minion[1], &pipeSecAttr, 2^10 * 32); status = CreatePipe(&from_minion[0], &from_minion[1], &pipeSecAttr, 2^10 * 32); crm_nextword (sys_cmd, strlen (sys_cmd), 0, &vstart, &vlen); varline = crm_lookupvarline (vht, sys_cmd, vstart, vlen); if (varline > 0) { fatalerror5 (" Sorry, syscall to a label isn't implemented in this version", "", CRM_ENGINE_HERE); } else { STARTUPINFO si; PROCESS_INFORMATION pi; HANDLE stdout_save, stdin_save; HANDLE to_minion_write, from_minion_read; stdout_save = GetStdHandle(STD_OUTPUT_HANDLE); SetStdHandle(STD_OUTPUT_HANDLE, from_minion[1]); stdin_save = GetStdHandle(STD_INPUT_HANDLE); SetStdHandle(STD_INPUT_HANDLE, to_minion[0]); DuplicateHandle(GetCurrentProcess(), from_minion[0], GetCurrentProcess(), &from_minion_read , 0, FALSE, DUPLICATE_SAME_ACCESS); CloseHandle(from_minion[0]); from_minion[0] = from_minion_read; DuplicateHandle(GetCurrentProcess(), to_minion[1], GetCurrentProcess(), &to_minion_write , 0, FALSE, DUPLICATE_SAME_ACCESS); CloseHandle(to_minion[1]); to_minion[1] = to_minion_write; if (user_trace) fprintf (stderr, "systemcalling on shell command %s\n", sys_cmd); ZeroMemory( &si, sizeof(si) ); si.cb = sizeof(si); ZeroMemory( &pi, sizeof(pi) ); retcode = CreateProcess(NULL, sys_cmd, NULL, NULL, TRUE , NULL, NULL, NULL, &si, &pi); if (!retcode) { char errstr [4096]; sprintf (errstr, "The command was >>%s<< and returned exit code %d .", sys_cmd, retcode); fatalerror5 ("This program tried a shell command that " "didn't run correctly. 
", errstr, CRM_ENGINE_HERE); { if (engine_exit_base != 0) { exit (engine_exit_base + 13); } else exit ( EXIT_FAILURE ); } } else { minion = pi.dwProcessId; hminion = pi.hProcess; SetStdHandle(STD_OUTPUT_HANDLE, stdout_save); SetStdHandle(STD_INPUT_HANDLE, stdin_save); CloseHandle(pi.hThread); } }; } else { if (user_trace) fprintf (stderr, " reusing old minion PID: %d\n", minion); hminion = OpenProcess(PROCESS_ALL_ACCESS, 0, minion); if (hminion == NULL) fatalerror5 ("Couldn't open the existing minion process", "", CRM_ENGINE_HERE); }; // Now, we're out of the minion for sure. // so we close the pipe ends we know we won't be using. if (to_minion[0] != 0) { CloseHandle (to_minion[0]); CloseHandle (from_minion[1]); }; // // launch "pusher" process to send the buffer to the minion // (this hint from Dave Soderberg). This avoids the deadly // embrace situation where both processes are waiting to read // (or, equally, both processes have written and filled up // their buffers, and are now held up waiting for the other // process to empty some space in the output buffer) // if (strlen (inbuf) > 0) { HANDLE hThread; pusherparams pp; char *inbuf_copy = malloc(sizeof(char) * inlen+1); int i; //Since the pusher thread may continue executing after the //syscall statement has finished, we need to make a copy of //inbuf for the pusher thread to use. The pusher process will //free the memory. for (i=0; i 0) { if (async_mode == 0) { Sleep (timeout); // synchronous read- read till we hit EOF, which is read // returning a char count of zero. readloop: if (internal_trace) fprintf (stderr, "SYNCH READ "); Sleep (timeout); charsread = 0; ReadFile(from_minion[0], outbuf + done, (data_window_size >> SYSCALL_WINDOW_RATIO) - done - 2, &charsread, NULL); done = done + charsread; if (charsread > 0 && done + 2 < (data_window_size >> SYSCALL_WINDOW_RATIO)) goto readloop; if (done < 0) done = 0; outbuf [done] = '\000'; outlen = done ; } else { // we're in 'async' mode. 
Just grab what we can ReadFile(from_minion[0], &outbuf[done], (data_window_size >> SYSCALL_WINDOW_RATIO), &charsread, NULL); done = charsread; if (done < 0) done = 0; outbuf [done] = '\000'; outlen = done ; } // If the minion process managed to fill our buffer, and we // aren't "keep"ing it around, OR if the process is "async", // then we should also launch a sucker process to // asynchronously eat all of the stuff we couldn't get into // the buffer. The sucker proc just reads stuff and throws it // away asynchronously... and exits when it gets EOF. // if ( async_mode || (outlen >= ((data_window_size >> SYSCALL_WINDOW_RATIO) - 2 ) && keep_proc == 0)) { HANDLE hThread; suckerparams sp; sp.from_minion = from_minion[0]; sp.timeout = timeout; CreateThread(NULL, 0, sucker_proc , &sp , NULL, &hThread); } // and set the returned value into from_var. if (user_trace) fprintf (stderr, "SYSCALL output: %ld chars ---%s---.\n ", outlen, outbuf); if (internal_trace) fprintf (stderr, " storing return str in var %s\n", from_var); crm_destructive_alter_nvariable ( from_var, vlen, outbuf, outlen); } // Record useful minion data, if possible. if (strlen (keep_buf) > 0) { sprintf (exp_keep_buf, "MINION PROC PID: %d from-pipe: %d to-pipe: %d", minion, from_minion[0], to_minion[1]); if (internal_trace) fprintf (stderr, " saving minion state: %s \n", exp_keep_buf); crm_destructive_alter_nvariable (keep_buf, keep_len, exp_keep_buf, strlen (exp_keep_buf)); }; // If we're keeping this minion process around, record the useful // information, like pid, in and out pipes, etc. if (!keep_proc && !async_mode) { DWORD exit_code; if (internal_trace) fprintf (stderr, "No keep, no async, so not keeping minion, closing everything.\n"); // no, we're not keeping it around, so close the pipe. 
// CloseHandle(from_minion [0]); WaitForSingleObject(hminion, INFINITE); if (!GetExitCodeProcess(hminion, &exit_code)) { DWORD error = GetLastError(); } if ( crm_vht_lookup (vht, keep_buf, strlen (keep_buf))) { char exit_value_string[MAX_VARNAME]; if (internal_trace) fprintf (stderr, "minion exit code :%d; whacking %s\n", exit_code, keep_buf); sprintf (exit_value_string, "DEAD MINION, EXIT CODE: %d", exit_code); if (keep_len > 0) crm_destructive_alter_nvariable (keep_buf, keep_len, exit_value_string, strlen (exit_value_string)); }; CloseHandle(hminion); }; #endif // CRM_WINDOWS return (0); }; crm114-20100106-BlameMichelson.src/defaulttest.crm0000755000000000017500000000051211321154266020025 0ustar rootwsy#! /usr/bin/crm # # defaulttest.crm - test that default actually works # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window output / \n\nCRM114: testing default command line args \n/ output /blah 1 = :*:blah:\n/ isolate < default > (:blah:) /new value/ output /blah 2 = :*:blah:\n/ crm114-20100106-BlameMichelson.src/paolo_overvars.crm0000755000000000017500000000216111321154266020544 0ustar rootwsy#! /usr/bin/crm # # paolo_overvars.com - paolo written testscript # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. 
#
#  Exercises how a match-variable :b: (captured inside :a:) survives
#  repeated alter/isolate operations on :a:.
{
window
output /isolate :a: as 'hdgdgb aaa hdgdb', match b as \/aaa\/\n/
isolate (:a:) /hdgdgb aaa hdgdb/
match (:b:) [:a:] /aaa/
output /a=:*:a: - b=:*:b:\n/
output /alter :a: as 'x'\n/
alter (:a:) /x/
output /a=:*:a: - b=:*:b:\n/
output /\nre-isolate :a: as 'hdgdgb bbb hdgdb'\n/
isolate (:a:) /hdgdgb bbb hdgdb/
output /a=:*:a: - b=:*:b:\n/
output /\nnow match :b: to :a:'s 'bbb' section\n/
match (:b:) [:a:] /bbb/
output /a=:*:a: - b=:*:b:\n/
output /\nnow alter :a: to 'x' again\n/
alter (:a:) /x/
output /a=:*:a: - b=:*:b:\n/
output /\nre-re-isolate :a: as 'hdgdgb ccc hdgdb'\n/
isolate (:a:) /hdgdgb ccc hdgdb/
output /a=:*:a: - b=:*:b:\n/
output /\nnow match :b: to :a:'s 'ccc' section\n/
match (:b:) [:a:] /ccc/
output /a=:*:a: - b=:*:b:\n/
output /\nnow alter :a: to 'x' again\n/
alter (:a:) /x/
output /a=:*:a: - b=:*:b:\n/
}
crm114-20100106-BlameMichelson.src/crm114.h0000644000000000017500000004057411321154266016167 0ustar  rootwsy
//  crm114.h - general include settings for crm114
//  Copyright 2009 William S. Yerazunis.
//  This file is under GPLv3, as described in COPYING.

#ifndef __CRM114_H__
#define __CRM114_H__

//
//   Global variables
//
//  NOTE(review): these are variable *definitions* in a header; when
//  crm114.h is included by several .c files this relies on common-symbol
//  linkage (-fcommon).  Consider 'extern' declarations here with a single
//  definition in one .c file -- TODO confirm build settings.

//   The VHT (Variable Hash Table)
VHT_CELL **vht;

//   The pointer to the global Current Stack Level (CSL) frame
CSL_CELL *csl;

//   the data window
CSL_CELL *cdw;

//   the temporary data window (where argv, environ, newline etc. live)
CSL_CELL *tdw;

//   the pointer to a CSL that we use during matching.  This is flipped
//   to point to the right data window during matching.  It doesn't have
//   its own data, unlike cdw and tdw.
CSL_CELL *mdw;

//   a pointer to the current statement argparse block.  This gets whacked
//   on every new statement.
ARGPARSE_BLOCK *apb; // the microcompiler int crm_microcompiler (CSL_CELL *csl, VHT_CELL **vht); #define CRM_ENGINE_HERE (char*)__FILE__, (char *)__FUNCTION__, (unsigned)__LINE__ // helper routine for untrappable errors void untrappableerror (char *msg1, char *msg2); void untrappableerror5 (char *msg1, char *msg2, char *filename, char *function, unsigned lineno); // helper routine for fatal errors long fatalerror (char *msg1, char *msg2); long fatalerror5 (char *msg1, char *msg2, char *filename, char *function, unsigned lineno); // helper routine for nonfatal errors long nonfatalerror (char *msg1, char *msg2); long nonfatalerror5 (char *msg1, char *msg2, char *filename, char *function, unsigned lineno ); // hash function for variable tables unsigned int strnhash (char *str, long len); // string translate function - for the TRANSLATE function long strntrn ( unsigned char *datastr, long *datastrlen, long maxdatastrlen, unsigned char *fromstr, long fromstrlen, unsigned char *tostr, long tostrlen, long flags); // basic math evaluator top function long strmath (char *buf, long inlen, long maxlen, long *retstat); // basic math evaluator in RPN long strpnmath (char *buf, long inlen, long maxlen, long *retstat); // basic math evaluator in RPN long stralmath (char *buf, long inlen, long maxlen, long *retstat); // load a file with info in a partially filled out csl cell. int crm_load_csl (CSL_CELL *csl); // alter a variable to another value (this is destructive!) void crm_destructive_alter_nvariable (char *varname, long varlen, char *newstr, long newlen); // setting a program label in the VHT void crm_setvar ( char *filename, // file where first defined (or NULL) int filedesc, // filedesc of defining file (or NULL) char *nametxt, // block of text hosting variable name long nstart, // index into nametxt to start varname long nlen, // length of name char *valtxt, // text block hosts the captured value long vstart, // index of start of cap. 
value long vlen, // length of captured value long linenumber, // linenumber (if pgm, else -1) long lazy_redirects // if nonzero, this is a lazy redirect ); // put a variable and a value into the temporary area void crm_set_temp_nvar (char *varname, char *value, long vallen); void crm_set_temp_var (char *varname, char *value); // put a counted-length var and a data-window-based value into the temp area. void crm_set_windowed_nvar (char *varname, long varlen, char *valtext, long start, long len, long stmtnum); // preprocess the program... including fixing up semicolons. int crm_preprocessor (CSL_CELL *csl, int flags); void crm_break_statements (long ini, long nchars, CSL_CELL *csl); // actually execute a compiled CRM file int crm_invoke (); // look up a variable line number (for GOTOs among other things) long crm_lookupvarline (VHT_CELL **vht, char *text, long start, long len); // grab_delim_string looks thru char *in for the first occurrence // of delim[0]. It then looks for the next occurrence of delim[1], // (with an escape character of delim[2]), // and copies the resulting string (without the delims) into res, // null-terminating the result. At most reslen-1 charscters // are copied, and at most inlen characters are checked. The return // value of this function is the address of the closing delimiter in *in. // // flags: CRM_FIRST_CLOSE - first close delimiter found ends string. // CRM_LAST_CLOSE - last close delimiter found ends string // CRM_COUNT_CLOSE - keep a count of open and close delims NYI // char *grab_delimited_string (char *res, char *in, char *delim, long inlen, long reslen, long flags); // expand the variable in the input buffer (according to the :*: operator long crm_expandvar (char *buf, long maxlen); // look up a vht cell, from a variable name. Returns the VHT cell // it's either stored in, or ought to be stored in (i.e. check for a NULL // VHT cell before use). 
long crm_vht_lookup (VHT_CELL **vht, char *vname, long vlen); // initialize the vht, insert some some useful variables void crm_vht_init (int argc, char **argv); // Surgically lengthen or shorten a window. The window pointed to by // mdw gets delta extra characters added or cut at "where". If the // allocated length is not enough, additional space can be malloced. // Finally, the vht is fixed up so everything still points "correctly". void crm_slice_and_splice_window ( CSL_CELL *mdw, long where, long delta); // Update the start and length of all captured variables whenever // the input buffer gets mangled. Mangles are all expressed in // the form of a start point, and a delta. void crm_updatecaptures (char *text, long loc, long delta); // A helper function to calculate what the proper changes are for // any marked point, given a dot and a delta on that dot. (sl is // 0 for a start, and 1 for an end mark). long crm_mangle_offset ( long mark, long dot, long delta, long sl); // Possibly reclaim storage in the given zone. long crm_compress_tdw_section (char *oldtext, long oldstart, long oldend); // create a new .css file int crm_create_cssfile(char *cssfile, long buckets, long major, long minor, long spectrum_start); // The magic flag parser. Given a string of input, and the builtin // crm_flags array, returns the flags that are set. // // for each input[i], is it equal to some member of flag_string[j]? // if YES, then // out_code[i] gets the value of flag_code[j] // count_code[j] gets incremented. // if NONE match, then out_code[j] is zero // // This makes it easy to parse a flag set for presence // // Note that this is a long long- which limits us to no more than // 64 discrete flags. unsigned long long crm_flagparse (char *input, long inlen); // the user input // get the next word in the input. (note- the regex stops only when // one hits a NULL, which may yield a slightly bogus result. 
long crm_nextword ( char *input, long inlen, long starthere, long *start, long *len); // The big one - matching... int crm_expr_match (CSL_CELL *csl, ARGPARSE_BLOCK *apb); // the learner... in variant forms... int crm_expr_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb); int crm_expr_markov_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_osb_bayes_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_correlate_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_osb_winnow_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_osb_hyperspace_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_bit_entropy_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_svm_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_svm_learn(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen); int crm_pca_learn(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen); int crm_expr_sks_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_neural_net_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_fast_substring_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); // The bigger one - classifying... 
int crm_expr_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb); int crm_expr_markov_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_osb_bayes_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_correlate_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_osb_winnow_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_osb_hyperspace_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_bit_entropy_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_expr_svm_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_svm_classify(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen); int crm_pca_classify(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen); int crm_expr_sks_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_neural_net_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); int crm_fast_substring_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txt, long start, long len); // surgically alter a variable int crm_expr_alter (CSL_CELL *csl, ARGPARSE_BLOCK *apb); // EVAL - double-evaluate for indirectiion's sake. 
Otherwise, it's just // like ALTER int crm_expr_eval (CSL_CELL *csl, ARGPARSE_BLOCK *apb); // WINDOW - do a windowing operation on a variable int crm_expr_window ( CSL_CELL *csl, ARGPARSE_BLOCK *apb); // ISOLATE - do an isolation int crm_expr_isolate ( CSL_CELL *csl, ARGPARSE_BLOCK *apb); int crm_isolate_this (long *vptr, char *nametext, long namestart, long namelen, char *valuetext, long valuestart, long valuelen); // INPUT - do input int crm_expr_input ( CSL_CELL *csl, ARGPARSE_BLOCK *apb); // OUTPUT - do an output int crm_expr_output ( CSL_CELL *csl, ARGPARSE_BLOCK *apb); // SYSCALL - fork another process int crm_expr_syscall ( CSL_CELL *csl, ARGPARSE_BLOCK *apb); // TRANSLATE - translate character sets int crm_expr_translate ( CSL_CELL *csl, ARGPARSE_BLOCK *apb); // CLUMP and PMULC int crm_expr_clump (CSL_CELL *csl, ARGPARSE_BLOCK *apb); int crm_expr_pmulc (CSL_CELL *csl, ARGPARSE_BLOCK *apb); // parse a CRM114 statement; this is mostly a setup routine for // the generic parser. int crm_statement_parse ( char *in, long slen, ARGPARSE_BLOCK *apb); // and a genric parser routine for parsing a line according // to the type of qoting done. int crm_generic_parse_line ( char *txt, // the start of the program line long len, // how long is the line char *schars, // characters that can "start" an arg char *fchars, // characters that "finish" an arg char *echars, // characters that escape in an arg long maxargs, // howm many things to search for (max) long *ftype, // type of thing found (index by schars) long *fstart, // starting location of found arg long *flen // length of found arg ); // and to avoid all the mumbo-jumbo, an easy way to get a copy of // an arg found by the declensional parser. void crm_get_pgm_arg (char *to, long tolen, char *from, long fromlen) ; // The vector tokenizer - used to turn text into hash vectors. // long crm_vector_tokenize_selector ( ARGPARSE_BLOCK *apb, // The args for this line of code char *text, // input string (null-safe!) 
long textlen, // how many bytes of input. long start_offset, // start tokenizing at this byte. char *regex, // the parsing regex (might be ignored) int regexlen, // length of the parsing regex int *coeff_array, // the pipeline coefficient control array int pipe_len, // how long a pipeline (== coeff_array row length) int pipe_iters, // how many rows are there in coeff_array unsigned *features, // where the output features go long featureslen, // how many output features (max) long *features_out, // how many longs did we actually use up long *next_offset // next invocation should start at this offset ); // crm execution-time debugging environment - an interpreter unto itself // long crm_debugger (); // expand a variable or string with known length (8-bit and null-safe) long crm_nexpandvar (char *buf, long inlen, long maxlen); // execute a FAULT triggering. long crm_trigger_fault (char *reason); // do a microgroom of a hashed file. long crm_microgroom (FEATUREBUCKET_STRUCT *h, unsigned char *seen_features, long hs, unsigned long hindex ); void crm_packcss (FEATUREBUCKET_STRUCT *h, unsigned char *seen_features, long hs, long packstart, long packlen); void crm_packseg (FEATUREBUCKET_STRUCT *h, unsigned char *seen_features, long hs, long packstart, long packlen); // // and microgrooming for winnow files long crm_winnow_microgroom (WINNOW_FEATUREBUCKET_STRUCT *h, unsigned char *seen_features , unsigned long hfsize, unsigned long hindex); void crm_pack_winnow_css (WINNOW_FEATUREBUCKET_STRUCT *h, unsigned char* xhashes, long hs, long packstart, long packlen); void crm_pack_winnow_seg (WINNOW_FEATUREBUCKET_STRUCT *h, unsigned char* xhashes, long hs, long packstart, long packlen); // print out timings of each statement void crm_output_profile ( CSL_CELL *csl); // var-expansion operators // simple (escapes and vars) expansion long crm_nexpandvar (char *buf, long inlen, long maxlen); // complex (escapes, vars, strlens, and maths) expansion long crm_qexpandvar (char *buf, long 
inlen, long maxlen, long *retstat); // generic (everything, as you want it, bitmasked) expansion long crm_zexpandvar (char *buf, long inlen, long maxlen, long *retstat, long exec_bitmask); // Var-restriction operators (do []-vars, like subscript and regex ) long crm_restrictvar ( char *boxstring, long boxstrlen, long *vht_idx, char **outblock, long *outoffset, long *outlen, char *errstr); // helper function for LEARN/CLASSIFY long crm_exec_box_restriction(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char **txt, long *start, long *len); // crm114-specific regex compilation int crm_regcomp (regex_t *preg, char *regex, long regex1_len, int cflags); int crm_regexec ( regex_t *preg, char *string, long string_len, size_t nmatch, regmatch_t pmatch[], int eflags, char *aux_string); size_t crm_regerror (int errocode, regex_t *preg, char *errbuf, size_t errbuf_size); void crm_regfree (regex_t *preg); char * crm_regversion (); // Portable mmap/munmap // void *crm_mmap_file (char *filename, long start, long len, long prot, long mode, long *actual_len); void crm_munmap_file (void *where); void crm_munmap_all (); void crm_force_munmap_filename (char *filename); void crm_force_munmap_addr (void *addr); // Some statistics functions // double crm_norm_cdf(double x); double crm_log(double x); double norm_pdf(double x); double normalized_gauss(double x, double s); double crm_frand (); // The following mumbo-jumbo needed for BSD to compile cleanly, because // BSD's logl function is not defined in all builds! What a crock! #ifdef NO_LOGL #warning Redefinining crm_logl as log because logl is missing #define crm_logl(x) log(x) #else #define crm_logl(x) logl(x) #endif #ifdef NO_SQRTF #warning Redefining sqrtf as sqrt because sqrtf is missing #define sqrtf(x) sqrt((x)) #endif // End BSD crapola. 
#endif // !__CRM114_H__ crm114-20100106-BlameMichelson.src/crm_expr_window.c0000644000000000017500000005214011321154266020351 0ustar rootwsy// crm_expr_window.c - window operation // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the auxilliary input buffer (for WINDOW input) extern char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *tempbuf; // a helper function that should be in the C runtime lib but isn't. // char *my_strnchr (const char *str, long len, int c) { long i; i = 0; for (i = 0; i < len; i++) { if (str[i] == (char) c) return ((char *) &(str[i])); }; return (NULL); } int crm_expr_window (CSL_CELL *csl, ARGPARSE_BLOCK *apb) { // a window operation - two steps...first is to discard // everything up till the first regex match, and second // is to add more data until the second regex is // satisfied by the incoming data. We just add the // incoming data onto the back of the window buffer, and // when we get a read completes. // // Yes, there are more efficient, less memory-intensive ways // to do this, but this is simple and unlikely to be broken in // subtle ways. static long newbuflen = 0; char pch [MAX_PATTERN]; long i; long srcidx; int inputsrc; char inputsrcname[MAX_VARNAME]; long inputsrclen; char *savedinputtxt; long savedinputtxtlen; char wvname [MAX_VARNAME]; long wvnamelen; CSL_CELL* mdw; long flen; int regexflags; regex_t preg; // int inputmode; int inputsize; // can be by char or by EOF < bychar byeof > int inputretryEOF; // do we retry an EOF? 
< eoffails eofretry > int inputEOFaccept; // accept an EOF as pat-end < acceptEOF > int saweof; int failout; long vmidx; regmatch_t matches [2]; // we're only interested in the first match. int done; int firsttime; inputsrcname[0] = '\000'; inputsrclen = 0; // wvname[0] = '\000'; //wvnamelen = 0; srcidx = 0; savedinputtxt = NULL; savedinputtxtlen = 0; failout = 0; if (user_trace) fprintf (stderr, "Executing a 'window' operation\n"); // there's the choice of input from a // variable, or input from stdin. This is controlled strictly // by whether there's a [] in the statement (someday it may // allow other files than stdin, but not yet.) So right now, it's- // 1) read from the variable [:foo:] if supplied, else // 2) read from STDIN (default) // these are inputsrc=FROM_VAR vs FROM_STDIN #define FROM_STDIN 0 #define FROM_VAR 1 #define FROM_VAR_DONE 2 // // Second, there's how much to read "preemptively", that is, // to read ahead, but with the possibility of reading ahead too // much (and thereby messing up a script or other typeahead that // another program sharing stdin was meant to actually read. // The three choices we support are: // 1) read everything available (BYEOF), else // 3) read one character at a time (BYCHAR) (default) // these are inputsize = bychar, byeof #define BYCHAR 0 #define BYEOF 1 #define BYCHUNK 2 #define BYLINE 999 // DANGER - BYLINE IS NOT SUPPORTED ANY MORE!!! // // Third, there's the question of what to do if the read doesn't // have enough material to satisfy the second regex (i.e. we hit // end of variable or EOF first). // // Our options are // // 1) just fail. (the default) // 2) just accept what we got, even though it doesn't fulfill // the paste regex (accepteof). // these are expressed as inputEOFaccept= ... #define EOFFAILS 0 #define EOFACCEPTS 1 // // As to other behavior, we can also clear the eof, wait a // bit, and try again, so we have: // // 1) leave EOF's alone. 
// 2) try to reset the EOF before reading // these are denoted by inputretryEOF = ... #define EOFSTAYS 0 #define EOFRETRY 1 // check for the flags // // default is BYCHAR inputsrc = 0; inputEOFaccept = EOFFAILS; inputsize = BYCHAR; inputretryEOF = EOFSTAYS; if (apb->sflags & CRM_BYCHAR) { if (user_trace) fprintf (stderr, " window input by character\n"); inputsize = BYCHAR; }; if (apb->sflags & CRM_BYCHUNK) { if (user_trace) fprintf (stderr, " window input by chunks\n"); inputsize = BYCHUNK; }; if (apb->sflags & CRM_BYEOF) { if (user_trace) fprintf (stderr, " window input by EOFs\n"); inputsize = BYEOF; }; inputEOFaccept = EOFFAILS; if (apb->sflags & CRM_EOFACCEPTS) { if (user_trace) fprintf (stderr, " window input EOF is always accepted\n"); inputEOFaccept = EOFACCEPTS; }; inputretryEOF = EOFSTAYS; if (apb->sflags & CRM_EOFRETRY) { if (user_trace) fprintf (stderr, " window input EOF is retried\n"); inputretryEOF = EOFRETRY; }; regexflags = REG_EXTENDED; if (apb->sflags & CRM_NOCASE) { if (user_trace) fprintf (stderr, " no case matching turned on\n "); regexflags = regexflags | REG_ICASE; }; if (apb->sflags & CRM_NOCASE) { if (user_trace) fprintf (stderr, " no case matching turned on\n "); regexflags = regexflags | REG_ICASE; }; if (apb->sflags & CRM_LITERAL) { if (user_trace) fprintf (stderr, " no case matching turned on\n "); regexflags = regexflags | REG_LITERAL; }; // part 1: dispose of old window worth of data. If no match, // dispose of all of the old window. // // get the disposal pattern // crm_get_pgm_arg (pch, MAX_PATTERN, apb->s1start, apb->s1len); // null window check - if no cut or paste patterns, then we // just skip to the end of the WINDOW statement code // which is how a WINDOW statement can be used to have a // program "come out running" before reading stdin. if (apb->s1len == 0 && apb->s2len == 0) goto crm_window_no_changes_made; // We have the first pattern in pch. We ought to look for the // appropriate flags here (common code, anyone?) 
but for now, // we'll just do a brutally straightforward expansion and then // matching. if (internal_trace) fprintf (stderr, " window cut pattern ---%s---\n", pch); flen = apb->s1len; // expand the match pattern flen = crm_nexpandvar (pch, apb->s1len, MAX_PATTERN); // // compile the regex i = crm_regcomp (&preg, pch, flen, regexflags); if ( i > 0) { crm_regerror ( i, &preg, tempbuf, data_window_size); nonfatalerror5 ("Regular Expression Compilation Problem:", tempbuf, CRM_ENGINE_HERE); goto invoke_bailout; }; // Get the variable we're windowing. If there's no such // variable, we default to :_dw: crm_get_pgm_arg (wvname, MAX_PATTERN, apb->p1start, apb->p1len); wvnamelen = crm_nexpandvar (wvname, apb->p1len, MAX_PATTERN); // if no svname, then we're defaulted to :_dw: if (strlen (wvname) == 0) { strcat (wvname, ":_dw:"); wvnamelen = strlen (":_dw:"); }; vmidx = crm_vht_lookup (vht, wvname, strlen (wvname)); if (vht[vmidx] == NULL) { nonfatalerror5 ("We seem to be windowing a nonexistent variable.", "How very bizarre.", CRM_ENGINE_HERE); goto invoke_bailout; } mdw = NULL; if (vht[vmidx]->valtxt == cdw->filetext) mdw = cdw; if (vht[vmidx]->valtxt == tdw->filetext) mdw = tdw; if (mdw == NULL) { nonfatalerror5 ("We seem to have lost the windowed var buffer", "This is just plain sick.", CRM_ENGINE_HERE); goto invoke_bailout; } // // // OK, we've got the arguments for part 1 - the cutting out // of the old data. So, let's do the cut. // // execute the regex. i = crm_regexec ( &preg, &(vht[vmidx]->valtxt[vht[vmidx]->vstart]), vht[vmidx]->vlen, 1, matches, 0, NULL); crm_regfree (&preg); // starting offset of the "keep section" is at matches[0].rm.eo // so we use crm_slice_and_splice_window to get rid of it. // if (i == 0) { // delete everything up to and including the delimiter crm_slice_and_splice_window (mdw, vht[vmidx]->vstart, -matches[0].rm_eo); } else { // didn't find the terminator pattern at all, which means we // flush the input window completely. 
crm_slice_and_splice_window (mdw, vht[vmidx]->vstart, -vht[vmidx]->vlen); }; if (user_trace) fprintf (stderr, " cut completed, variable length after cut is %ld\n", vht[vmidx]->vlen); //************************************************************** // OK, part one is done- we've windowed off the first // part of the input. // // Now we put the new // // Now we get the "put" half of the regex. if (user_trace) fprintf (stderr, " now finding new section to add to end.\n"); crm_get_pgm_arg (pch, MAX_PATTERN, apb->s2start, apb->s2len); flen = apb->s2len; if (user_trace) fprintf (stderr, "adding input with terminator of --%s--,", pch); // expand the match pattern flen = crm_nexpandvar (pch, flen, MAX_PATTERN); if (user_trace) fprintf (stderr, " which expands to --%s--", pch); // // compile the paste match regex i = crm_regcomp (&preg, pch, flen, regexflags); if ( i > 0) { crm_regerror ( i, &preg, tempbuf, data_window_size); nonfatalerror5 ("Regular Expression Compilation Problem:", tempbuf, CRM_ENGINE_HERE); goto invoke_bailout; }; // decide - do we suck input from stdin, or from // a variable that's already here? // // Get the input source, if one is supplied (2nd set of parens is // the var to use as input source, if it exists) crm_get_pgm_arg (inputsrcname, MAX_PATTERN, apb->p2start, apb->p2len); inputsrclen = apb->p2len; if (apb->p2start) { // NonZero input source variable, so we're gonna take our input // from this input variable. inputsrc = FROM_VAR; if (user_trace) fprintf (stderr, " getting input from var %s\n", inputsrcname); }; // // Now, depending on inputmode, we set up the final pasting // to do the right thing (the final pasting params are in // matches[0] ). // // we'll set up dummy limits for now though... // matches[0].rm_so = 0; matches[0].rm_eo = 0; // Now, the WHILE loop to find satisfaction for the second // regex, within the boundaries of from_var vs from_stdin, and // byline vs bychar vs byeof. So it's really a read/test/maybe_loop // loop. 
done = 0; saweof = 0; firsttime = 1; while (! done) { // // Switch on whether we're reading from a var or from // standard input. (either way, we use the newinputbuf) // switch (inputsrc) { case FROM_VAR: { // we're supposed to grab our input from an input variable. // so we fake it as though it came from a file. // // Later on, we have to undo the faking, and also modify // the length of the input variable (cutting out the stuff // that went into the WINDOW). // diagnostic - what was in the newinputbuf before this stmt? if (user_trace) { fprintf (stderr, " Using input source from variable %s\n", inputsrcname); fprintf (stderr, " prior newinput buf --%s--\n", newinputbuf); } // Get the source input stuff // srcidx = crm_vht_lookup (vht, inputsrcname, inputsrclen); if (vht[srcidx] == NULL) { nonfatalerror5 ("Trying to take WINDOW input from" "nonexistent variables doesn't work," "in this case, from :", inputsrcname, CRM_ENGINE_HERE); goto invoke_bailout; }; // // // malloc up some temporary space to keep the static input // buffer's stored text savedinputtxt = (char *) malloc (sizeof (char) * (32 + newbuflen )); if (savedinputtxt == NULL) { fatalerror5 ("Malloc in WINDOW failed. Aw, crud.", "Can't WINDOW this way", CRM_ENGINE_HERE); goto invoke_bailout; }; // // save our newinputbuf txt strncpy (savedinputtxt, newinputbuf, newbuflen); savedinputtxtlen = newbuflen; // // and push the contents of the variable into newinputbuf // (we know it's no bigger than data_window_len) strncpy (newinputbuf, &vht[srcidx]->valtxt[vht[srcidx]->vstart], vht[srcidx]->vlen ); newinputbuf[vht[srcidx]->vlen] = '\000'; newbuflen = vht[srcidx]->vlen; // // and there we have it - newintputbuf has all we will // get from this variable. // // Mark the fact that we're done with this variable by // setting inputsrc to FROM_VAR_DONE; inputsrc = FROM_VAR_DONE; saweof = 1; }; break; case FROM_VAR_DONE: { if (user_trace) fprintf (stderr, " got to FROM_VAR_DONE - this should" " NEVER happen. 
You've found a bug."); saweof = 1; } break; case FROM_STDIN: { int icount; icount = 0; // // the reason we _don't_ do this on te first interation // is that we may already have data in the temp // buffer, and we should use that data up first. if (!firsttime ) { // If we're reading from stdin, then we have three options: // read a character, read up to (and including) the newline, // or read till EOF. After each one, we set if (feof(stdin)) saweof = 1; if (inputretryEOF == EOFRETRY && (feof (stdin) || ferror (stdin) ) ) { if (user_trace) fprintf (stderr, " resetting the stdin stream\n"); clearerr (stdin); }; if (user_trace) fprintf (stderr, " getting window input from STDIN\n"); switch (inputsize) { case BYLINE: { fatalerror5 (" Sorry, but BYLINE input is not supported;", " we recommend using '\\n' in your match " "pattern", CRM_ENGINE_HERE); } break; case BYEOF: { // if BYEOF, we read as big a hunk as will fit. // If that's less than the full buffer, we declare // that we got an EOF as well. if (user_trace) fprintf (stderr, " bigchunk BYEOF read starting \n"); // // fread doesn't stop on pipe empty, while icount = fread (&(newinputbuf[newbuflen]), 1, data_window_size - (newbuflen + 256), stdin); if (feof (stdin)) saweof = 1; } break; case BYCHUNK: { // if BYCHUNK, we read all we can, and then we're // off and running. // Since we read everything available, we always // declare we saw EOF. Use EOFRETRY to run again. if (user_trace) fprintf (stderr, " bigchunk BYEOF read starting \n"); // // fread (stdin) doesn't return on pipe // empty, while read on STDIN_FILENO does. 
// So, for reading by chunks, we use read (STDIN icount = read ( fileno (stdin), &(newinputbuf[newbuflen]), data_window_size / 4 ); saweof = 1; } break; case BYCHAR: default: { // if BYCHAR, read one character and we're done // icount = read (0, &(newinputbuf[newbuflen]), 1); // if (user_trace) fprintf (stderr, " single character BYCHAR read \n"); icount = fread (&(newinputbuf[newbuflen]), 1, 1, stdin); }; break; }; } // // end of major part of BYCHAR / BYEOF specialized code. // if (icount > 0) { newbuflen = newbuflen + icount; newinputbuf[newbuflen] = '\000'; // put on the terminator }; // icount < 0 means an error occurred if (icount < 0) { nonfatalerror5 (" Something went wrong in WINDOW " "while trying to read", "I will keep trying. ", CRM_ENGINE_HERE); }; if (feof (stdin)) saweof = 1; }; }; // END OF SWITCH ON INPUTSRC // mark that this is not the first time through the loop // firsttime = 0; // now have an newinputbuf with something worth examining // in it, of length newbuflen (i.e. using chars [0...newbuflen-1]) // // So, we run the paste regex on it, and depending on the outcome, // set "done" or not. i = crm_regexec ( &preg, newinputbuf, newbuflen, 1, matches, 0, NULL); // // Now we deal with the result of the regex matching (or not // matching. i== 0 for success, i > 0 for failure. // if (i == 0) { // we found the regex; do the cut/paste // done = 1; if (user_trace) fprintf (stderr, " Found the paste pattern\n"); // (and the cut/paste is already set up correctly in // matches[0]; we don't have to do anything. } else { // Nope, the regex was not found. But if we had inputEOFaccept= // EOFACCEPTS, then we accept it anyway. if (saweof) { done = 1; failout = 1; if (user_trace) fprintf (stderr, " saw EOF, EOFAccept= %d\n", inputEOFaccept); switch (inputEOFaccept) { case EOFACCEPTS: { // In EOFENDS and EOFAIL, we take the available // input, shove it in, and go onward. We do this // by "faking" the matches[0] variable. 
matches[0].rm_so = 0; matches[0].rm_eo = newbuflen; if (matches[0].rm_eo < 0) matches[0].rm_eo = 0; failout = 0; } break; case EOFFAILS: default: { // Nope - got an EOF, and we aren't supposed to // accept it. So we MIGHT be done. Or maybe not... // if we have EOFRETRY set then we clear it and // try again. if (inputretryEOF == EOFRETRY) { clearerr (stdin); done = 0; failout = 0; } // But, if we are reading from a var, there will never // be any more, so we are -always- done. if (inputsrc == FROM_VAR || inputsrc == FROM_VAR_DONE) { done = 1 ; }; }; break; }; }; }; }; // end of the (!done) loop... // // It's just use the computed values from here on. crm_regfree (&preg); if (internal_trace) fprintf (stderr, " now newinput buf --%s--\n", newinputbuf); // Once we get to here, we have the new input in newinputbuf, and // matches[0].rm_eo is the length. So, we copy the new data onto // the end of the cdw window, and slide the new input up. // // start by making some space at the end of the input buffer crm_slice_and_splice_window (mdw, vht[vmidx]->vstart+vht[vmidx]->vlen, matches[0].rm_eo); // copy the pertinent part of newinputbuf into the space // we just made. memmove (&(vht[vmidx]->valtxt[vht[vmidx]->vstart + vht[vmidx]->vlen - matches[0].rm_eo]), newinputbuf, matches[0].rm_eo); // and get rid of the same characters out of newinputbuf if (newbuflen > 0 ) memmove (newinputbuf, &(newinputbuf[matches[0].rm_eo]), newbuflen - matches[0].rm_eo + 1); newbuflen = newbuflen - matches[0].rm_eo; newinputbuf[newbuflen] = '\000'; // Now, if we had EOFFAILS, and we hit the fail condition, // we have to set up the CSL so that it will continue execution // in the "right" place. 
if (failout == 1) { if (user_trace) fprintf (stderr, " CUT match failed so we're going to fail.\n"); csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = -1; }; // and, if we got a nonfatal error, we skip all the stuff above; // this is cleanup that we have to do eiher way. Failure here // is Really Bad. invoke_bailout: // // // Last little bit of cleanup is that IF we fetched from a // variable (not a file) we have to undo our fakery of stuffing // the var's contents into newinputbuf. // // This cleanup is two parts - stuffing the remains of inputsrcname // back into inputsrcname, and then restoring the old stdin buffer // contents from savedinputtxt and freeing the temporary // space, if (inputsrc == FROM_VAR || inputsrc == FROM_VAR_DONE) { // stuff the remaining characters back into the src var if (user_trace) fprintf (stderr, " restoring remains of input src variable.\n"); crm_destructive_alter_nvariable (inputsrcname, inputsrclen, newinputbuf, newbuflen); // and restore the old stdin buffer strncpy (newinputbuf, savedinputtxt, savedinputtxtlen + 1); newbuflen = savedinputtxtlen; // }; // and free the temporary space if (savedinputtxt) free (savedinputtxt); crm_window_no_changes_made: return (0); } crm114-20100106-BlameMichelson.src/inserttest_b.crm0000755000000000017500000000036511321154266020214 0ustar rootwsy#! /usr/bin/crm # # inserttest_b.crm - test insertion? # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. output / the first middle bit... / insert inserttest_c.crm output / the last middle bit... / crm114-20100106-BlameMichelson.src/isolate_reclaim_test.crm0000755000000000017500000000521011321154266021674 0ustar rootwsy#! /usr/bin/crm # # isolate_reclaim_test.crm - test reclamation of ISOLATEd variables # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # # Test the reclamation of ISOLATEd variables. 
window isolate (:l:) // alter (:_dw:) /foo bar baz/ match (:z:) /.*/ output /\n isol: :*:_iso:\n/ isolate (:l:) // # Note that :i: contains the trailing linefeed, etc. from the input, but # that :: does not. So, when we reclaim :i:, we get back one character. # isolate (:i:) /:*:z:/ output /i is >:*:i:<\n/ match [:i:] (:: :f: :c: :b:) /(foo)(.*)(baz)/ isolate (:endmarker:) /endmark/ eval (:l:) /:#:_iso:/ output /starting used: :*:l:\n/ match (:i:) /./ eval (:l:) /:#:_iso:/ output /release :i: used: :*:l:\n/ match (::) /./ eval (:l:) /:#:_iso:/ output /release :: used: :*:l:\n/ match (:i:) /./ eval (:l:) /:#:_iso:/ output /release :i: used: :*:l:\n/ match (:f:) /./ eval (:l:) /:#:_iso:/ output /release :f: used: :*:l:\n/ match (:c:) /./ eval (:l:) /:#:_iso:/ output /release :c: used: :*:l:\n/ match (:b:) /./ eval (:l:) /:#:_iso:/ output /release :b: used: :*:l:\n/ exit isolate (:i:) /:*:z:/ output /i is >:*:i:<\n/ match [:i:] (:: :f: :c: :b:) /(foo)(.*)(baz)/ eval (:l:) /:#:_iso:/ output /starting used: :*:l:\n/ match (::) /./ eval (:l:) /:#:_iso:/ output /release :: used: :*:l:\n/ match (:i:) /./ eval (:l:) /:#:_iso:/ output /release :i: used: :*:l:\n/ match (:f:) /./ eval (:l:) /:#:_iso:/ output /release :f: used: :*:l:\n/ match (:c:) /./ eval (:l:) /:#:_iso:/ output /release :c: used: :*:l:\n/ match (:b:) /./ eval (:l:) /:#:_iso:/ output /release :b: used: :*:l:\n/ isolate (:i:) /:*:z:/ output /i is >:*:i:<\n/ match [:i:] (:: :f: :c: :b:) /(foo)(.*)(baz)/ eval (:l:) /:#:_iso:/ output /starting used: :*:l:\n/ match (::) /./ eval (:l:) /:#:_iso:/ output /release :: used: :*:l:\n/ match (:i:) /./ eval (:l:) /:#:_iso:/ output /release :i: used: :*:l:\n/ match (:f:) /./ eval (:l:) /:#:_iso:/ output /release :f: used: :*:l:\n/ match (:c:) /./ eval (:l:) /:#:_iso:/ output /release :c: used: :*:l:\n/ match (:b:) /./ eval (:l:) /:#:_iso:/ output /release :b: used: :*:l:\n/ isolate (:i:) /:*:z:/ output /i is >:*:i:<\n/ match [:i:] (:: :f: :c: :b:) /(foo)(.*)(baz)/ eval 
(:l:) /:#:_iso:/ output /starting used: :*:l:\n/ match (::) /./ eval (:l:) /:#:_iso:/ output /release :: used: :*:l:\n/ match (:i:) /./ eval (:l:) /:#:_iso:/ output /release :i: used: :*:l:\n/ match (:f:) /./ eval (:l:) /:#:_iso:/ output /release :f: used: :*:l:\n/ match (:c:) /./ eval (:l:) /:#:_iso:/ output /release :c: used: :*:l:\n/ match (:b:) /./ eval (:l:) /:#:_iso:/ output /release :b: used: :*:l:\n/ crm114-20100106-BlameMichelson.src/crmregex_tre.c0000644000000000017500000001671411321154266017640 0ustar rootwsy// crmregex_tre.c - CRM114 Regex redirection bounce package for TRE regex // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // This file bounces CRM114 regex requests to whichever regex package // has been compiled and linked in to CRM114. // // Adding a new regex package is relatively easy- just mimic the // ifdef stanzas below to map the functions // // crm_regcomp // crm_regexec // crm_regerror // crm_regfree // crm_regversion // // into whatever calls your preferred regex package uses. 
// include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // Cache for regex compilations typedef struct { char *regex; regex_t reg; long regex_len; int cflags; int status; } REGEX_CACHE_BLOCK; #if CRM_REGEX_CACHESIZE > 0 REGEX_CACHE_BLOCK regex_cache[CRM_REGEX_CACHESIZE] = { { NULL, {0, NULL}, 0, 0, 0} } ; #endif // CRM_REGEX_CACHESIZE > 0 // debug helper: print a counted regex on stderr, quoted, with trimmings static void fpe_regex(char *before, char *regex, long regex_len, char *after) { long i; if (before != NULL) fprintf(stderr, "%s", before); fputc('"', stderr); for (i = 0; i < regex_len; i++) fputc(regex[i], stderr); fputc('"', stderr); if (after != NULL) fprintf(stderr, "%s", after); } #if CRM_REGEX_CACHESIZE > 0 // debug helper: print supplied description, cache bucket number, regex static void fpe_mishmash(char *str, unsigned int i, char *regex, int regex_len) { char tmp[128]; // make sure this is big enough sprintf(tmp, "%sregex_cache[%u]: ", str, i); fpe_regex(tmp, regex, regex_len, "\n"); } // debug helper: print a cache bucket with a supplied description static void fpe_bucket(char *str, unsigned int i) { fpe_mishmash(str, i, regex_cache[i].regex, regex_cache[i].regex_len); } #endif // CRM_REGEX_CACHESIZE > 0 // // How to do a register compilation // int crm_regcomp (regex_t *preg, char *regex, long regex_len, int cflags) { // compile it with the TRE regex compiler // // bug workaround - many regex compilers don't compile the null // regex correctly, but _do_ compile "()" correctly, which // matches the same thing). if (regex_len == 0) { return (regncomp (preg, "()", 2, cflags)); }; // Are we cacheing compiled regexes? Maybe not... 
#if CRM_REGEX_CACHESIZE == 0 if (internal_trace) fpe_regex("compiling regex ", regex, regex_len, "\n"); return ( regncomp (preg, regex, regex_len, cflags)); #else // !CRM_REGEX_CACHESIZE == 0 // We are cacheing. Scan our cache for the compiled version of this // regex. A NULL pointer to regex means "empty bucket". { unsigned int i; // subscript of bucket found or filled // ..unsigned cuz strnhash() val can have high bit set int found_it; // boolean REGEX_CACHE_BLOCK new; #ifdef REGEX_CACHE_LINEAR_SEARCH // // Linear Search uses a strict LRU algorithm to cache // the precompiled regexes, where used means compiled. // found_it = 0; for (i = 0; i < CRM_REGEX_CACHESIZE && regex_cache[i].regex != NULL; i++) { if (regex_len == regex_cache[i].regex_len && cflags == regex_cache[i].cflags && strncmp (regex_cache[i].regex, regex, regex_len) == 0) { // We Found It! i is where found_it = 1; break; // don't increment i }; }; if (i == CRM_REGEX_CACHESIZE) // ran off end, not found, cache full i = CRM_REGEX_CACHESIZE - 1; // bucket to throw away #endif // REGEX_CACHE_LINEAR_SEARCH #ifdef REGEX_CACHE_RANDOM_ACCESS // // Random Access uses an associative cache based on // the hash of the regex (mod the size of the cache). // found_it = 0; i = strnhash (regex, regex_len) % (unsigned)CRM_REGEX_CACHESIZE; if (regex_cache[i].regex != NULL && regex_len == regex_cache[i].regex_len && cflags == regex_cache[i].cflags && strncmp (regex_cache[i].regex, regex, regex_len) == 0) { // We Found It! i is where found_it = 1; }; #endif // REGEX_CACHE_RANDOM_ACCESS if (internal_trace) fpe_mishmash((found_it ? "found in " : "not found in "), i, regex, regex_len); if ( ! 
(found_it)) { // copy and compile new regex into new new.regex = (char *) malloc (regex_len); if (new.regex == NULL) fatalerror5("Can't allocate cache copy of new regex", "", CRM_ENGINE_HERE); memcpy (new.regex, regex, regex_len); new.regex_len = regex_len; new.cflags = cflags; new.status = regncomp (&new.reg, new.regex, new.regex_len, new.cflags); // i is the bucket to throw away, if any // i may or may not be where new stuff will go if (regex_cache[i].regex != NULL) { if (internal_trace) fpe_bucket("discarding ", i); regfree (®ex_cache[i].reg); free (regex_cache[i].regex); } } #ifdef REGEX_CACHE_LINEAR_SEARCH if ( !found_it) { // i is first free; shift array up one into bucket i while (i > 0) { regex_cache[i] = regex_cache[i - 1]; i--; } // i is now 0, which is where to put the new stuff }; #endif // REGEX_CACHE_LINEAR_SEARCH if ( !found_it) { // for both cache algorithms, i is now the bucket // to fill in with the new regex regex_cache[i] = new; if (internal_trace) fpe_bucket("new ", i); } // Just about done. Set up the return values. *preg = regex_cache[i].reg; return (regex_cache[i].status); }; #endif // !CRM_REGEX_CACHESIZE == 0 } // // // How to do a regex execution from the compiled register // int crm_regexec ( regex_t *preg, char *string, long string_len, size_t nmatch, regmatch_t pmatch[], int eflags, char *aux_string) { if (!string) { nonfatalerror5("crm_regexec - Regular Expression Execution Problem:\n", "NULL pointer to the string to match .", CRM_ENGINE_HERE); return (REG_NOMATCH); }; if (aux_string == NULL || strlen (aux_string) < 1) { return (regnexec (preg, string, string_len, nmatch, pmatch, eflags)); } else { int i; // parse out the aux string for approximation parameters regamatch_t mblock; regaparams_t pblock; mblock.nmatch = nmatch; mblock.pmatch = pmatch; sscanf (aux_string, "%d %d %d %d", &pblock.cost_subst, &pblock.cost_ins, &pblock.max_cost, &pblock.cost_del); if (user_trace) fprintf (stderr, "Using approximate match. 
Costs: Subst %d Ins %d Max %d Del %d \n", pblock.cost_subst, pblock.cost_ins, pblock.max_cost, pblock.cost_del); // now we can run the actual match i = reganexec (preg, string, string_len, &mblock, pblock, eflags); if (user_trace) fprintf (stderr, "approximate Regex match returned %d .\n", i); return (i); }; } size_t crm_regerror (int errorcode, regex_t *preg, char *errbuf, size_t errbuf_size) { return (regerror (errorcode, preg, errbuf, errbuf_size)); }; void crm_regfree (regex_t *preg) { #if CRM_REGEX_CACHESIZE > 0 // nothing! yes indeed, if we are using cacheing, we don't free // till and unless we decache, so crm_regfree is a noop. return; #else // !CRM_REGEX_CACHESIZE > 0 return (regfree (preg)); #endif // !CRM_REGEX_CACHESIZE > 0 }; char *crm_regversion () { return (tre_version()); }; crm114-20100106-BlameMichelson.src/rewriteutil.crm0000755000000000017500000000330511321154266020063 0ustar rootwsy#! /usr/bin/crm # # rewriteutil.crm - rewrite utility using rewrites.mfp # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # # do rewriting, based on rewrite rules in rewrites.mfp { isolate (:rewrites:) input (:rewrites:) [:*:_arg2:] # reset matching on rewrites to start of string } # Be sure we're at the start of the rewrites, and the start of the :_dw: match [:rewrites:] // { # Grab the next regexturn the one-per-line patterns into a regex # First, do the line-spanning regexes. match // match (:ch: :fr: :to:) [:rewrites:] /(.+)>-->(.*)/ output [/dev/tty] /:*:_nl: checking :*:ch: -- / # see if the "fr" regex matches anywhere { match (:place:) /:*:fr:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ output [/dev/tty] /./ output [/dev/tty] /:*:_dw:\n/ liaf } # Nope, didn't match... grab the next regex and try again, liaf } # reset back to the start of the rewrites, and the start of the :_dw:. 
# match [:rewrites:] // # and do it again for non-line-spanners { # Go through and do it again, except this time do it for # the non-line-spanning regexes. match // match (:ch: :fr: :to:) [:rewrites:] /(.+)>->(.*)/ output [/dev/tty] /:*:_nl:checking :*:ch: --/ # see if the "fr" regex matches anywhere { match (:place:) /:*:fr:/ # Yep, it matched... alter it and do it again # alter (:place:) /:*:to:/ output [/dev/tty] /./ liaf } # Nope, didn't match... grab the next regex and try again, liaf } output [/dev/tty] /:*:_nl:/ accept crm114-20100106-BlameMichelson.src/crm_expr_sks.c0000644000000000017500000014467711321154266017663 0ustar rootwsy// crm_expr_sks.c - String kernel Support Vector Machine /////////////////////////////////////////////////////////////////////////// // This code is originally copyright and owned by William // S. Yerazunis. In return for addition of significant derivative // work, Yimin Wu is hereby granted a full unlimited license to use // this code, includng license to relicense under other licenses. /////////////////////////////////////////////////////////////////////// // // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. // include some standard files #include "crm114_sysincludes.h" // crm_expr_sks.c specific system include file #include // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *tempbuf; ////////////////////////////////////////////////////////////////////////// // // Support Vector Machine (SVM) Classification // // This is an implementation of a support vector machine classification. 
// The current version only implement one type of SVM called C-Support // Vector Classification (C-SVC, Boser et al., 1992; Cortes and Vapnik, // 1995). // // The dual formulation of C-SVC is to find // // min 0.5 ( \alpha^T Q \alpha) - e^T \alpha // // subject to y^T \alpha = 0 // y_i = +1 or -1 // 0 <= alpha_i <= C, i=1,...,sizeof(corpus). // // Where "e" is the vector of all ones, // Q is the sizeof(corpus) by sizeof(corpus) matrix containing the // calculated distances between any two documents (that is, // Q_ij = y_i * y_j * kernel(x_i, x_j) which may be HUGE and so // we only calculate part of it at any one time. // x_i is the feature vector of document i. // // The decision function is // // sgn (sum(y_i * \alpha_i * kernel(x_i, x)) + b) // // In the optimization, we set the kernel parameters at the start and // then modify only the weighting parameters till it (hopefully) converges. //////////////////////////////////////////////////////////////////////////// // // SMO-type Decomposition Method // // Here we used SMO-type decomposition method ( Platt, 1998) to solve // the quadratic optimization problem --dual formulation of C-SVC, using // the method of Fan, Chen, and Lin ("Working Set Selection using Second // Order Information for Training Support Vector Machines", 2005) // to select the working set. // /////////////////////////////////////////////////////////////////////////////// // // String Kernel // // Here we implemented simple fixed-length string kernel. The original // idea is from Lodhi, Saunders, Shawe-Taylor, Cristianini and Watkins // ("Text Classification Using String Kernels", 2002). But we found // that simple string kernel, which uses fixed-length substrings as features // and maps documents to the substring space, can achieve pretty good // accuracy and very fast to calculate. // #define C_SVC 0 // Tau is a small minimum positive number (divide-by-zero noodge) #define TAU 1e-12 // Type of the string kernel. 
// Now only support simple fixed-length substring kernel #define SIMPLE_SK 0 //simple string kernel // We use the same old hyperspace structures for the intermediate storage. typedef struct mythical_hyperspace_cell { unsigned int hash; } HYPERSPACE_FEATUREBUCKET_STRUCT; // Parameter block to control the SVM solver. // typedef struct mythical_svm_param { int svm_type; int kernel_type; double cache_size; // in MB double eps; // convergence stop criterion double C; // parameter in C_SVC double max_run_time; // time control for microgroom (in second). // If computing time exceeds max_run_time, // then start microgrooming to delete the // documents far away from the hyperplane. int k; // fixed length of substrings // parameter for simple string kernel } SVM_PARAM; // And a struct to hold the actual data we're trying to solve. // typedef struct mythical_svm_problem { int l; // number of documents int *y; // label of documents -1/+1 HYPERSPACE_FEATUREBUCKET_STRUCT **x; // x[i] is the ith document's // feature vector } SVM_PROBLEM; // A structure to hold the cache node - these hold one row worth of // Q matrix. // typedef struct mythical_cache_node { struct mythical_cache_node *prev, *next; float *data; int len; } CACHE_NODE; // This is the cache representaton of the whole matrix; this is the // "first column" and points to the start of each row. 
typedef struct mythical_cache { int l; // The number of documents in the corpus long size; // The cache size (bytes) CACHE_NODE *head; CACHE_NODE lru_headnode; // least-recent-use node } CACHE; // This stores the result - alpha is the weighting vector (what we are // searching for) and // typedef struct mythical_solver{ double *alpha; double *G; // Gradient of objective function in each dimension double *deci_array; // decision values for all training data } SOLVER; // And a few file-wide static globals: // static SVM_PARAM param; static SVM_PROBLEM svm_prob; static CACHE svmcache; static float *DiagQ; // diagonal Qmatrix static SOLVER solver; static int hash_compare (void const *a, void const *b) { HYPERSPACE_FEATUREBUCKET_STRUCT *pa, *pb; pa = (HYPERSPACE_FEATUREBUCKET_STRUCT *) a; pb = (HYPERSPACE_FEATUREBUCKET_STRUCT *) b; if (pa->hash < pb->hash) return (-1); if (pa->hash > pb->hash) return (1); return (0); } /////////////////////////////////////////////////////////////////////////// // // Cache with least-recent-use strategy // This will be used to store the part of the Q matrix that we know // about. We recalculate parts as needed... this lets us solve the // problem without requiring enough memory to build the entire Q // matrix. // static void cache_init(int len, long size, CACHE *svmcache) { svmcache->l = len; svmcache->size = size; svmcache->head = (CACHE_NODE *)calloc(len, sizeof(CACHE_NODE)); size /= sizeof(float); size -= len * (sizeof(CACHE_NODE)/sizeof(float)); if(size < (2 * len)) size = 2 * len; // cache size must at least // as large as two columns of Qmatrix (svmcache->lru_headnode).prev = (svmcache->lru_headnode).next = &(svmcache->lru_headnode); } // Release the whole Q-matrix cache // static void cache_free(CACHE *svmcache) { CACHE_NODE *temp; for(temp = (svmcache->lru_headnode).next; temp != &(svmcache->lru_headnode); temp = temp->next) free(temp->data); free(svmcache->head); } // Delete one node (that is, one row of Q-matrix) from the LRU. 
// (we then usually move that row to the front of the LRU. static void lru_delete(CACHE_NODE *h){ h->prev->next = h->next; h->next->prev = h->prev; } // Insert to the last positoin in the cache node list // static void lru_insert(CACHE_NODE *h, CACHE *svmcache) { h->next = &(svmcache->lru_headnode); h->prev = (svmcache->lru_headnode).prev; h->prev->next = h; (svmcache->lru_headnode).prev = h; } // Get a line of Q matrix data for certain document, and return the // length of cached data. If it is smaller than the request length, // then we need to fill in the uncached data. static int get_data(CACHE *svmcache, const int doc_index, float **data, int length) { int result = length; CACHE_NODE *doc = svmcache->head + doc_index; if(doc->len) lru_delete(doc); //least-recent-use strategy //need to allocate more space if(length > (doc->len)) { // Cache hasn't enough free space, we need to release some old space while((svmcache->size) < (length - doc->len)) { CACHE_NODE *temp = (svmcache->lru_headnode).next; lru_delete(temp); free(temp->data); svmcache->size += temp->len; temp->data = 0; temp->len = 0; } //allocate new space doc->data = (float *)realloc(doc->data, length * sizeof(float)); svmcache->size -= (length - doc->len); result = doc->len; doc->len = length; } lru_insert(doc, svmcache); *data = doc->data; return result; } //Dot operation of two feature vectors // static double dot(void const *a, void const *b) { HYPERSPACE_FEATUREBUCKET_STRUCT *pa, *pb; int j = 0; int i = 0; double sum = 0; pa = (HYPERSPACE_FEATUREBUCKET_STRUCT *) a; pb = (HYPERSPACE_FEATUREBUCKET_STRUCT *) b; while(pa[i].hash != 0 && pb[j].hash != 0) { if(pa[i].hash == pb[j].hash && pa[i].hash != 0) { sum ++; i++; j++; } else { if(pa[i].hash > pb[j].hash) j++; else i++; } } return sum; } // Hide fixed-length substrings into the statistics file for later use. 
// static void simple_string_hide (char *s, HYPERSPACE_FEATUREBUCKET_STRUCT *hs, long *hashcounts) { long i; int len; len = strlen(s); *hashcounts = 0; for(i = 0; i <= (len - param.k); i++) { memmove(tempbuf, &(s[i]), param.k); tempbuf[param.k] = '\0'; if (internal_trace) { fprintf (stderr, " Learn #%ld is -%s-\n", i, tempbuf); }; hs[i].hash = strnhash (tempbuf, param.k); if (hs[i].hash == 0) hs[i].hash = 0xdeadbeef; (*hashcounts)++; } } // Ask the cache for the ith row in the Q matrix for C-Support Vector // Classification(C-SVC) // static float *get_rowQ(int i,int length) { float *rowQ; int uncached; if((uncached = get_data(&svmcache, i, &rowQ, length)) < length) { int temp; for(temp = uncached; temp < length; temp++) { if(param.svm_type == C_SVC) rowQ[temp] = (float)(svm_prob.y[i] * svm_prob.y[temp] * dot(svm_prob.x[i], svm_prob.x[temp])); } } return rowQ; } // Request of the diagonal in Qmatrix for C- Support Vector // Classification(C-SVC) static float *get_DiagQ() { float *DiagQ = (float *)malloc(svm_prob.l * sizeof(float)); int i; for(i = 0; i 0))) && select_times[t] < 10) { if ( -svm_prob.y[t] * solver.G[t] >= G_max) { i = t; G_max = -svm_prob.y[t] * solver.G[t]; } } } // select j as second member of working set; j = -1; obj_min = HUGE_VAL; for (t = 0; t< svm_prob.l; t++) { if((((svm_prob.y[t] == -1) && (solver.alpha[t] < param.C)) || ((svm_prob.y[t] == 1) && (solver.alpha[t] > 0))) && select_times[t] < 10) { b = G_max + svm_prob.y[t] * solver.G[t]; if(-svm_prob.y[t] * solver.G[t] <= G_min) G_min = -svm_prob.y[t] * solver.G[t]; if(b > 0) { if(i != -1) { Qi = get_rowQ(i,svm_prob.l); a = Qi[i] + DiagQ[t] - 2 * svm_prob.y[i] * svm_prob.y[t] * Qi[t]; if (a <= 0) a = TAU; if(-(b * b) / a <= obj_min) { j = t; obj_min = -(b * b) / a; } } } } } // Are we done? 
if(G_max - G_min < param.eps) { workset[0] = -1; workset[1] = -1; } else { workset[0] = i; workset[1] = j; } } static void solve(){ int t,workset[2],i,j; double a,b, oldi, oldj, sum; float *Qi, *Qj; // Array for storing how many times a particular document has been // selected in working set. int select_times[svm_prob.l]; for(i = 0; i < svm_prob.l; i++) { select_times[i] = 0; } solver.alpha = (double *)malloc(svm_prob.l * sizeof(double)); solver.G = (double *)malloc(svm_prob.l * sizeof(double)); if(param.svm_type == C_SVC) { //initialize alpha to all zero; //initialize G to all -1; for(t = 0; t < svm_prob.l; t++) { solver.alpha[t] = 0; solver.G[t] = -1; } } while(1) { selectB(workset, select_times); i = workset[0]; j = workset[1]; if(i != -1) select_times[i] ++; if(j != -1) select_times[j] ++; if(j == -1) break; Qi = get_rowQ(i, svm_prob.l); Qj = get_rowQ(j, svm_prob.l); // Calculate the incremental step forward. a = Qi[i] + DiagQ[j] - 2 * svm_prob.y[i] * svm_prob.y[j] * Qi[j]; if(a <= 0) a = TAU; b = -svm_prob.y[i] * solver.G[i] + svm_prob.y[j] * solver.G[j]; //update alpha oldi = solver.alpha[i]; oldj = solver.alpha[j]; solver.alpha[i] += svm_prob.y[i] * b/a; solver.alpha[j] -= svm_prob.y[j] * b/a; // Project alpha back to the feasible region(that is, where // where 0 <= alpha <= C ) sum = svm_prob.y[i] * oldi + svm_prob.y[j] * oldj; if (solver.alpha[i] > param.C) solver.alpha[i] = param.C; if (solver.alpha[i] < 0 ) solver.alpha[i] = 0; solver.alpha[j] = svm_prob.y[j] * (sum - svm_prob.y[i] * (solver.alpha[i])); if (solver.alpha[j] > param.C) solver.alpha[j] = param.C; if (solver.alpha[j] < 0 ) solver.alpha[j] = 0; solver.alpha[i] = svm_prob.y[i] * (sum - svm_prob.y[j] * (solver.alpha[j])); //update gradient array for(t = 0; t < svm_prob.l; t++) { solver.G[t] += Qi[t] * (solver.alpha[i] - oldi) + Qj[t] * (solver.alpha[j] - oldj); } } } // Calculate b (hyperplane offset in // SUM (y[i] alpha[i] kernel (x[i],x)) + b form // after calculating error margin alpha 
static double calc_b() { int count, i; double upper, lower, sum, b; count = 0; upper = HUGE_VAL; lower = -HUGE_VAL; sum = 0; for (i = 0; i < svm_prob.l; i++) { if(svm_prob.y[i] == 1) { if(solver.alpha[i] == param.C) { if(solver.G[i] > lower) { lower = solver.G[i]; } } else if(solver.alpha[i] == 0) { if(solver.G[i] < upper) { upper = solver.G[i]; } } else { count++; sum += solver.G[i]; } } else { if(solver.alpha[i] == 0) { if(-solver.G[i] > lower) { lower = -solver.G[i]; } } else if(solver.alpha[i] == param.C) { if(-solver.G[i] < upper) { upper = -solver.G[i]; } } else { count++; sum -= solver.G[i]; } } } if(count > 0) b = -sum/count; else b = -(upper + lower)/2; return b; } // Calculate the decision function static double calc_decision(HYPERSPACE_FEATUREBUCKET_STRUCT *x, double *alpha, double b) { int i; double sum; sum = 0; i=0; if (param.svm_type == C_SVC) { for (i = 0; i < svm_prob.l; i++) { if(alpha[i] != 0) sum += svm_prob.y[i] * alpha[i] * dot(x,svm_prob.x[i]); } sum += b; } return sum; } // Implementation of Lin's 2003 improved algorithm on Platt's // probabilistic outputs for binary SVM // Input parameters: deci_array = array of svm decision values // svm.prob // posn = number of positive examples // negn = number of negative examples // Outputs: parameters of sigmoid function-- A and B (AB[0] = A, AB[1] = B) static void calc_AB(double *AB, double *deci_array, int posn, int negn) { int maxiter, i, j; double minstep, sigma, fval, hiTarget, loTarget, *t; double fApB, h11, h22, h21, g1, g2, p, q, d1, d2, det, dA, dB, gd, stepsize, newA, newB, newf; maxiter = 100; minstep = 1e-10; sigma = 1e-3; fval = 0.0; hiTarget = (posn + 1.0) / (posn + 2.0); loTarget = 1 / (negn + 2.0); t = (double *)malloc(svm_prob.l * sizeof(double)); for(i = 0; i< svm_prob.l; i++) { if(svm_prob.y[i] > 0) t[i] = hiTarget; else t[i] = loTarget; } AB[0] = 0.0; AB[1] = log((negn + 1.0) / (posn + 1.0)); for (i = 0; i < svm_prob.l; i++) { fApB = deci_array[i] * AB[0] + AB[1]; if(fApB >= 0) 
fval += t[i] * fApB + log(1 + exp(-fApB)); else fval += (t[i] - 1) * fApB + log(1 + exp(fApB)); } for(j = 0; j < maxiter; j++) { h11 = h22 = sigma; h21 = g1 = g2 = 0.0; for(i = 0; i < svm_prob.l; i++) { fApB = deci_array[i] * AB[0] + AB[1]; if(fApB >= 0) { p = exp(-fApB) / (1.0 + exp(-fApB)); q = 1.0 / (1.0 + exp(-fApB)); } else { p = 1.0 / (1.0 + exp(fApB)); q = exp(fApB) / (1.0 + exp(fApB)); } d2 = p * q; h11 += deci_array[i] * deci_array[i] * d2; h22 += d2; h21 += deci_array[i] * d2; d1 = t[i] - p; g1 += deci_array[i] * d1; g2 += d1; } // Stopping Criterion if ((fabs(g1) < 1e-5) && (fabs(g2) < 1e-5)) { break; } //compute modified Newton directions det = h11 * h22 - h21 * h21; dA = -(h22 * g1 - h21 * g2) / det; dB = -(-h21 * g1 + h11 * g2) / det; gd = g1 * dA + g2 * dB; stepsize = 1; while (stepsize >= minstep) { newA = AB[0] + stepsize * dA; newB = AB[1] + stepsize * dB; newf = 0.0; for (i = 0; i < svm_prob.l; i++) { fApB = deci_array[i] * newA + newB; if (fApB >= 0) newf += t[i] * fApB + log(1 + exp(-fApB)); else newf += (t[i] - 1) * fApB + log(1 + exp(fApB)); } // Check whether sufficient decrease is satisfied if (newf < fval + 0.0001 * stepsize * gd) { AB[0] = newA; AB[1] = newB; fval = newf; break; } else stepsize /= 2.0; } if (stepsize < minstep) { if(user_trace) fprintf(stderr, "Line search fails in probability estimates\n"); break; } } if (j >= maxiter) if(user_trace) fprintf(stderr, "Reaching maximal iterations in probability estimates\n"); free(t); } static double sigmoid_predict(double decision_value, double A, double B) { double fApB = decision_value * A + B; if (fApB >= 0) { return exp(-fApB) / (1.0 + exp(-fApB)); } else return 1.0 / (1 + exp(fApB)) ; } int crm_expr_sks_learn(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) { long cflags, eflags; long sense; long microgroom; long unique; char ftext[MAX_PATTERN]; long flen; char file1[MAX_PATTERN]; char file2[MAX_PATTERN]; char file3[MAX_PATTERN]; char ptext[MAX_PATTERN]; 
//the regrex pattern long plen; long i, j, k; regex_t regcb; regmatch_t match[5]; long textoffset; long textmaxoffset; HYPERSPACE_FEATUREBUCKET_STRUCT *hashes; // the hashes we'll sort long hashcounts; FILE *stringf; struct stat statbuf1; // for statting the file1 struct stat statbuf2; // for statting the file2 time_t start_timer; time_t end_timer; double run_time; char *file_string ; i = 0; j = 0; k = 0; start_timer = time(NULL); // set our cflags, if needed. The defaults are // "case" and "affirm", (both zero valued). // and "microgroom" disabled. cflags = REG_EXTENDED; eflags = 0; sense = +1; if (apb->sflags & CRM_NOCASE) { cflags = cflags | REG_ICASE; eflags = 1; if (user_trace) fprintf (stderr, "turning oncase-insensitive match\n"); }; if (apb->sflags & CRM_REFUTE) { sense = -sense; if (user_trace) fprintf (stderr, " refuting learning\n"); }; microgroom = 0; if (apb->sflags & CRM_MICROGROOM) { microgroom = 1; if (user_trace) fprintf (stderr, " enabling microgrooming.\n"); }; unique = 0; if (apb->sflags & CRM_UNIQUE) { unique = 1; if (user_trace) fprintf (stderr, " enabling uniqueifying features.\n"); }; // extract the file names for storing svm solver. crm_get_pgm_arg (ftext, MAX_PATTERN, apb->p1start, apb->p1len); flen = apb->p1len; flen = crm_nexpandvar (ftext, flen, MAX_PATTERN); strcpy(ptext, "[[:space:]]*([[:graph:]]+)[[:space:]]+\\|[[:space:]]+([[:graph:]]+)[[:space:]]+\\|[[:space:]]+([[:graph:]]+)[[:space:]]*"); plen = strlen(ptext); plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); i = crm_regcomp (®cb, ptext, plen, cflags); if ( i > 0) { crm_regerror ( i, ®cb, tempbuf, data_window_size); nonfatalerror ("Regular Expression Compilation Problem:", tempbuf); goto regcomp_failed; }; k = crm_regexec (®cb, ftext, flen, 5, match, 0, NULL); if( k==0 ) { //get three input files. 
memmove(file1,&ftext[match[1].rm_so],(match[1].rm_eo-match[1].rm_so)); file1[match[1].rm_eo-match[1].rm_so]='\000'; memmove(file2,&ftext[match[2].rm_so],(match[2].rm_eo-match[2].rm_so)); file2[match[2].rm_eo-match[2].rm_so]='\000'; memmove(file3,&ftext[match[3].rm_so],(match[3].rm_eo-match[3].rm_so)); file3[match[3].rm_eo-match[3].rm_so]='\000'; if(internal_trace) fprintf(stderr, "file1=%s\tfile2=%s\tfile3=%s\n", file1, file2, file3); } else { //only has one input file if (ptext[0] != '\0') crm_regfree (®cb); i = 0; while(ftext[i] < 0x021) i++; j = i; while(ftext[j] >= 0x021) j++; ftext[j] = '\000'; strcpy(file1, &ftext[i]); file2[0] = '\000'; file3[0] = '\000'; } #ifdef GET_RID_OF_PUNCTUATION //get rid of all punctuation strcpy(ptext, "[^[:punct:]]+"); plen = strlen(ptext); i = crm_regcomp (®cb, ptext, plen, cflags); if ( i > 0) { crm_regerror ( i, ®cb, tempbuf, data_window_size); nonfatalerror ("Regular Expression Compilation Problem:", tempbuf); goto regcomp_failed; }; #endif hashes = calloc (HYPERSPACE_MAX_FEATURE_COUNT, sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT)); file_string = calloc((txtlen+10), sizeof(char)); file_string[0] = '\000'; textoffset = txtstart; textmaxoffset = txtstart + txtlen; i = 0; j = 0; k = 0; //if (|Text|>0) hide the text into the .svm file if(txtlen > 0) { #ifdef GET_RID_OF_PUNCTUATION while( k == 0 && textoffset <= textmaxoffset) { long wlen, slen; slen = textmaxoffset - textoffset; k = crm_regexec (®cb, &(txtptr[textoffset]), slen, 5, match, 0, NULL); if (!(k != 0 || textoffset > textmaxoffset)) { wlen = match[0].rm_eo - match[0].rm_so; memmove (tempbuf, &(txtptr[textoffset + match[0].rm_so]), wlen); tempbuf[wlen] = '\000'; if (strlen (file_string) + strlen(tempbuf) <= txtlen) strcat (file_string, tempbuf); if (match[0].rm_eo == 0) { nonfatalerror ( "The LEARN pattern matched zero length! ", "\n Forcing an increment to avoid an infinite loop."); match[0].rm_eo = 1; }; // and account for the text used up. 
textoffset = textoffset + match[0].rm_eo; i++; } } #else strncpy (file_string, &txtptr[txtstart], txtlen); #endif if(strlen(file_string) > 0) { simple_string_hide(file_string, hashes, &hashcounts); qsort (hashes, hashcounts, sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT), &hash_compare); if (unique) { while ( i < hashcounts ) { if (hashes[i].hash != hashes[i+1].hash){ hashes[j]= hashes[i]; j++; }; i++; }; hashcounts = j; }; //mark the end of a feature vector hashes[hashcounts].hash = 0; if(hashcounts > 0 && sense > 0) { crm_force_munmap_filename (file1); if (user_trace) fprintf (stderr, "Opening a sks file %s for append.\n", file1); if((stringf = fopen ( file1 , "ab+")) == NULL) { nonfatalerror("Sorry, couldn't open the .svm file", ""); return (0); } if (user_trace) fprintf (stderr, "Writing to a sks file %s\n", file1); //and write the string file out. //every file string is ended at '\000' dontcare = fwrite (hashes, 1, sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT) * (hashcounts + 1), stringf); fclose (stringf); } ///////////////////////////////////////////////////////////////////// // Start refuting........ // What we have to do here is find the set of hashes that matches // the input most closely - and then remove it. // // For this, we want the single closest set of hashes. That // implies highest radiance (see the hyperspace classifier for // details on radiance), so we use the same bit of code // we use down in classification. We also keep start and // end of the "best match" segment. 
//////////////////////////////////////////////////////////////////// if (hashcounts > 0 && sense < 0) { long beststart, bestend; long thisstart, thislen, thisend; double bestrad; long wrapup; double kandu, unotk, knotu, dist, radiance; long k, u; long file_hashlens; HYPERSPACE_FEATUREBUCKET_STRUCT *file_hashes; // Get the file mmapped so we can find the closest match // struct stat statbuf; // for statting the hash file // stat the file to get it's length k = stat (file1, &statbuf); // does the file really exist? if (k != 0) { nonfatalerror ("Refuting from nonexistent data cannot be done!" " More specifically, this data file doesn't exist: ", file1); return (0); } else { file_hashlens = statbuf.st_size; file_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *) crm_mmap_file (file1, 0, file_hashlens, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); file_hashlens = file_hashlens / sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT ); }; wrapup = 0; k = u = 0; beststart = bestend = 0; bestrad = 0.0; while (k < file_hashlens) { long cmp; // Except on the first iteration, we're looking one cell // past the 0x0 start marker. kandu = 0; knotu = unotk = 10 ; u = 0; thisstart = k; if (internal_trace) fprintf (stderr, "At featstart, looking at %u (next bucket value is %u)\n", file_hashes[thisstart].hash, file_hashes[thisstart+1].hash); while (wrapup == 0) { // it's an in-class feature. cmp = hash_compare (&hashes[u], &file_hashes[k]); if (cmp < 0) { // unknown less, step u forward //increment on u, //because maybe k will match next time unotk++; u++; } if (cmp == 0) // features matched. // These aren't the features you're looking for. // Move along, move along.... { u++; k++; kandu++; }; if (cmp > 0) // unknown is greater, step k forward { // increment on k, // because maybe u will match next time. knotu++; k++; }; // End of the U's? If so, skip k to the end marker // and finish. if ( u >= hashcounts ) { while ( k < file_hashlens && file_hashes[k].hash != 0) { k++; knotu++; }; }; // End of the K's? 
If so, skip U to the end marker if ( k >= file_hashlens - 1 || file_hashes[k].hash == 0 ) // end of doc features { unotk += hashcounts - u; }; // end of the U's or end of the K's? //If so, end document. if (u >= hashcounts || k >= file_hashlens - 1 || file_hashes[k].hash == 0) // this sets end-of-document { wrapup = 1; k++; }; }; // Now the per-document wrapup... wrapup = 0; // reset wrapup for next file // drop our markers for this particular document. We are now // looking at the next 0 (or end of file). thisend = k - 1; thislen = thisend - thisstart + 1; if (internal_trace) fprintf (stderr, "At featend, looking at %u (next bucket value is %u)\n", file_hashes[thisend].hash, file_hashes[thisend+1].hash); // end of a document- process accumulations // Proper pythagorean (Euclidean) distance - best in // SpamConf 2006 paper dist = sqrtf (unotk + knotu) ; // This formula was the best found in the MIT `SC 2006 paper. radiance = 1.0 / (( dist * dist) + .000001); radiance = radiance * kandu; radiance = radiance * kandu; if (user_trace) fprintf (stderr, "Feature Radiance %f at %ld to %ld\n", radiance, thisstart, thisend); if (radiance >= bestrad) { beststart = thisstart; bestend = thisend; bestrad = radiance; } }; // end of the per-document stuff - now chop out the part of the // file between beststart and bestend. if (user_trace) fprintf (stderr, "Deleting feature from %ld to %ld (rad %f) of file %s\n", beststart, bestend, bestrad, file1); // Deletion time - move the remaining stuff in the file // up to fill the hole, then msync the file, munmap it, and // then truncate it to the new, correct length. 
{ long newhashlen, newhashlenbytes; newhashlen = file_hashlens - (bestend + 1 - beststart); newhashlenbytes=newhashlen * sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT); memmove (&file_hashes[beststart], &file_hashes[bestend+1], sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT) * (file_hashlens - bestend) ); crm_force_munmap_filename (file1); if (internal_trace) fprintf (stderr, "Truncating file to %ld cells ( %ld bytes)\n", newhashlen, newhashlenbytes); k = truncate (file1, newhashlenbytes); } }; }; free(file_string); } free(hashes); if ( sense < 0 ) { // finish refuting.... return (0); } // extract parameters for String kernel SVM crm_get_pgm_arg(ptext, MAX_PATTERN, apb->s2start, apb->s2len); plen = apb->s2len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); if(plen) { sscanf(ptext, "%d %d %lf %lf %lf %lf %d", &(param.svm_type), &(param.kernel_type), &(param.cache_size), &(param.eps), &(param.C), &(param.max_run_time), &(param.k)); } else { //set default parameters for SVM param.svm_type = C_SVC; param.kernel_type = SIMPLE_SK; param.cache_size = 100;//MB param.eps = 1e-3; param.C = 1; param.max_run_time = 1; param.k = 4; } // If file2 is not empty, open file1 and file2, calculate hyperplane, // and write the solution to file3 if(file2[0] != '\000' && file3[0] != '\000') { long file1_lens; HYPERSPACE_FEATUREBUCKET_STRUCT *file1_hashes; long file2_lens; HYPERSPACE_FEATUREBUCKET_STRUCT *file2_hashes; int k1, k2; i = 0; k1 = stat (file1, &statbuf1); k2 = stat (file2, &statbuf2); if (k1 != 0) { nonfatalerror ("Sorry, there has no enough data to calculate the hyperplane" "", file1); return (0); } else if(k2 != 0) { nonfatalerror ("Sorry, there has no enough data to calculate the hyperplane" "", file2); return (0); } else { k1 = 0; k2 = 0; file1_lens = statbuf1.st_size; file1_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *) crm_mmap_file (file1, 0, file1_lens, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); file1_lens = file1_lens / sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT); for(i = 0;i< 
file1_lens;i++) { if(file1_hashes[i].hash == 0){ k1 ++; } } file2_lens = statbuf2.st_size; file2_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *) crm_mmap_file (file2, 0, file2_lens, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); file2_lens = file2_lens / sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT); for(i = 0;i< file2_lens;i++) { if(file2_hashes[i].hash == 0) { k2 ++; } } if(user_trace) { fprintf (stderr, "\nThe total number of documents in file1 is %d\n", k1); fprintf (stderr, "\nThe total number of documents in file2 is %d\n", k2); } if(!(k1 > 0 && k2 > 0)) { if (user_trace) fprintf(stderr, "There hasn't enough documents to calculate a string kernel svm hyperplane!\n"); } else { //initialize the svm_prob.x, svm_prob.y int *y = NULL; double b; double *deci_array = NULL; double AB[2]; HYPERSPACE_FEATUREBUCKET_STRUCT **x = NULL; svm_prob.l = k1 + k2; // int y[svm_prob.l]; y = calloc (svm_prob.l, sizeof (y[0])); x = calloc (svm_prob.l, sizeof (x[0])); for(i = 0; i < k1; i++) y[i] = 1; for(i = k1; i < svm_prob.l; i++) y[i] = -1; svm_prob.y = y; // HYPERSPACE_FEATUREBUCKET_STRUCT *x[svm_prob.l]; x[0] = &(file1_hashes[0]); k = 1; for(i = 1; i < file1_lens - 1; i++) { if(file1_hashes[i].hash == 0 ) x[k++] = &(file1_hashes[i+1]); } x[k++] = &(file2_hashes[0]); for(i = 1; i < file2_lens - 1; i++) { if((file2_hashes[i].hash == 0 ) && (file2_hashes[i+1].hash != 0)) x[k++] = &(file2_hashes[i+1]); } svm_prob.x = x; Q_init(); solve(); //result is in solver b = calc_b(); //compute decision values for all training documents deci_array = (double *) malloc (svm_prob.l * sizeof(double)); for(i = 0; i < svm_prob.l; i++) { deci_array[i] = calc_decision(svm_prob.x[i],solver.alpha, b); } calc_AB(AB,deci_array, k1,k2); end_timer = time(NULL); run_time = difftime(end_timer, start_timer); if(user_trace) fprintf(stderr, "run_time = %lf seconds\n", run_time); free(deci_array); // write solver to file3 if (user_trace) fprintf (stderr, "Opening a solution file %s for writing alpha and b.\n", file3); if( 
(stringf = fopen ( file3 , "w+")) == NULL) { nonfatalerror ("Couldn't write to .hypsvm file", " "); return (0); } if (user_trace) fprintf (stderr, "Writing to a svm solution file %s\n", file3); dontcare = fwrite(&k1, sizeof(int), 1, stringf); dontcare = fwrite(&k2, sizeof(int), 1, stringf); for(i = 0; i < svm_prob.l; i++) dontcare = fwrite(&(solver.alpha[i]), sizeof(double), 1, stringf); dontcare = fwrite(&b, sizeof(double), 1, stringf); dontcare = fwrite(&AB[0], sizeof(double), 1, stringf); dontcare = fwrite(&AB[1], sizeof(double), 1, stringf); fclose (stringf); //free cache cache_free(&svmcache); free(solver.G); free(DiagQ); free(solver.alpha); free (x); free (y); if(user_trace) fprintf(stderr, "Finish calculating SVM hyperplane, store the solution to %s!\n", file3); }//end if two sks files are not empty crm_force_munmap_filename (file1); crm_force_munmap_filename (file2); crm_force_munmap_filename (file3); }//end if two sks files are exist! }//end if user inputs three file_names regcomp_failed: return 0; } int crm_expr_sks_classify(CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) { long i,j, k; char ptext[MAX_PATTERN]; //the regrex pattern long plen; char ftext[MAX_PATTERN]; long flen; char file1[MAX_PATTERN]; char file2[MAX_PATTERN]; char file3[MAX_PATTERN]; regex_t regcb; regmatch_t match[5]; double *alpha; double b; double AB[2]; long slen; char svrbl[MAX_PATTERN]; // the match statistics text buffer long svlen; char stext [MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)]; // the match statistics variable long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100); HYPERSPACE_FEATUREBUCKET_STRUCT *hashes; // the hashes we'll sort long hashcounts; long cflags, eflags; long microgroom; long unique; struct stat statbuf1; // for statting the hash file1 struct stat statbuf2; // for statting the hash file2 struct stat statbuf3; // for statting the hash file3 long textoffset; long textmaxoffset; FILE *stringf; long 
stringlens[MAX_CLASSIFIERS]; char *stringname[MAX_CLASSIFIERS]; long doc_num[MAX_CLASSIFIERS]; double decision = 0; long totalfeatures = 0; // total features long bestseen; double ptc[MAX_CLASSIFIERS]; // current running probability of this class char *file_string; // extract the optional "match statistics" variable // crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len); svlen = apb->p2len; svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN); { long vstart, vlen; crm_nextword (svrbl, svlen, 0, &vstart, &vlen); memmove (svrbl, &svrbl[vstart], vlen); svlen = vlen; svrbl[vlen] = '\000'; }; // status variable's text (used for output stats) // stext[0] = '\000'; slen = 0; // set our cflags, if needed. The defaults are // "case" and "affirm", (both zero valued). // and "microgroom" disabled. cflags = REG_EXTENDED; eflags = 0; if (apb->sflags & CRM_NOCASE) { cflags = cflags | REG_ICASE; eflags = 1; if (user_trace) fprintf (stderr, "turning oncase-insensitive match\n"); }; microgroom = 0; if (apb->sflags & CRM_MICROGROOM) { microgroom = 1; if (user_trace) fprintf (stderr, " enabling microgrooming.\n"); }; unique = 0; if (apb->sflags & CRM_UNIQUE) { unique = 1; if (user_trace) fprintf (stderr, " enabling uniqueifying features.\n"); }; // extract parameters for svm, and fill in the // magic parameter block. Note that the block is // a catchall for all sorts of things. 
crm_get_pgm_arg(ptext, MAX_PATTERN, apb->s2start, apb->s2len); plen = apb->s2len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); if(plen) { sscanf(ptext, "%d %d %lf %lf %lf %lf %d", &(param.svm_type), &(param.kernel_type), &(param.cache_size), &(param.eps), &(param.C), &(param.max_run_time), &(param.k)); } else { //set default parameters for SVM param.svm_type = C_SVC; param.kernel_type = SIMPLE_SK; param.cache_size = 100;//MB param.eps = 1e-3; param.C = 1; param.max_run_time = 1; param.k = 4; } hashes = calloc (HYPERSPACE_MAX_FEATURE_COUNT, sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT)); hashcounts = 0; #ifdef GET_RID_OF_PUNCTUATION //get rid of all punctuation strcpy(ptext, "[^[:punct:]]+"); plen = strlen(ptext); i = crm_regcomp (®cb, ptext, plen, cflags); if ( i > 0) { crm_regerror ( i, ®cb, tempbuf, data_window_size); nonfatalerror ("Regular Expression Compilation Problem:", tempbuf); goto regcomp_failed; }; #endif file_string = calloc((txtlen + 10), sizeof(char)); file_string[0] = '\000'; textoffset = txtstart; textmaxoffset = txtstart + txtlen; i = 0; j = 0; k = 0; if(txtlen > 0) { #ifdef GET_RID_OF_PUNCTUATION while( k == 0 && textoffset <= textmaxoffset) { long wlen; long slen = textmaxoffset - textoffset; k = crm_regexec (®cb, &(txtptr[textoffset]), slen, 5, match, 0, NULL); if (!(k != 0 || textoffset > textmaxoffset)) { wlen = match[0].rm_eo - match[0].rm_so; memmove (tempbuf, &(txtptr[textoffset + match[0].rm_so]), wlen); tempbuf[wlen] = '\000'; if (strlen (file_string) + strlen(tempbuf) <= txtlen) strcat (file_string, tempbuf); if (match[0].rm_eo == 0) { nonfatalerror ( "The LEARN pattern matched zero length! ", "\n Forcing an increment to avoid an infinite loop."); match[0].rm_eo = 1; }; // and account for the text used up. 
textoffset = textoffset + match[0].rm_eo; i++; } } #else strncpy (file_string, &txtptr[txtstart], txtlen); #endif if(strlen(file_string) > 0) { simple_string_hide(file_string, hashes, &hashcounts); qsort (hashes, hashcounts, sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT), &hash_compare); if (unique) { i=0; j=0; while ( i < hashcounts ) { if (hashes[i].hash != hashes[i+1].hash) { hashes[j]= hashes[i]; j++; }; i++; }; hashcounts = j; }; //mark the end of a feature vector hashes[hashcounts].hash = 0; } } // extract the file names.( file1.svm | file2.svm | 1vs2_solver.svm ) crm_get_pgm_arg (ftext, MAX_PATTERN, apb->p1start, apb->p1len); flen = apb->p1len; flen = crm_nexpandvar (ftext, flen, MAX_PATTERN); strcpy(ptext, "[[:space:]]*([[:graph:]]+)[[:space:]]+\\|[[:space:]]+([[:graph:]]+)[[:space:]]+\\|[[:space:]]+([[:graph:]]+)[[:space:]]*"); plen = strlen(ptext); i = crm_regcomp (®cb, ptext, plen, cflags); if ( i > 0) { crm_regerror ( i, ®cb, tempbuf, data_window_size); nonfatalerror ("Regular Expression Compilation Problem:", tempbuf); goto regcomp_failed; }; k = crm_regexec (®cb, ftext, flen, 5, match, 0, NULL); if( k==0 ) { long file1_lens; long file2_lens; int k1, k2, k3; HYPERSPACE_FEATUREBUCKET_STRUCT *file1_hashes; HYPERSPACE_FEATUREBUCKET_STRUCT *file2_hashes; //get three input files. memmove(file1,&ftext[match[1].rm_so],(match[1].rm_eo-match[1].rm_so)); file1[match[1].rm_eo-match[1].rm_so]='\000'; memmove(file2,&ftext[match[2].rm_so],(match[2].rm_eo-match[2].rm_so)); file2[match[2].rm_eo-match[2].rm_so]='\000'; memmove(file3,&ftext[match[3].rm_so],(match[3].rm_eo-match[3].rm_so)); file3[match[3].rm_eo-match[3].rm_so]='\000'; if(user_trace) fprintf(stderr, "file1=%s\tfile2=%s\tfile3=%s\n", file1, file2, file3); //open all files, //first check whether file3 is the current version solution. 
k1 = stat (file1, &statbuf1); k2 = stat (file2, &statbuf2); k3 = stat (file3, &statbuf3); if (k1 != 0) { nonfatalerror ("Sorry, We can't classify with empty .svm file" " ", file1); return (0); } else if(k2 != 0) { nonfatalerror ("Sorry, We can't classify with empty .svm file" " ", file2); return (0); } else { int temp_k1 = 0, temp_k2 = 0; int *y = NULL; HYPERSPACE_FEATUREBUCKET_STRUCT **x = NULL; k1 = 0; k2 = 0; file1_lens = statbuf1.st_size; crm_force_munmap_filename (file1); crm_force_munmap_filename (file2); file1_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *) crm_mmap_file (file1, 0, file1_lens, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); file1_lens = file1_lens / sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT); stringlens[0] = file1_lens; stringname[0] = (char *) malloc (strlen(file1)+10); if (!stringname[0]) untrappableerror("Couldn't malloc stringname[0]\n", "We need that part later, so we're stuck. Sorry."); strcpy(stringname[0],file1); file2_lens = statbuf2.st_size; file2_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *) crm_mmap_file (file2, 0, file2_lens, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); file2_lens = file2_lens / sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT); stringlens[1] = file2_lens; stringname[1] = (char *) malloc (strlen(file2)+10); if (!stringname[1]) untrappableerror("Couldn't malloc stringname[1]\n", "We need that part later, so we're stuck. 
Sorry."); strcpy(stringname[1],file2); //find out how many documents in file1 and file2 separately for(i = 0;i< file1_lens;i++){ if(file1_hashes[i].hash == 0){ k1 ++; } } if(user_trace) fprintf (stderr, "\nThe total number of documents in file1 is %d\n", k1); for(i = 0;i< file2_lens;i++) { if(file2_hashes[i].hash == 0) { k2 ++; } } if(user_trace) fprintf (stderr, "\nThe total number of documents in file2 is %d\n", k2); stringf = fopen ( file3 , "r+"); if(k3 == 0) { dontcare = fread(&temp_k1, sizeof(int), 1, stringf); dontcare = fread(&temp_k2, sizeof(int), 1, stringf); if (user_trace) fprintf(stderr, "temp_k1=%d\ttemp_k2=%d\n",temp_k1,temp_k2); } doc_num[0] = k1; doc_num[1] = k2; //assign svm_prob.x, svm_prob.y svm_prob.l = k1 + k2; x = calloc (svm_prob.l, sizeof (x[0])); y = calloc (svm_prob.l, sizeof (y[0])); for(i = 0; i < k1; i++) y[i] = 1; for(i = k1; i < svm_prob.l; i++) y[i] = -1; svm_prob.y = y; x[0] = &(file1_hashes[0]); k = 1; for(i = 1;i< file1_lens - 1;i++) { if(file1_hashes[i].hash == 0) { x[k++] = &(file1_hashes[i+1]); } } x[k++] = &(file2_hashes[0]); for(i = 1;i< file2_lens - 1;i++) { if(file2_hashes[i].hash == 0) { x[k++] = &(file2_hashes[i+1]); } } svm_prob.x = x; alpha = (double *)malloc( svm_prob.l * sizeof(double)); if((k3 != 0) || (temp_k1 != k1) || (temp_k2 != k2)) { if(internal_trace) fprintf(stderr, "temp_k1=%d\ttemp_k2=%d\tSVM solution file is not up-to-date! 
we'll recalculate it!\n", temp_k1, temp_k2); //recalculate the svm solution if((k1 > 0) && (k2 >0)) { double *deci_array = NULL; Q_init(); solve(); //result is in solver b = calc_b(); if(user_trace) { fprintf(stderr, "b=%lf\n",b); } for(i = 0; i < svm_prob.l; i++) alpha[i] = solver.alpha[i]; //compute A,B for sigmoid prediction deci_array = (double*) malloc(svm_prob.l * sizeof(double)); for(i = 0; i < svm_prob.l; i++) { deci_array[i] = calc_decision(svm_prob.x[i], alpha, b); } calc_AB(AB, deci_array, k1, k2); //free cache cache_free(&svmcache); free(deci_array); free(solver.G); free(solver.alpha); free(DiagQ); if(user_trace) fprintf(stderr, "Recalculation of svm hyperplane is finished!\n"); } else { if(user_trace) fprintf(stderr, "There hasn't enough documents to recalculate a svm hyperplane!\n"); return (0); } } else { for(i = 0; i 0) { char buf [4096]; double pr; char fname[MAX_FILE_NAME_LEN]; buf [0] = '\000'; // put in standard CRM114 result standard header: ptc[0] = decision; ptc[1] = 1 - decision; if(decision >= 0.5) { pr = 10*(log10(decision + 1e-300) - log10 (1.0 - decision +1e-300 )); sprintf(buf, "CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n", decision, pr); bestseen = 0; } else { pr =10*(log10 (decision + 1e-300) - log10 (1.0 - decision +1e-300 )); sprintf(buf, "CLASSIFY fails; success probability: %6.4f pR: %6.4f\n", decision, pr); bestseen = 1; } if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); // Second line of the status report is the "best match" line: // if(bestseen) strcpy(fname, file2); else strcpy(fname, file1); sprintf (buf, "Best match to file #%ld (%s) " \ "prob: %6.4f pR: %6.4f \n", bestseen, fname, ptc[bestseen], 10 * (log10 (ptc[bestseen] + 1e-300) - log10 (1.0 - ptc[bestseen] +1e-300 ))); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); totalfeatures = strlen(file_string); sprintf (buf, "Total features in input file: %ld\n", totalfeatures); if (strlen (stext) + strlen(buf) <= 
stext_maxlen) strcat (stext, buf); for(k = 0; k < 2; k++) { sprintf (buf, "#%ld (%s):" \ "documents: %ld, features: %ld, prob: %3.2e, pR: %6.2f \n", k, stringname[k], doc_num[k], stringlens[k], ptc[k], 10 * (log10 (ptc[k] + 1e-300) - log10 (1.0 - ptc[k] + 1e-300) ) ); if (strlen(stext)+strlen(buf) <= stext_maxlen) strcat (stext, buf); } for(k = 0; k < 2; k++) { free(stringname[k]); } // finally, save the status output // crm_destructive_alter_nvariable (svrbl, svlen, stext, strlen (stext)); } // Return with the correct status, so an actual FAIL or not can occur. if (decision >= 0.5 ) { // all done... if we got here, we should just continue execution if (user_trace) fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n"); } else { // Classify was a fail. Do the skip. if (user_trace) fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n"); // and do what we do for a FAIL here csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1; return (0); } free(file_string); regcomp_failed: return (0); } crm114-20100106-BlameMichelson.src/mathalgtest.crm0000755000000000017500000000176011321154266020024 0ustar rootwsy#! /usr/bin/crm # # mathalgtest.crm - Test demo program for extended eval # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. # Test demo program for extended eval (can do string lengths, # algebraic math, and inequalities. 
window output / \n Algebraic math test \n/ { isolate (:z:) isolate (:x:) /12345/ isolate (:pi:) /3.14159/ output / string value of x is :*:x:, string value of pi is :*:pi:\n/ eval (:z:) / length of x is :#::*:x:: , length of pi is :#::*:pi::\n/ output /:*:z:/ { { eval (:z:) /:@: :#::*:x:: > :#::*:pi:::/ output / string rep of X is longer than pi\n/ } alius { output / string rep of X is shorter than pi\n/ } } eval (:z:) / matheval of x + pi is :@: :*:x: + :*:pi: :\n/ output /:*:z:/ eval (:z:) /:@: (2 * 3) + ( 4 * 5) :/ output / Algebraic matheval of (2*3)+(4*5) is :*:z: \n/ eval (:z:) / and adding 3.14159 to that is :@: :*:z: + 3.14159:\n/ output /:*:z:/ #output /With TDW: >:*:_iso:<\n/ } crm114-20100106-BlameMichelson.src/zz_translate_test.crm0000755000000000017500000000121611321154266021262 0ustar rootwsy#! /usr/bin/crm # # zz_translate_test.crm - test whitespace processing # Copyright 2009 William S. Yerazunis. # This file is under GPLv3, as described in COPYING. window output /version: :*:_crm_version:\n\n/ isolate (:noSpaces:) // alter (:noSpaces:) /Are The Russian Involved ?/ output /noSpaces before: :*:noSpaces:\n/ translate (:noSpaces:) [:noSpaces:] / / // output /noSpaces after : :*:noSpaces:\n\n/ isolate ( :withSpaces: ) // alter ( :withSpaces: ) /Nuclear Combat Toe To Toe With The Rooskies !/ output /withSpaces before: :*:withSpaces:\n/ translate ( :withSpaces: ) [ :withSpaces: ] / / // output /withSpaces after : :*:withSpaces:\n\n/ crm114-20100106-BlameMichelson.src/crm114_structs.h0000644000000000017500000003451211321154266017751 0ustar rootwsy// crm114_structs.h - structures for CRM114 // Copyright 2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. #ifndef __CRM114_STRUCTS_H__ #define __CRM114_STRUCTS_H__ // These are systemwide globals. Sure, they should go into a global // struct, but that realization only occured to me in 2008. Sorry. 
long vht_size; long cstk_limit; long max_pgmlines; long max_pgmsize; long max_pgmsize; long user_trace; long internal_trace; long debug_countdown; long cmdline_break; long cycle_counter; long ignore_environment_vars; long data_window_size; // Number of hash table buckets. Set from command line, read (only) // by classifier learns. long sparse_spectrum_file_length; long microgroom_chain_length ; long microgroom_stop_after; float min_pmax_pmin_ratio; long profile_execution; int dontcare; void *dontcareptr; long prettyprint_listing; // 0= none, 1 = basic, 2 = expanded, 3 = parsecode long engine_exit_base; // All internal errors will use this number or higher; // the user programs can use lower numbers freely. // how should math be handled? // = 0 no extended (non-EVAL) math, use algebraic notation // = 1 no extended (non-EVAL) math, use RPN // = 2 extended (everywhere) math, use algebraic notation // = 3 extended (everywhere) math, use RPN long q_expansion_mode; // structure of a vht cell // note - each file gets an entry, with the name of the file // being the name of the variable - no colons! // // also note that there's no "next" pointer in a vht cell; this is because // we do in-table overflowing (if a table entry is in use, we use the next // available table entry, wrapping around. It's easy to change in any case. // typedef struct mythical_vht_cell { char *filename; // file where defined (or NULL) int filedesc; // filedesc of defining file (or NULL) char *nametxt; // block of text that hosts the variable name long nstart; // index into nametxt to start of varname long nlen; // length of name char *valtxt; // text block that hosts the captured value // vstart, vlen, mstart, and mlen are all measured // from the _start_ of valtxt, mstart relative to // vstart, etc!!! long vstart; // zero-base index of start of variable (inclusive) long vlen; // length of captured value : this plus vstart is where // you could put a NULL if you wanted to. 
long mstart; // zero-base start of most recent match of this var long mlen; // length of most recent match against this var; this // plus mstart is where you could put a NULL if you // wanted to. long linenumber; // linenumber of this variable (if known, else -1) long lazy_redirects; // how many lazy redirects are allowed (0 by default); } VHT_CELL; // The argparse block is filled in at run time, though at least in // principle it could be done at microcompile time, but var-expansion // needs to be done at statement execution time.. so we don't fill it // in till we have to, then we cache the result. // typedef struct mythical_argparse_block { char *a1start; long a1len; char *p1start; long p1len; char *p2start; long p2len; char *p3start; long p3len; char *b1start; long b1len; char *s1start; long s1len; char *s2start; long s2len; unsigned long long sflags; } ARGPARSE_BLOCK; // structure of a microcompile table cell (one such per statement) // // These table entries get filled in during microcompile operation. // typedef struct mythical_mct_cell { char *hosttxt; // text file this statement lives in. ARGPARSE_BLOCK *apb; // the argparse block for this statement long start; // zero-base index of start of statement (inclusive) long fchar; // zero-base index of non-blank stmt (for prettyprint) long achar; // zero-base index of start of args; long stmt_utime; // user time spent in this statement line; long stmt_stime; // system time spent in this statement line; int stmt_type; // statement type of this line int nest_level; // nesting level of this statement int fail_index; // if this statement failed, where would we go? int liaf_index; // if this statement liafed, where would we go? int trap_index; // if this statement faults, where would we go? int stmt_break; // 1 if "break" on this stmt, 0 otherwise. } MCT_CELL; // structure of a control stack level cell. 
// Nota Bene: CSL cells are used to both retain toplevel data about // any particular file being executed as well as being used to retain // data on any file that is data! If a file is executable, then the // mct pointer is a pointer to the compiled MCT table, else the mct // pointer is a NULL and the file is not executable. // typedef struct mythical_csl_cell { char *filename; //filename if any long rdwr; // 0=readonly, 1=rdwr long filedes; // file descriptor it's open on (if any) char *filetext; // text buffer long nchars; // characters of data we have unsigned int hash; // hash of this data (if done) MCT_CELL **mct; // microcompile (if compiled) long nstmts; // how many statements in the microcompile long preload_window; // do we preload the window or not? long cstmt; // current executing statement of this file void *caller; // pointer to this file's caller (if any) long return_vht_cell; // index into the VHT to stick the return value long calldepth; // how many calls deep is this stack frame long aliusstk[MAX_BRACKETDEPTH]; // the status stack for ALIUS } CSL_CELL; // A 1024-byte standardized header for our statistical files (well, the // new standard. Old file types don't have this. Forward migration // shall take place. :-) typedef struct { uint32_t start; uint32_t length; uint32_t tag; } STATISTICS_FILE_CHUNK; typedef struct { uint8_t file_ident_string [ STATISTICS_FILE_IDENT_STRING_MAX ]; // Text description of this file. This should // always start with "CRM114 Classdata " and then // the classifier name etc. Embed versioning // information here (and get it back with strtod) // Please pad unused space with NULLs; don't // change the length (to make file-magic easier). // This is always chunks[0]. // STATISTICS_FILE_CHUNK chunks [ STATISTICS_FILE_NCHUNKS ]; // The byte indexed chunks of data in this file, // by start, length, and tag. // chunks[1] points to this array itself. 
// A -1 length means "to the // end of the file" // //////////////////////////// // Following in the file are more data chunks. Note that there's // plenty of space here for pre-solves (such as an SVM might generate) // but probably NOT enough space for individual examples to get their // own chunks, unless you change the default number of chunks upward // from 1024. //////////////////////////// } STATISTICS_FILE_HEADER_STRUCT; typedef struct { unsigned int hash; unsigned int key; unsigned int value; } FEATUREBUCKET_STRUCT; typedef struct { unsigned char version[4]; unsigned long flags; unsigned long skip_to; } FEATURE_HEADER_STRUCT; typedef struct { unsigned int hash; unsigned int key; float value; } WINNOW_FEATUREBUCKET_STRUCT; #define ENTROPY_RESERVED_HEADER_LEN 1024 typedef struct { long firlatstart; long firlatlen; long nodestart; long nodeslen; long long totalbits; } ENTROPY_HEADER_STRUCT; typedef struct mythical_entropy_alphabet_slot { long count; long nextcell; } ENTROPY_ALPHABET_SLOT; // 28 byte header, 24 bytes alph (52 tot). Pare: 16 header, 16 alph (36 tot) typedef struct mythical_entropy_cell { double fir_prior; long fir_larger; long fir_smaller; long firlat_slot; // long total_count; ENTROPY_ALPHABET_SLOT abet[ENTROPY_ALPHABET_SIZE]; } ENTROPY_FEATUREBUCKET_STRUCT; // TMS struct - used for measurng process time. 
typedef struct mythical_tms_struct
{
  clock_t tms_utime;  // user time
  clock_t tms_stime;  // system time
  clock_t tms_cutime; // user time of children
  clock_t tms_cstime; // system time of children
} TMS_STRUCT;

//  define statement types for microcompile
//
#define CRM_BOGUS 0
#define CRM_NOOP 1
#define CRM_EXIT 2
#define CRM_OPENBRACKET 3
#define CRM_CLOSEBRACKET 4
#define CRM_LABEL 5
#define CRM_GOTO 6
#define CRM_MATCH 7
#define CRM_FAIL 8
#define CRM_LIAF 9
#define CRM_ACCEPT 10
#define CRM_TRAP 11
#define CRM_FAULT 12
#define CRM_INPUT 13
#define CRM_OUTPUT 14
#define CRM_WINDOW 15
#define CRM_ALTER 16
#define CRM_CALL 17
#define CRM_ROUTINE 18
#define CRM_RETURN 19
#define CRM_SYSCALL 20
#define CRM_LEARN 21
#define CRM_CLASSIFY 22
#define CRM_ISOLATE 23
#define CRM_HASH 24
#define CRM_INTERSECT 25
#define CRM_UNION 26
#define CRM_EVAL 27
#define CRM_ALIUS 28
#define CRM_TRANSLATE 29
#define CRM_DEBUG 30
#define CRM_CLUMP 31          //  make clusters out of tokens
#define CRM_PMULC 32          //  pmulc translates tokens to cluster names
#define CRM_LAZY 33           //  makes a "lazy" variable.
#define CRM_UNIMPLEMENTED 34

//     FLAGS FLAGS FLAGS
//     all of the valid CRM114 flags are listed here
//
//     GROT GROT GROT - You must keep this in synchrony with the
//     definitions of the keywords in crm_stmt_parser!!!  Yes, I'd
//     love to define it in one place and one place only, but I haven't
//     figured out a way to do that well.
//
//  Each flag is a distinct bit in a 64-bit (unsigned long long) mask,
//  so any combination of flags can be OR'd together.

//     match searchstart flags
#define CRM_FROMSTART (1LLU << 0)
#define CRM_FROMNEXT (1LLU << 1)
#define CRM_FROMEND (1LLU << 2)
#define CRM_NEWEND (1LLU << 3)
#define CRM_FROMCURRENT (1LLU << 4)
//         match control flags
#define CRM_NOCASE (1LLU << 5)
#define CRM_ABSENT (1LLU << 6)
#define CRM_BASIC (1LLU << 7)
#define CRM_BACKWARDS (1LLU << 8)
#define CRM_LITERAL (1LLU << 9)
#define CRM_NOMULTILINE (1LLU << 10)
//         input/output/window flags
#define CRM_BYLINE CRM_NOMULTILINE
#define CRM_BYCHAR (1LLU << 11)
#define CRM_STRING CRM_BYCHAR    //  string is bychar.  I think...
#define CRM_BYCHUNK (1LLU << 12)
#define CRM_BYEOF (1LLU << 13)
#define CRM_EOFACCEPTS (1LLU << 14)
#define CRM_EOFRETRY (1LLU << 15)
#define CRM_APPEND (1LLU << 16)
//           process control flags
#define CRM_KEEP (1LLU << 17)
#define CRM_ASYNC (1LLU << 18)
//        learn and classify
#define CRM_REFUTE (1LLU << 19)
#define CRM_MICROGROOM (1LLU << 20)
#define CRM_MARKOVIAN (1LLU << 21)
#define CRM_OSB_BAYES (1LLU << 22)
//  synonym with OSB feature gen
#define CRM_OSB CRM_OSB_BAYES
#define CRM_CORRELATE (1LLU << 23)
#define CRM_OSB_WINNOW (1LLU << 24)
//  synonym to Winnow feature combiner
#define CRM_WINNOW CRM_OSB_WINNOW
#define CRM_CHI2 (1LLU << 25)
#define CRM_UNIQUE (1LLU << 26)
#define CRM_ENTROPY (1LLU << 27)
#define CRM_OSBF (1LLU << 28)
//  synonym with OSBF local rule
#define CRM_OSBF_BAYES CRM_OSBF
#define CRM_HYPERSPACE (1LLU << 29)
#define CRM_UNIGRAM (1LLU << 30)
#define CRM_CROSSLINK (1LLU << 31)
//
//        Flags that need to be sorted back in
//           input
#define CRM_READLINE (1LLU << 32)
//           isolate flags
#define CRM_DEFAULT (1LLU << 33)
//           SKS classifier
#define CRM_SKS (1LLU << 34)
//           SVM classifier
#define CRM_SVM (1LLU << 35)
//           FSCM classifier
#define CRM_FSCM (1LLU << 36)
//           Neural Net classifier
#define CRM_NEURAL_NET (1LLU << 37)
//
#define CRM_ERASE (1LLU << 38)
//           PCA classifier
#define CRM_PCA (1LLU << 39)

//   and a struct to put them in: maps a flag's keyword text to its
//   64-bit mask value (used by the statement parser's flag tables).
typedef struct
{
  char * string;
  unsigned long long value;
} FLAG_DEF ;

//*****************************************************************
//
//     The following table describes the statements allowed in CRM114.
//
//      Each entry is one line of STMT_TABLE_TYPE, and gives the text
//      representation of the command, the internal dispatch code,
//      whether the statement is "executable" or not, what the minimum
//      and maximum number of slash-groups, paren-groups, and box-groups
//      are for the statement to make sense, and what flags are allowed
//      for that statement.
// typedef struct { char *stmt_name; int stmt_code; int namelen; int is_executable; int minslashes; int maxslashes; int minparens; int maxparens; int minboxes; int maxboxes; long flags_allowed_mask; } STMT_TABLE_TYPE; // The compiler file actually contains this "for real", the // extern here is merely a reference to it. // #ifndef BASE_COMPILER_TABLE_HERE extern STMT_TABLE_TYPE stmt_table[]; #endif // these defines are for arg type... note that they must remain synched // IN THIS ORDER with the start chars and end chars in crm_statement_parse // #define CRM_ANGLES 0 #define CRM_PARENS 1 #define CRM_BOXES 2 #define CRM_SLASHES 3 // The possible exit codes #define CRM_EXIT_OK 0 #define CRM_EXIT_ERROR 1 #define CRM_EXIT_FATAL 2 #define CRM_EXIT_APOCALYPSE 666 // The ORable exec codes for crm_zexpandvar; OR together the ones // you want to enable for zexpandvar. Nexpandvar is ansi|stringvar|redirect, // and qexpandvar is "all of them". :) #define CRM_EVAL_ANSI 0x01 #define CRM_EVAL_STRINGVAR 0x02 #define CRM_EVAL_REDIRECT 0x04 #define CRM_EVAL_STRINGLEN 0x08 #define CRM_EVAL_MATH 0x10 // The possible cache actions #define CRM_MMAP_CACHE_UNUSED 0 // active makes it really mapped (or reactivates a released mmap) #define CRM_MMAP_CACHE_ACTIVE 1 // release marks the slot reusable, but doesn't unmap (yet) #define CRM_MMAP_CACHE_RELEASE 2 // drop really unmaps #define CRM_MMAP_CACHE_DROP 3 #endif // !__CRM114_STRUCTS_H__ crm114-20100106-BlameMichelson.src/crm_debugger.c0000644000000000017500000002133111321154266017566 0ustar rootwsy// crm_debugger.c - debugging utilities // Copyright 2001-2009 William S. Yerazunis. // This file is under GPLv3, as described in COPYING. 
// include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *inbuf; extern char *outbuf; // If we got to here, we need to run some user-interfacing // (i.e. debugging). // // possible return values: // 1: reparse and continue execution // -1: exit immediately // 0: continue long crm_debugger () { long ichar; static int firsttime = 1; static FILE *mytty; if (firsttime) { fprintf (stderr, "CRM114 Debugger - type \"h\" for help. "); fprintf (stderr, "User trace turned on.\n"); user_trace = 1; firsttime = 0; if (user_trace) fprintf (stderr, "Opening the user terminal for debugging I/O\n"); #ifndef CRM_WINDOWS mytty = fopen ("/dev/tty", "rb"); #else // CRM_WINDOWS mytty = fopen ("CON", "rb"); #endif // CRM_WINDOWS clearerr (mytty); }; if (csl->mct[csl->cstmt]->stmt_break > 0) fprintf (stderr, "Breakpoint tripped at statement %ld\n", csl->cstmt); debug_top: // let the user know they're in the debugger // fprintf (stderr, "\ncrm-dbg> "); // find out what they want to do // ichar = 0; while (!feof (mytty) && ichar < data_window_size - 1 && (inbuf [ichar-1] != '\n' ) ) { inbuf[ichar] = fgetc (mytty); ichar++; }; inbuf [ichar] = '\000'; if (feof (mytty) ) { fprintf (stderr, "Quitting\n"); if (engine_exit_base != 0) { exit (engine_exit_base + 8); } else exit ( EXIT_SUCCESS ); }; // now a big siwtch statement on the first character of the command // switch (inbuf[0]) { case 'q': case 'Q': { if (user_trace) fprintf (stderr, "Quitting.\n"); if (engine_exit_base != 0) { exit (engine_exit_base + 9); } else exit ( EXIT_SUCCESS ); }; break; case 'n': case 'N': { debug_countdown = 0; return (0); } break; case 'c': case 'C': 
{ sscanf (&inbuf[1], "%ld", &debug_countdown); if (debug_countdown <= 0) { debug_countdown = -1; fprintf (stderr, "continuing execution...\n"); } else { fprintf (stderr, "continuing %ld cycles...\n", debug_countdown); }; return (0); }; break; case 't': if (user_trace == 0 ) { user_trace = 1 ; fprintf (stderr, "User tracing on"); } else { user_trace = 0; fprintf (stderr, "User tracing off"); }; break; case 'T': if (internal_trace == 0 ) { internal_trace = 1 ; fprintf (stderr, "Internal tracing on"); } else { internal_trace = 0; fprintf (stderr, "Internal tracing off"); }; break; case 'e': { fprintf (stderr, "expanding: \n"); memmove (inbuf, &inbuf[1], strlen (inbuf) -1); crm_nexpandvar (inbuf, strlen(inbuf) -1, data_window_size); fprintf (stderr, "%s", inbuf); }; break; case 'i': { fprintf (stderr, "Isolating %s", &inbuf[1]); fprintf (stderr, "NOT YET IMPLEMENTED! Sorry. \n"); } break; case 'v': { long i, j; long stmtnum; i = sscanf (&inbuf[1], "%ld", &stmtnum); if (i > 0) { fprintf (stderr, "statement %ld: ", stmtnum); if ( stmtnum < 0 || stmtnum > csl->nstmts) { fprintf (stderr, "... does not exist!\n"); } else { for ( j = csl->mct[stmtnum]->start; j < csl->mct[stmtnum+1]->start; j++) { fprintf (stderr, "%c", csl->filetext[j]); }; }; } else { fprintf (stderr, "What statement do you want to view?\n"); } }; break; case 'j': { long nextstmt; long i; long vindex; i = sscanf (&inbuf[1], "%ld", &nextstmt); if (i == 0) { // maybe the user put in a label? long tstart; long tlen; crm_nextword (&inbuf[1], strlen (&inbuf[1]), 0, &tstart, &tlen); memmove (inbuf, &inbuf[1+tstart], tlen); inbuf[tlen] = '\000'; vindex= crm_vht_lookup (vht, inbuf, tlen); if (vht[vindex] == NULL) { fprintf (stderr, "No label '%s' in this program. 
", inbuf); fprintf (stderr, "Staying at line %ld\n", csl->cstmt); nextstmt = csl->cstmt; } else { nextstmt = vht[vindex]->linenumber; }; }; if (nextstmt <= 0) { nextstmt = 0; } if (nextstmt >= csl->nstmts) { nextstmt = csl-> nstmts; fprintf (stderr, "last statement is %ld, assume you meant that.\n", csl->nstmts); }; if (csl->cstmt != nextstmt) fprintf (stderr, "Next statement is statement %ld\n", nextstmt); csl->cstmt = nextstmt; } return (1); break; case 'b': { // is there a breakpoint to make? long breakstmt; long i; long vindex; breakstmt = -1; i = sscanf (&inbuf[1], "%ld", &breakstmt); if (i == 0) { // maybe the user put in a label? long tstart; long tlen; crm_nextword (&inbuf[1], strlen (&inbuf[1]), 0, &tstart, &tlen); memmove (inbuf, &inbuf[1+tstart], tlen); inbuf[tlen] = '\000'; vindex= crm_vht_lookup (vht, inbuf, tlen); fprintf (stderr, "vindex = %ld\n", vindex); if (vht[vindex] == NULL) { fprintf (stderr, "No label '%s' in this program. ", inbuf); fprintf (stderr, "No breakpoint change made. \n"); } else { breakstmt = vht[vindex]->linenumber; }; }; if (breakstmt <= -1) { breakstmt = 0; } if (breakstmt >= csl->nstmts) { breakstmt = csl-> nstmts; fprintf (stderr, "last statement is %ld, assume you meant that.\n", csl->nstmts); }; csl->mct[breakstmt]->stmt_break = 1 - csl->mct[breakstmt]->stmt_break; if (csl->mct[breakstmt]->stmt_break == 1) { fprintf (stderr, "Setting breakpoint at statement %ld\n", breakstmt); } else { fprintf (stderr, "Clearing breakpoint at statement %ld\n", breakstmt); }; } return (1); break; case 'a': { // do a debugger-level alteration // maybe the user put in a label? long vstart, vlen; long vindex; long ostart, oend, olen; crm_nextword (&inbuf[1], strlen (&inbuf[1]), 0, &vstart, &vlen); memmove (inbuf, &inbuf[1+vstart], vlen); inbuf[vlen] = '\000'; vindex= crm_vht_lookup (vht, inbuf, vlen); if (vht[vindex] == NULL) { fprintf (stderr, "No variable '%s' in this program. 
", inbuf); } // now grab what's left of the input as the value to set // ostart = vlen + 1; while (inbuf[ostart] != '/' && inbuf[ostart] != '\000') ostart++; ostart++; oend = ostart + 1; while (inbuf[oend] != '/' && inbuf[oend] != '\000') oend++; memmove (outbuf, &inbuf[ostart], oend - ostart); outbuf [oend - ostart] = '\000'; olen = crm_nexpandvar (outbuf, oend - ostart, data_window_size); crm_destructive_alter_nvariable (inbuf, vlen, outbuf, olen); }; break; case 'f': { csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; fprintf (stderr, "Forward to }, next statement : %ld\n", csl->cstmt); csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1; }; return (1); break; case 'l': { csl->cstmt = csl->mct[csl->cstmt]->liaf_index; fprintf (stderr, "Backward to {, next statement : %ld\n", csl->cstmt); }; return (1); break; case 'h': { fprintf (stderr, "a :var: /value/ - alter :var: to /value/ \n"); fprintf (stderr, "b - toggle breakpoint on line \n"); fprintf (stderr, "b