pax_global_header00006660000000000000000000000064135622304430014514gustar00rootroot0000000000000052 comment=a764f83d4698317198e8c7de62959930178f2a16 spamoracle-release16/000077500000000000000000000000001356223044300147735ustar00rootroot00000000000000spamoracle-release16/.depend000066400000000000000000000025531356223044300162400ustar00rootroot00000000000000attachments.cmo: mail.cmi attachments.cmi attachments.cmx: mail.cmx attachments.cmi configfile.cmo: configfile.cmi configfile.cmx: configfile.cmi config.cmo: configfile.cmi config.cmi config.cmx: configfile.cmx config.cmi database.cmo: database.cmi database.cmx: database.cmi htmlscan.cmo: config.cmi htmlscan.cmi htmlscan.cmx: config.cmx htmlscan.cmi mail.cmo: config.cmi htmlscan.cmi mail.cmi mail.cmx: config.cmx htmlscan.cmx mail.cmi main.cmo: config.cmi configfile.cmi database.cmi mbox.cmi processing.cmi main.cmx: config.cmx configfile.cmx database.cmx mbox.cmx processing.cmx mbox.cmo: mbox.cmi mbox.cmx: mbox.cmi processing.cmo: attachments.cmi config.cmi database.cmi mail.cmi rankmsg.cmi \ refhosts.cmi wordsplit.cmi processing.cmi processing.cmx: attachments.cmx config.cmx database.cmx mail.cmx rankmsg.cmx \ refhosts.cmx wordsplit.cmx processing.cmi rankmsg.cmo: config.cmi database.cmi mail.cmi refhosts.cmi wordsplit.cmi \ rankmsg.cmi rankmsg.cmx: config.cmx database.cmx mail.cmx refhosts.cmx wordsplit.cmx \ rankmsg.cmi refhosts.cmo: refhosts.cmi refhosts.cmx: refhosts.cmi virus.cmo: mail.cmi mbox.cmi zip.cmo virus.cmx: mail.cmx mbox.cmx zip.cmx wordsplit.cmo: wordsplit.cmi wordsplit.cmx: wordsplit.cmi attachments.cmi: mail.cmi config.cmi: configfile.cmi processing.cmi: database.cmi rankmsg.cmi: database.cmi mail.cmi spamoracle-release16/Changes000066400000000000000000000023731356223044300162730ustar00rootroot00000000000000Release 1.6: - Adapt to safe strings, using mutable byte arrays where needed. Release 1.5: - Be resilient to changes in OCaml's hash table implementation. - Added command "spamoracle upgrade" to convert the database to the current hash table format. Release 1.4: - More lenient rule for ignoring HTML comments. - Recognition of S P A C E D O-U-T words. - Fixed a bug causing certain text-only e-mails to be ignored. - Improved the "spamoracle test" command. - Added "spamoracle words" command for troubleshooting the word-splitting algorithm. Release 1.3: - Better scanning of words in HTML documents. - Better handling of multipart documents; now scan both text and HTML parts of a multipart/alternative. - Support for Portuguese and Japanese (JIS encoding) - Added configuration file (~/.spamoracle.conf) to allow twiddling with filtering parameters. - Fixed database corruption problem with concurrent adds. - Handles gzipped databases. - Added manual pages. Release 1.2: - When choosing part of a multipart/alternative, ignore pure text part if much smaller than HTML part. - Added commands "backup" and "restore" to facilitate future evolution towards other database formats. Release 1.1: - Fixed "usage" message. Release 1.0: - First public release. spamoracle-release16/Makefile000066400000000000000000000031711356223044300164350ustar00rootroot00000000000000### Configuration section # The laguages you're interested in, besides English LANGUAGES=-DFRENCH #-DSPANISH -DITALIAN -DGERMAN -DPORTUGUESE -DJAPANESE # How to invoke the C preprocessor CPP=gcc -E -P $(LANGUAGES) - # Where to install the binary BINDIR=/usr/local/bin # Where to install the man pages MANDIR=/usr/local/man ### End of configuration section OCAMLC=ocamlc -g OCAMLLEX=ocamllex OCAMLDEP=ocamldep OCAMLOPT=ocamlopt BYTEOBJS=configfile.cmo config.cmo \ htmlscan.cmo mail.cmo database.cmo mbox.cmo wordsplit.cmo \ refhosts.cmo rankmsg.cmo attachments.cmo processing.cmo main.cmo BYTELIBS=unix.cma str.cma NATOBJS=$(BYTEOBJS:.cmo=.cmx) NATLIBS=$(BYTELIBS:.cma=.cmxa) all: spamoracle install: cp spamoracle $(BINDIR)/spamoracle cp spamoracle.1 $(MANDIR)/man1/spamoracle.1 cp spamoracle.conf.5 $(MANDIR)/man5/spamoracle.conf.5 spamoracle: $(NATOBJS) $(OCAMLOPT) -o spamoracle $(NATLIBS) $(NATOBJS) clean:: rm -f spamoracle spamoracle.byte: $(BYTEOBJS) $(OCAMLC) -o spamoracle.byte $(BYTELIBS) $(BYTEOBJS) clean:: rm -f spamoracle.byte wordsplit.mll: wordsplit.mlp $(CPP) < wordsplit.mlp > wordsplit.mll \ || { rm -f wordsplit.mll; exit 2; } clean:: rm -f wordsplit.mll wordsplit.ml: wordsplit.mll $(OCAMLLEX) wordsplit.mll clean:: rm -f wordsplit.ml beforedepend:: wordsplit.ml htmlscan.ml: htmlscan.mll $(OCAMLLEX) htmlscan.mll clean:: rm -f htmlscan.ml beforedepend:: htmlscan.ml clean:: rm -f *.cm[iox] *.o .SUFFIXES: .ml .mli .cmo .cmi .cmx .mli.cmi: $(OCAMLC) -c $< .ml.cmo: $(OCAMLC) -c $< .ml.cmx: $(OCAMLOPT) -c $< depend: beforedepend $(OCAMLDEP) *.ml *.mli > .depend include .depend spamoracle-release16/README000066400000000000000000000315741356223044300156650ustar00rootroot00000000000000 SpamOracle -- a spam classification tool Version 1.6 OVERVIEW: SpamOracle is a tool to help detect and filter away "spam" (unsolicited commercial e-mail). It proceeds by statistical analysis of the words that appear in the e-mail, comparing the frequencies of words with those found in a user-provided corpus of known spam and known legitimate e-mail. The classification algorithm is based on Bayes' formula, and is described in Paul Graham's paper, "A plan for spam", http://www.paulgraham.com/spam.html. This program is designed to work in conjunction with procmail. The result of the analysis is output as an additional message header "X-Spam:", followed by "yes", "no" or "unknown", plus additional details. A procmail rule can then test this "X-Spam:" header and deliver the e-mail to the appropriate mailbox. In addition, SpamOracle also also analyses MIME attachments, extracting relevant information such as MIME type, character encoding and attached file name, and summarizing them in an additional "X-Attachments:" header. This allows procmail to easily reject e-mails containing suspicious attachments, e.g. Windows executables which often indicate a virus. LICENSE: This program is distributed under the terms of the GPL (GNU Public License) version 2, available from http://www.gnu.org/licenses/gpl.txt. REQUIREMENTS AND LIMITATIONS: - To compile: Objective Caml, https://ocaml.org/, version 4.02 or later. - To use: . Your mail must be delivered to a Unix machine. You must have a shell account on this machine. This machine must have procmail (http://www.procmail.org/) installed. Your ~/.forward file must be set up to run all incoming e-mail through procmail. . To provide the corpus of messages from which SpamOracle "learns", an archive of at least 1000 of your e-mails is needed. The archive must be manually or semi-automatically split into known spams and known good messages. Mis-classified messages in the corpus (e.g. spams mistakenly stored among the good messages) will decrease the efficiency of the classification. The archive must be in Unix mailbox format, or in "one message per file" format (a la MH). Other formats, such as Emacs' Babyl, are not supported. . The notion of "word" used by SpamOracle is slanted towards Western European languages, i.e. the ISO Latin-1 and Latin-9 character sets. Preliminary support for JIS-encoded Japanese is provided. Still, SpamOracle will not work well if you receive many legitimate e-mails written in other character sets, such as Chinese or Korean. INSTALLATION: Edit the Makefile and change the definitions of the following variables at the top of the file: LANGUAGES the languages you're interested in besides English CPP how to invoke the C preprocessor BINDIR where to install the executable Do "make" in this directory. Become superuser if necessary and do "make install". INITIALIZATION: To build the database of word frequencies from the corpus, do: rm ~/.spamoracle.db spamoracle add -v -good goodmails -spam spammails (By default, the database is stored in the file ".spamoracle.db" in your home directory. This can be overriden with the -f option: spamoracle -f mydatabase add ... ) This assumes that the good, non-spam messages from the corpus are stored in the file "goodmails", and the known spam messages in the file "spammails". You can also fetch corpus messages from several files: spamoracle add -v -good goodmails1 ... goodmailsN \ -spam spammails1 ... spammailsP To check that the database was built correctly, and familiarize yourself with the statistical analysis performed by SpamOracle, invoke the "test" mode on the mailboxes that you just used for building the corpus: spamoracle test goodmails | more spamoracle test spammails | more For each message in the given mailboxes, you'll see a summary like this: From: bbo Subject: Check This Out Score: 1.00 -- 15 Details: refid:98 $$$$:98 surfing:98 asp:95 click:93 cable:92 instantly:90 https:88 internet:87 www:86 U4:85 isn't:14 month:81 com:75 surf:75 Attachments: cset="GB2312" type="application/octet-stream" name="Guangwen4.zip" The first two lines are just the From: and Subject: fields of the original message. The "Score:" line summarizes the result of the analysis. The first number (between 0.0 and 1.0) is the probability that the message is actually spam --- or, equivalently, the degree of similarity of the message with the spam messages in the corpus. The second number (an integer between 0 and 15) is the number of "interesting" words found in the message. "Interesting" words are those that occur at least 5 times in the corpus. In the example, we have 15 interesting words (the maximum) and a score of 1.00, indicating a spam with high certainty. The "Details:" line provides an explanation of the score. It lists the 15 most interesting words found in the message, that is, the 15 interesting words whose probability of denoting a spam is farthest away from the neutral 0.5. Each word is given with its individual score, written as a percentage (between 01 and 99) rather than as a probability so as to save space. Here, we see a number of very "spammish" words such as "$$$$" or "click", with probability 0.98 and 0.93 respectively, and a few "innocent" words such as "isn't" (probability 0.14). The "U4" word with probability 0.85 is actually a pseudo-word representing a 4-letter word all in uppercase -- something spammers are fond of. The "Attachments:" line summarizes some information about MIME attachments for this message. Here, we have one attachment of type "application/octect-stream", file name "Guangwen4.zip", and character set "GB2312". The latter is an encoding for Chinese and a solid hint that this is a Chinese spam (assuming that, like me, you can't read Chinese). Normally, when running "spamoracle test goodmails", most messages should come out with low score (0.2 or less), and when running "spamoracle test spammails", most messages should come out with a high score (0.8 or more). If not, your corpus isn't very good, or not well classified into spam and non-spam. To quickly see the outliers, you can reduce the interval of scores for which message summaries are displayed, as follows: spamoracle test -min 0.2 goodmails | more # Shows only good mails with score >= 0.2 spamoracle test -max 0.8 spammails | more # Shows only spam mails with score <= 0.8 Now, for a more challenging test, take a mailbox that contains unfiltered e-mails, i.e. a mixture of spam and legitimate e-mails, and run it through "spamoracle": spamoracle test mymailbox | less Marvel at how well the oracle recognizes spam from the rest! If the result isn't that marvelous to you, keep in mind that certain spams are just too short to be recognized (not enough significant words). Also, perhaps your corpus was too small, or not well categorized... USING SPAMORACLE AND PROCMAIL TO FILTER YOUR INCOMING E-MAIL: Once the database is built, you're ready to run incoming e-mails through SpamOracle. The command "spamoracle mark" reads one e-mail from standard input, and copies it to standard output, with two headers inserted: "X-Spam:" and "X-Attachments:". The X-Spam: header is as follows: X-Spam: yes; ;
or X-Spam: no; ;
or X-Spam: unknown; ;
The and
are as described for "spamoracle test". The "yes" / "no" / "unknown" synthesizes the results of the analysis: "yes": score >= 0.8 and at least 5 interesting words found "no": score <= 0.2 and at least 5 interesting words found "unknown": otherwise The "unknown" case generally occurs for very short messages, where not enough interesting words were found. The "X-Attachments:" header contains the same information as the "Attachments:" output of "spamoracle test", that is, a summary of the message attachments. To process automatically your incoming e-mail through "spamoracle" and act upon the results of the analysis, just insert the following "recipes" in the file ~/.procmailrc: :0fw | /usr/local/bin/spamoracle mark :0 * ^X-Spam: yes; spambox What these cryptic commands mean is: - Run every mail through the "spamoracle mark" command. (If spamoracle wasn't installed in /usr/local/bin, adjust the path as necessary.) This adds two headers to the message: "X-Spam:" and "X-Attachments:", describing the results of the spam analysis and the attachment analysis. - If we have an "X-Spam: yes" header, deliver the message to the file "spambox" rather than to your regular mailbox. Presumably, you'll read "spambox" once in a while, but less often than your regular mailbox. Daring users can put "/dev/null" instead of "spambox" to just throw away the message, but please don't do that until you've used SpamOracle for a while and are happy with the results. SpamOracle's false positive rate (i.e. legitimate mails classified as spam) is low (0.1% on my mail) but not null. So, better save the presumed spams somewhere, and scan them quickly from time to time. If you'd like to enjoy a bit of attachment-based filtering, here are some procmail rules for that: :0 * ^X-Attachments:.*name=".*\.(pif|scr|exe|bat)" spambox :0 * ^X-Attachments:.*type="audio/(x-wav|x-midi) spambox :0 * ^(Content-type:.*|X-Attachments:.*cset="|^Subject:.*=\?)(ks_c|gb2312|iso-2|euc-|big5|windows-1251) spambox The first rule treats as spam every mail that has a Windows executable as attachment. These mails are typically sent by viruses. The second rule does the same with attachments of type x-wav or x-midi. I never normally receive music by e-mail, however some popular e-mail viruses seem fond of these attachment types. The third rule treats as spam every mail that uses character encodings corresponding to Korean, Chinese, Japanese, and Cyrillic. Since I don't read any of these scripts, why not get rid of the messages immediately? UPDATING THE DATABASE: At any time, you can add more known spams or known legitimate messages to the database by using the "spamoracle add" command. For instance, if you find a spam message that was not classified as such, run it through "spamoracle add -spam", so that SpamOracle can learn from its mistake. (Without additional arguments, this command will read a single message from standard input and record it as spam.) Under mutt for instance, just highlight the spam message and type |spamoracle add -spam Similarly, if you find a legitimate message while checking your spam box, run it through "spamoracle add -good". Another option is to collect more known spams or more known good messages into mailbox files, and once in a while do spamoracle add -good new_good_mails or spamoracle add -spam new_spam_mails TECHNICAL DETAILS: SpamOracle's notion of "word" is the following: - any run of 3 to 12 letters, single quotes, dashes (-) - any run of 3 to 8 digits, dots, commas, dollar, euro and percent signs. If support for non-English european languages was compiled in, letters also include the relevant accented letters for the languages in question. All words are mapped to lowercase, and accented letters are mapped to the corresponding non-accented letters. In addition, a run of three or more uppercase letters generates a pseudo-word "Un" where n is the length of the run. Similarly, a run of three or more non-ASCII characters (code >= 128) generates a pseudo-word "Wn" where n is the length of the run. For instance, the following text: SUMMER in English is written "été" in French ¹²³ is processed into the following words, assuming French support was selected: U5 summer english written ete french W3 and if French support was not selected: U5 summer english written french W3 For your edification and entertainment, the contents of the database can be dumped with the "spamoracle list " command, where is an Emacs-style regexp specifying the words you're interested in, e.g.: spamoracle list '.*' # show all words -- big list! spamoracle list 'sex.*' spamoracle list 'caml.*' It is possible to tweak many of the parameters that govern filtering via the configuration file ~/.spamoracle.conf. The configurable parameters are listed and explained in the man page spamoracle.conf (5). All parameters have reasonable default values, but you may try to tweak them to get better filtering. spamoracle-release16/attachments.ml000066400000000000000000000052641356223044300176470ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Summarize the attachments of a message as one line that can be put in the header of the message. Allows procmail to filter suspicious attachments without looking at the message body. *) open Printf open Mail let re_content_type = Str.regexp "\\([/a-zA-Z0-9-]+\\)" let re_innocuous_content_types = Str.regexp_case_fold "text/plain\\|text/html\\|text/x-vcard\\|multipart/\\|message/rfc822\\|message/delivery-status" let re_charset = Str.regexp_case_fold "charset=\\(\"\\([^\"]+\\)\"\\|[^ \t;]+\\)" let re_innocuous_charsets = Str.regexp_case_fold "us-ascii\\|iso[-_]8859[-_]1$\\|iso[-_]8859[-_]15\\|windows-1252" let re_name = Str.regexp_case_fold "name=\\(\"\\([^\"]+\\)\"\\|[^ \t;]+\\)" let match_anchored re s = Str.string_match re s 0 let match_unanchored re s = try ignore (Str.search_forward re s 0); true with Not_found -> false let summarize msg = let res = Buffer.create 200 in let rec summ m = let h = header "content-type:" m in if match_anchored re_content_type h then begin let c = Str.matched_group 1 h in if not (Str.string_match re_innocuous_content_types c 0) then bprintf res "type=\"%s\" " c end; if match_unanchored re_charset h then begin let c = try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in if not (Str.string_match re_innocuous_charsets c 0) then bprintf res "cset=\"%s\" " c end; if match_unanchored re_name h then begin let c = try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in bprintf res "name=\"%s\" " c end; let h = header "content-disposition:" m in if match_unanchored re_name h then begin let c = try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in bprintf res "name=\"%s\" " c end; List.iter summ m.parts in List.iter summ msg.parts; Buffer.contents res spamoracle-release16/attachments.mli000066400000000000000000000020461356223044300200130ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Summarize the attachments of a message as one line that can be put in the header of the message. Allows procmail to filter suspicious attachments without looking at the message body. *) val summarize: Mail.message -> string spamoracle-release16/config.ml000066400000000000000000000037311356223044300165760ustar00rootroot00000000000000(* Configurable parameters *) let database_name = ref (try Filename.concat (Sys.getenv "HOME") ".spamoracle.db" with Not_found -> ".spamoracle.db") let html_add_tags = ref false let html_tag_attr = ref (Str.regexp_case_fold "a/href\\|img/src\\|img/alt\\|frame/src\\|font/face\\|font/color") let mail_headers = ref (Str.regexp_case_fold "from:\\|subject:") let alternative_favor_html = ref true let spam_header = ref "X-Spam" let attachments_header = ref "X-Attachments" let summarize_attachments = ref true let num_words_retained = ref 15 let max_repetitions = ref 2 let robinson_s = ref 0.0 let robinson_x = ref 0.5 let low_freq_limit = ref 0.01 let high_freq_limit = ref 0.99 let use_chi_square = ref false let good_mail_prob = ref 0.2 let spam_mail_prob = ref 0.8 let min_meaningful_words = ref 5 let summarize_referenced = ref false let referenced_header = ref "X-Referenced-Hosts" let reassemble_words = ref false let external_converter = ref "" open Configfile let options = [ "database_file", String database_name; "html_retain_tags", Bool html_add_tags; "html_tag_attributes", Regexp html_tag_attr; "mail_headers", Regexp mail_headers; "alternative_favor_html", Bool alternative_favor_html; "spam_header", String spam_header; "attachments_header", String attachments_header; "summarize_attachments", Bool summarize_attachments; "referenced_header", String referenced_header; "summarize_referenced", Bool summarize_referenced; "num_meaningful_words", Int num_words_retained; "max_repetitions", Int max_repetitions; "low_freq_limit", Float low_freq_limit; "high_freq_limit", Float high_freq_limit; "min_meaningful_words", Int min_meaningful_words; "good_mail_prob", Float good_mail_prob; "spam_mail_prob", Float spam_mail_prob; "robinson_s", Float robinson_s; "robinson_x", Float robinson_x; "use_chi_square", Bool use_chi_square; "reassemble_words", Bool reassemble_words; "external_converter", String external_converter ] spamoracle-release16/config.mli000066400000000000000000000045721356223044300167530ustar00rootroot00000000000000(* Configurable parameters *) val database_name : string ref (** Name of database file *) val html_add_tags : bool ref (** Whether to treat HTML tag names as words *) val html_tag_attr : Str.regexp ref (** Regexp matching [tag/attr] strings denoting pairs of HTML tag and attribute names. If a tag and attribute pair matches, the associated value is added to the text. *) val mail_headers : Str.regexp ref (** Regexp matching names of e-mail headers that must be analyzed. *) val alternative_favor_html : bool ref (** If true, consider only the HTML part of a multipart/alternative. Otherwise, consider all parts. *) val spam_header : string ref (** Name of header added with spam / not-spam info (default: "X-Spam") *) val attachments_header : string ref (** Name of header added with attachment summary (default: "X-Attachments") *) val summarize_attachments : bool ref (** Whether to generate the attachment summary *) val num_words_retained : int ref (** Number of meaningful words to retain for computing final prob. *) val max_repetitions : int ref (** Among the meaningful words, max number of time a given word can appear. *) val low_freq_limit : float ref (** Lower limit for word frequencies. Default is 0.001. *) val high_freq_limit : float ref (** Upper limit for word frequencies. Default is 0.999. *) val robinson_s : float ref val robinson_x : float ref (** Robinson's parameters for taking word frequencies into account. *) val use_chi_square : bool ref (** Use Robinson's chi-square test *) val min_meaningful_words : int ref (** Number of meaningful words below which mails are classified as unknown *) val good_mail_prob : float ref (** Spam probability below which mails are classified as good *) val spam_mail_prob : float ref (** Spam probability below which mails are classified as spam *) val summarize_referenced : bool ref val referenced_header : string ref val reassemble_words : bool ref val external_converter : string ref (** Program to be called on message parts that are not text. The program receives the content-type as first argument and the actual data on standard input. It should output the corresponding text on standard output, or exit with non-zero error code if it cannot extract text. *) val options : (string * Configfile.value) list (** List of configurable parameters *) spamoracle-release16/configfile.ml000066400000000000000000000054051356223044300174360ustar00rootroot00000000000000(* Parsing configuration files *) type value = | Bool of bool ref | String of string ref | Int of int ref | Float of float ref | Regexp of Str.regexp ref | OptBool of bool option ref | OptString of string option ref | OptInt of int option ref | OptFloat of float option ref | OptRegexp of Str.regexp option ref exception Error of string let re_bool_yes = Str.regexp_case_fold "\\(on\\|yes\\|true\\|1\\)[ \t\r]*$" let re_bool_no = Str.regexp_case_fold "\\(off\\|no\\|false\\|0\\)[ \t\r]*$" let parse_bool data = if Str.string_match re_bool_yes data 0 then true else if Str.string_match re_bool_no data 0 then false else raise (Error "invalid boolean value") let trim_spaces s = let i = ref (String.length s - 1) in while !i >= 0 && (let c = s.[!i] in c = ' ' || c = '\t' || c = '\r') do decr i done; String.sub s 0 (!i + 1) let parse_string data = try Scanf.sscanf data "%S" (fun s -> s) with Scanf.Scan_failure _ -> trim_spaces data let parse_int data = try Scanf.sscanf data "%i" (fun n -> n) with Scanf.Scan_failure _ | Failure _ -> raise (Error ("invalid integer value")) let parse_float data = try Scanf.sscanf data "%f" (fun n -> n) with Scanf.Scan_failure _ | Failure _ -> raise (Error ("invalid floating-point value")) let parse_regexp data = try Str.regexp_case_fold (trim_spaces data) with Failure msg -> raise (Error ("invalid regular expression: " ^ msg)) let parse_data valuedesc data = match valuedesc with | Bool r -> r := parse_bool data | String r -> r := parse_string data | Int r -> r := parse_int data | Float r -> r := parse_float data | Regexp r -> r := parse_regexp data | OptBool r -> r := Some(parse_bool data) | OptString r -> r := Some(parse_string data) | OptInt r -> r := Some(parse_int data) | OptFloat r -> r := Some(parse_float data) | OptRegexp r -> r := Some(parse_regexp data) let re_line = Str.regexp "[ \t]*\\([A-Za-z][A-Za-z0-9_]*\\)[ \t]*=[ \t]*\\(.*\\)" let re_skip = Str.regexp "#\\|[ \t\r]*$" let parse_line opts s = if Str.string_match re_line s 0 then begin let key = Str.matched_group 1 s and data = Str.matched_group 2 s in try parse_data (List.assoc key opts) data with Not_found -> raise (Error ("unknown variable " ^ key)) end else if not (Str.string_match re_skip s 0) then raise (Error "ill-formed line") let parse opts filename = let ic = open_in filename in let lineno = ref 1 in let errors = ref [] in begin try while true do let s = input_line ic in begin try parse_line opts s with Error msg -> errors := (!lineno, msg) :: !errors end; incr lineno done with End_of_file -> close_in ic end; List.rev !errors spamoracle-release16/configfile.mli000066400000000000000000000006171356223044300176070ustar00rootroot00000000000000(* Parsing configuration files *) type value = | Bool of bool ref | String of string ref | Int of int ref | Float of float ref | Regexp of Str.regexp ref | OptBool of bool option ref | OptString of string option ref | OptInt of int option ref | OptFloat of float option ref | OptRegexp of Str.regexp option ref val parse: (string * value) list -> string -> (int * string) list spamoracle-release16/database.ml000066400000000000000000000154101356223044300170720ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Word frequency database *) exception Error of string type short = { s_num_good: int; s_num_spam: int; s_freq: (string, int * int) Hashtbl.t } type full = { mutable f_num_good: int; mutable f_num_spam: int; f_high_freq: (string, int * int) Hashtbl.t; f_low_freq: (string, int * int) Hashtbl.t } let magic = "Mailscrubber" (* + 4 digits for version number *) let check_magic filename ic = let mlen = String.length magic in let buf = really_input_string ic (mlen + 4) in if String.sub buf 0 mlen <> magic then raise(Error(filename ^ ": bad magic number")); try int_of_string (String.sub buf mlen 4) with Failure _ -> raise(Error(filename ^ ": bad magic number")); type db_chan = {zipped : bool ; ic : in_channel} let open_db filename = if Filename.check_suffix filename ".gz" then { ic = Unix.open_process_in ("gunzip -c " ^ filename); zipped = true; } else { ic = open_in_bin filename ; zipped = false } let close_db {zipped = zipped ; ic = ic } = if zipped then ignore(Unix.close_process_in ic) else close_in ic let current_version = if Sys.ocaml_version < "4.03" then 1 else 2 let read_hashtbl filename ic version = try let tbl : ('a, 'b) Hashtbl.t = Marshal.from_channel ic in if version = current_version then tbl else if version > current_version then raise (Error(filename ^ ": database version not supported")) else begin Printf.eprintf "%s: converting from version %d to version %d\n\ Run 'spamoracle upgrade' to suppress this warning.\n%!" filename version current_version; let tbl' = Hashtbl.create (Hashtbl.length tbl / 3) in Hashtbl.iter (fun k d -> Hashtbl.add tbl' k d) tbl; tbl' end with Failure _ -> raise (Error(filename ^ ": database is corrupted")) let read_short filename = let {ic=ic ; zipped=zipped} as db_ic = open_db filename in let version = check_magic filename ic in let ng = input_binary_int ic in let ns = input_binary_int ic in let freq = read_hashtbl filename ic version in close_db db_ic; { s_num_good = ng; s_num_spam = ns; s_freq = freq } let read_full filename = let {ic=ic ; zipped=zipped} as db_ic = open_db filename in let version = check_magic filename ic in let ng = input_binary_int ic in let ns = input_binary_int ic in let high_freq = read_hashtbl filename ic version in let low_freq = read_hashtbl filename ic version in close_db db_ic; { f_num_good = ng; f_num_spam = ns; f_low_freq = low_freq; f_high_freq = high_freq } let temp_file basename = let pid = Unix.getpid() in let rec tmpfile counter = if counter > 10000 then raise (Error "cannot create temporary database"); let filename = basename ^ string_of_int (pid + counter) in try (filename, open_out_gen [Open_wronly; Open_creat; Open_excl; Open_binary] 0o600 filename) with Sys_error _ -> tmpfile (counter + 1) in tmpfile 0 let write_full filename db = let basename, zip = if Filename.check_suffix filename ".gz" then Filename.chop_suffix filename ".gz", true else filename, false in let (tempname, oc) = temp_file (basename ^ ".tmp") in Printf.fprintf oc "%s%04d" magic current_version; output_binary_int oc db.f_num_good; output_binary_int oc db.f_num_spam; Marshal.to_channel oc db.f_high_freq [Marshal.No_sharing]; Marshal.to_channel oc db.f_low_freq [Marshal.No_sharing]; close_out oc; if zip then begin let r = Sys.command ("gzip -best " ^ tempname) in if r = 0 then Sys.rename (tempname ^ ".gz") filename else Sys.rename tempname basename end else Sys.rename tempname filename let create sz = { f_num_good = 0; f_num_spam = 0; f_high_freq = Hashtbl.create sz; f_low_freq = Hashtbl.create sz } let add_good db w = begin try let (g, s as f) = Hashtbl.find db.f_high_freq w in Hashtbl.replace db.f_high_freq w (g+1, s) with Not_found -> try let (g, s as f) = Hashtbl.find db.f_low_freq w in let g' = g + 1 in if 2 * g' + s >= 5 then begin Hashtbl.remove db.f_low_freq w; Hashtbl.add db.f_high_freq w (g', s) end else Hashtbl.replace db.f_low_freq w (g', s) with Not_found -> Hashtbl.add db.f_low_freq w (1, 0) end let add_spam db w = begin try let (g, s) = Hashtbl.find db.f_high_freq w in Hashtbl.replace db.f_high_freq w (g, s+1) with Not_found -> try let (g, s) = Hashtbl.find db.f_low_freq w in let s' = s + 1 in if 2 * g + s' >= 5 then begin Hashtbl.remove db.f_low_freq w; Hashtbl.add db.f_high_freq w (g, s') end else Hashtbl.replace db.f_low_freq w (g, s') with Not_found -> Hashtbl.add db.f_low_freq w (0, 1) end open Printf let dump db oc = let dump_entry w (g, s) = fprintf oc "%s %d %d\n" w g s in fprintf oc "SPAMORACLE/1 %d %d\n" db.f_num_good db.f_num_spam; Hashtbl.iter dump_entry db.f_high_freq; Hashtbl.iter dump_entry db.f_low_freq let split s = try let i = String.index s ' ' in let j = String.index_from s (i + 1) ' ' in (String.sub s 0 i, int_of_string (String.sub s (i + 1) (j - i - 1)), int_of_string (String.sub s (j + 1) (String.length s - j - 1))) with Not_found -> raise(Error("Database restoration: ill-formed line `" ^ String.escaped s ^ "'")) let restore ic = let db = create 997 in begin try let (w, ng, ns) = split (input_line ic) in if w <> "SPAMORACLE/1" then raise (Error("Database restoration: wrong version")); db.f_num_good <- ng; db.f_num_spam <- ns with End_of_file -> raise (Error("Database restoration: first line missing")); end; begin try while true do let (w, g, s) = split (input_line ic) in if 2 * g + s >= 5 then Hashtbl.add db.f_high_freq w (g, s) else Hashtbl.add db.f_low_freq w (g, s) done with End_of_file -> () end; db let in_short db w = Hashtbl.mem db.s_freq w let in_full db w = Hashtbl.mem db.f_high_freq w spamoracle-release16/database.mli000066400000000000000000000027471356223044300172540ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (** Word frequency database *) type short = { s_num_good: int; s_num_spam: int; s_freq: (string, int * int) Hashtbl.t } type full = { mutable f_num_good: int; mutable f_num_spam: int; f_high_freq: (string, int * int) Hashtbl.t; f_low_freq: (string, int * int) Hashtbl.t } val read_short: string -> short val read_full: string -> full val write_full: string -> full -> unit val create: int -> full val add_good: full -> string -> unit val add_spam: full -> string -> unit val dump: full -> out_channel -> unit val restore: in_channel -> full val in_short: short -> string -> bool val in_full: full -> string -> bool val current_version: int exception Error of string spamoracle-release16/htmlscan.mli000066400000000000000000000017271356223044300173160ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (** Approximate HTML scanner. Extracts words and certain parameters of certain tags (e.g. URLs) from HTML text. *) val extract_text: string -> string spamoracle-release16/htmlscan.mll000066400000000000000000000166201356223044300173170ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (** Approximate HTML scanner. Extracts words from HTML text, as well as certain parameters of certain tags (e.g. URLs). *) { module StringSet = Set.Make(String) module StringMap = Map.Make(String) let re_url_encoding = Str.regexp "%\\([0-9A-Fa-f][0-9A-Fa-f]\\)" let decode_url_percent s = let n = int_of_string ("0x" ^ Str.matched_group 1 s) in String.make 1 (Char.chr n) let decode_url s = Str.global_substitute re_url_encoding decode_url_percent s let entity_table = List.fold_left (fun t (s, c) -> StringMap.add s c t) StringMap.empty ["amp", '&'; "lt", '<'; "gt", '>'; "nbsp", '\160'; "Agrave", '\192'; "Aacute", '\193'; "Acirc", '\194'; "Atilde", '\195'; "Auml", '\196'; "Aring", '\197'; "AElig", '\198'; "Ccedil", '\199'; "Egrave", '\200'; "Eacute", '\201'; "Ecirc", '\202'; "Euml", '\203'; "Igrave", '\204'; "Iacute", '\205'; "Icirc", '\206'; "Iuml", '\207'; "ETH", '\208'; "Ntilde", '\209'; "Ograve", '\210'; "Oacute", '\211'; "Ocirc", '\212'; "Otilde", '\213'; "Ouml", '\214'; "times", '\215'; "Oslash", '\216'; "Ugrave", '\217'; "Uacute", '\218'; "Ucirc", '\219'; "Uuml", '\220'; "Yacute", '\221'; "THORN", '\222'; "szlig", '\223'; "agrave", '\224'; "aacute", '\225'; "acirc", '\226'; "atilde", '\227'; "auml", '\228'; "aring", '\229'; "aelig", '\230'; "ccedil", '\231'; "egrave", '\232'; "eacute", '\233'; "ecirc", '\234'; "euml", '\235'; "igrave", '\236'; "iacute", '\237'; "icirc", '\238'; "iuml", '\239'; "eth", '\240'; "ntilde", '\241'; "ograve", '\242'; "oacute", '\243'; "ocirc", '\244'; "otilde", '\245'; "ouml", '\246'; "divide", '\247'; "oslash", '\248'; "ugrave", '\249'; "uacute", '\250'; "ucirc", '\251'; "uuml", '\252'; "yacute", '\253'; "thorn", '\254'; "yuml", '\255'] let word_breaking_tags = List.fold_right StringSet.add [ "p"; "br"; "ul"; "ol"; "dt"; "li"; "dd"; "table"; "tr"; "th"; "td"; "img"; "div"; "blockquote"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "address" ] StringSet.empty module Output = struct type t = { txt: Buffer.t; extra: Buffer.t } let create() = { txt = Buffer.create 2048; extra = Buffer.create 256 } let clear ob = Buffer.clear ob.txt; Buffer.clear ob.extra let contents ob = Buffer.add_char ob.txt '\n'; Buffer.add_buffer ob.txt ob.extra; Buffer.contents ob.txt let char ob c = Buffer.add_char ob.txt c let string ob s = Buffer.add_string ob.txt s let add_extra ob s = Buffer.add_string ob.extra s; Buffer.add_char ob.extra '\n' let tag ob t = if StringSet.mem t word_breaking_tags then char ob ' '; if !Config.html_add_tags then add_extra ob t let tag_attr ob t n s = let n = String.lowercase_ascii n in if Str.string_match !Config.html_tag_attr (t ^ "/" ^ n) 0 then if n = "href" || n = "src" then add_extra ob (decode_url s) else add_extra ob s end let ob = Output.create() let tag = ref "" let attr_name = ref "" let attr_value = Buffer.create 128 } let ws = [' ' '\n' '\r' '\t'] let name = ['A'-'Z' 'a'-'z'] ['A'-'Z' 'a'-'z' '0'-'9' '.' '-']* let unquotedattrib = [^ '\'' '\"' '>' ' ' '\n' '\r' '\t'] [^ '>' ' ' '\n' '\r' '\t']* rule main = parse "" (* tolerance *) { () } | _ (* tolerance *) { comment lexbuf } | eof (* tolerance *) { () } and tagbody = parse ">" { Output.tag ob !tag } | name { attr_name := Lexing.lexeme lexbuf; tagattrib lexbuf; tagbody lexbuf } | _ (* tolerance -- should be ws *) { tagbody lexbuf } | eof (* tolerance *) { Output.tag ob !tag } and tagattrib = parse ws* '=' ws* { tagvalue lexbuf } | "" { Output.tag_attr ob !tag !attr_name "" } and tagvalue = parse "'" { Buffer.clear attr_value; singlequoted lexbuf } | "\"" { Buffer.clear attr_value; doublequoted lexbuf } | unquotedattrib { Output.tag_attr ob !tag !attr_name (Lexing.lexeme lexbuf) } | "" (* tolerance *) { Output.tag_attr ob !tag !attr_name "" } and singlequoted = parse "'" | eof (* eof is tolerance *) { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) } | "&" { Buffer.add_char attr_value (entity lexbuf); singlequoted lexbuf } | _ { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0); singlequoted lexbuf } and doublequoted = parse "\"" | eof (* eof is tolerance *) { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) } | "&" { Buffer.add_char attr_value (entity lexbuf); doublequoted lexbuf } | _ { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0); doublequoted lexbuf } and entity = parse '#' ['0'-'9']+ { let s = Lexing.lexeme lexbuf in let n = int_of_string (String.sub s 1 (String.length s - 1)) in entity_end lexbuf; if n >= 0 && n <= 255 then Char.chr n else '\255' } | name { let s = Lexing.lexeme lexbuf in entity_end lexbuf; try StringMap.find s entity_table with Not_found -> '\255' } | "" (* tolerance *) { '&' } and entity_end = parse ";" ? { () } { let extract_text s = Output.clear ob; main (Lexing.from_string s) } spamoracle-release16/mail.ml000066400000000000000000000204101356223044300162440ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (** Parsing of e-mail messages, including attachments *) type message = { headers: (string * string) list; body: string; parts: message list } let base64_decode_char c = match c with 'A' .. 'Z' -> Char.code c - 65 | 'a' .. 'z' -> Char.code c - 97 + 26 | '0' .. '9' -> Char.code c - 48 + 52 | '+' -> 62 | '/' -> 63 | _ -> -1 let decode_base64 s = let d = Buffer.create (String.length s * 3 / 4) in let buf = Array.make 4 0 in let pos = ref 0 in for i = 0 to String.length s - 1 do let n = base64_decode_char s.[i] in if n >= 0 then begin buf.(!pos) <- n; incr pos; if !pos = 4 then begin Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4)); Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2)); Buffer.add_char d (Char.chr((buf.(2) land 3) lsl 6 + buf.(3))); pos := 0 end end done; begin match !pos with 2 -> Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4)) | 3 -> Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4)); Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2)) | _ -> () end; Buffer.contents d let hexa_digit c = if c >= '0' && c <= '9' then Char.code c - 48 else if c >= 'A' && c <= 'F' then Char.code c - 65 + 10 else if c >= 'a' && c <= 'f' then Char.code c - 97 + 10 else raise Not_found let decode_qp s = let len = String.length s in let d = Buffer.create (String.length s) in let pos = ref 0 in while !pos < len do let c = s.[!pos] in if c = '=' && !pos + 1 < len && s.[!pos + 1] = '\n' then begin pos := !pos + 2 end else if c = '=' && !pos + 2 < len then begin try let h1 = hexa_digit s.[!pos + 1] and h2 = hexa_digit s.[!pos + 2] in Buffer.add_char d (Char.chr(h1 lsl 4 + h2)); pos := !pos + 3 with Not_found -> Buffer.add_char d c; incr pos end else begin Buffer.add_char d c; incr pos end done; Buffer.contents d let re_base64 = Str.regexp_case_fold "base64" let re_qp = Str.regexp_case_fold "quoted-printable" let decode encoding s = if Str.string_match re_base64 encoding 0 then decode_base64 s else if Str.string_match re_qp encoding 0 then decode_qp s else s let re_encoded_header = Str.regexp "=\\?[_A-Za-z0-9-]+\\?\\([BbQq]\\)\\?\\([^?]*\\)\\?=" let decode_header s = let decode_group s = let enc = Str.matched_group 1 s and txt = Str.matched_group 2 s in match enc with "B" | "b" -> decode_base64 txt | "Q" | "q" -> decode_qp txt | _ -> assert false in Str.global_substitute re_encoded_header decode_group s let re_continuation = Str.regexp "\n[ \t]+" let re_nl = Str.regexp "\n" let re_field = Str.regexp "\\([A-Za-z-]+[: ]\\)[ \t]*\\(.*\\)" let parse_header s = let rec parse_field accu = function [] -> List.rev accu | line :: rem -> if Str.string_match re_field line 0 then begin let field_name = String.lowercase_ascii (Str.matched_group 1 line) and field_val = Str.matched_group 2 line in parse_field ((field_name, decode_header field_val) :: accu) rem end else parse_field accu rem in parse_field [] (Str.split re_nl (Str.global_replace re_continuation " " s)) let find_header name headers = try List.assoc name headers with Not_found -> "" let re_nl_nl = Str.regexp "\n\n" let re_multipart = Str.regexp_case_fold "multipart/.*boundary *= *\\(\"\\([^\"]+\\)\"\\|\\([^ \t]+\\)\\)" let rec parse_message s = try let pos_sep = Str.search_forward re_nl_nl s 0 in let headers = parse_header (String.sub s 0 pos_sep) in let body = String.sub s (pos_sep + 2) (String.length s - pos_sep - 2) in let encoding = find_header "content-transfer-encoding:" headers in let ctype = find_header "content-type:" headers in if Str.string_match re_multipart ctype 0 then begin let boundary = try Str.matched_group 2 ctype with Not_found -> try Str.matched_group 3 ctype with Not_found -> assert false in let re_bound = Str.regexp ("--" ^ Str.quote boundary ^ "[ \t\n]*") in match Str.split_delim re_bound body with [] -> { headers = headers; body = decode encoding body; parts = [] } | blurb :: parts -> { headers = headers; body = decode encoding blurb; parts = List.map parse_message parts } end else { headers = headers; body = decode encoding body; parts = [] } with Not_found -> { headers = []; body = s; parts = [] } let safe_remove fname = try Sys.remove fname with Sys_error _ -> () let run_body_through_external_converter cmd arg body = let infile = Filename.temp_file "spamoracle" ".data" in let outfile = Filename.temp_file "spamoracle" ".txt" in let oc = open_out_bin infile in let ic = open_in_bin outfile in output_string oc body; close_out oc; let retcode = Sys.command (Printf.sprintf "%s %s < %s > %s" cmd (Filename.quote arg) infile outfile) in if retcode <> 0 then begin close_in ic; safe_remove infile; safe_remove outfile; None end else begin let len = in_channel_length ic in let res = really_input_string ic len in close_in ic; safe_remove infile; safe_remove outfile; Some res end let header s msg = let rec hdr = function [] -> [] | (h,v) :: rem -> if h = s then v :: hdr rem else hdr rem in String.concat "\n" (hdr msg.headers) let header_matches s re msg = let rec hmatch = function [] -> false | (h,v) :: rem -> (h = s && Str.string_match re v 0) || hmatch rem in hmatch msg.headers let re_content_text = Str.regexp_case_fold "text/plain\\|text/enriched\\|text;\\|text$" let re_content_html = Str.regexp_case_fold "text/html" let re_content_message_rfc822 = Str.regexp_case_fold "message/rfc822" let re_content_alternative = Str.regexp_case_fold "multipart/alternative" let re_content_multipart = Str.regexp_case_fold "multipart/" let re_content_any_text = Str.regexp_case_fold "text/" let rec iter_text_parts fn m = if header_matches "content-type:" re_content_text m || not(List.mem_assoc "content-type:" m.headers) then fn m else if header_matches "content-type:" re_content_html m then fn {m with body = Htmlscan.extract_text m.body} else if header_matches "content-type:" re_content_alternative m then begin try if not !Config.alternative_favor_html then raise Not_found; iter_text_parts fn (List.find (header_matches "content-type:" re_content_html) m.parts) with Not_found -> fn m; List.iter (iter_text_parts fn) m.parts end else if header_matches "content-type:" re_content_multipart m then begin fn m; List.iter (iter_text_parts fn) m.parts end else if header_matches "content-type:" re_content_message_rfc822 m then iter_text_parts fn (parse_message m.body) else if !Config.external_converter <> "" && not (header_matches "content-type:" re_content_any_text m) then begin match run_body_through_external_converter !Config.external_converter (header "content-type:" m) m.body with | None -> () | Some txt -> fn {m with body = txt} end else () let iter_message fn msg = List.iter (fun (h, v) -> if Str.string_match !Config.mail_headers h 0 then fn v) msg.headers; iter_text_parts (fun m -> fn m.body) msg spamoracle-release16/mail.mli000066400000000000000000000044151356223044300164240ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (** Parsing of e-mail messages, including attachments *) type message = { headers: (string * string) list; body: string; parts: message list } (** The type of parsed e-mail messages. - [headers] is an A-list of pairs [(header-name, header-content)]. [header-name] is lowercased and includes [:], e.g. [subject:]. - [body] is the body of the message. Base64 and quoted-printable encodings are already decoded. For multipart messages, [body] is the initial blurb before the first part. - [parts] is empty except for multipart messages, in which case it lists all parts, recursively represented as messages. *) val parse_message: string -> message (** Parse the given textual message and return its structure. *) val header: string -> message -> string (** [header h msg] returns the contents of header named [h] in message [msg], or the empty string if this header is missing. Remember that header names are lowercased and include the final [:], e.g. [subject:]. *) val iter_text_parts: (message -> unit) -> message -> unit (** [iter_text_parts fn msg] applies [fn] to every (sub-)message contained in [msg] that is of type text. *) val iter_message: (string -> unit) -> message -> unit (** [iter_message fn msg] applies [fn] to the following parts of message [msg]: - the headers that match [!Config.mail_headers]; - the body of every sub-message of [msg] that is of type text. *) spamoracle-release16/main.ml000066400000000000000000000222451356223044300162560ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Argument parsing and main program *) open Printf open Mbox open Database open Processing exception Usage of string let default_config_name = try Filename.concat (Sys.getenv "HOME") ".spamoracle.conf" with Not_found -> ".spamoracle.conf" let parse_config_file file = try let errs = Configfile.parse Config.options file in if errs <> [] then begin eprintf "Error while reading configuration file %s:\n" file; List.iter (fun (line, msg) -> eprintf "Line %d: %s\n" line msg) errs; exit 2 end with Sys_error msg -> eprintf "Cannot read configuration file %s:\n%s\n" file msg let mark_command args = let db = Database.read_short !Config.database_name in if args = [] then mark_message db (read_single_msg stdin) else List.iter (fun f -> mbox_file_iter f (mark_message db)) args let add_command args = let db = try Database.read_full !Config.database_name with Sys_error _ -> Database.create 997 in let processed = ref false and is_spam = ref true and verbose = ref false in let rec parse_args = function | "-v" :: rem -> verbose := true; parse_args rem | "-spam" :: rem -> is_spam := true; parse_args rem | "-good" :: rem -> is_spam := false; parse_args rem | f :: rem -> mbox_file_iter f (add_message db !verbose !is_spam); processed := true; parse_args rem | [] -> if not !processed then add_message db !verbose !is_spam (read_single_msg stdin); if !verbose then printf "\r%6d / %6d good / spam messages\n" db.f_num_good db.f_num_spam in parse_args args; Database.write_full !Config.database_name db let list_command args = let db = Database.read_full !Config.database_name in let res = ref [] in List.iter (fun s -> let re = Str.regexp (s ^ "$") in let match_word w (g, s) = if Str.string_match re w 0 then begin let p = if 2 * g + s < 5 then -1.0 else Rankmsg.word_proba g s db.f_num_good db.f_num_spam in res := (w, p, g, s) :: !res end in Hashtbl.iter match_word db.f_high_freq; Hashtbl.iter match_word db.f_low_freq) args; if !res = [] then Printf.printf "No matching word found in database.\n" else begin Printf.printf " Word Proba #good #spam\n"; List.iter (fun (w, p, g, s) -> if p >= 0.0 then Printf.printf "%-15s%8.2f%8d%8d\n" w p g s else Printf.printf "%-15s ----%8d%8d\n" w g s) (List.sort (fun (_, p1, _, _) (_, p2, _, _) -> compare p2 p1) !res) end let test_command args = let db = Database.read_short !Config.database_name in let low = ref 0.0 and high = ref 1.0 in let rec parse_args = function | "-min" :: s :: rem -> begin try low := float_of_string s with Failure _ -> raise(Usage("bad argument to -min")) end; parse_args rem | "-min" :: [] -> raise(Usage("no argument to -min")) | "-max" :: s :: rem -> begin try high := float_of_string s with Failure _ -> raise(Usage("bad argument to -max")) end; parse_args rem | "-max" :: [] -> raise(Usage("no argument to -max")) | f :: rem -> mbox_file_iter f (test_message db !low !high f); parse_args rem | [] -> () in parse_args args let stat_command args = let db = Database.read_short !Config.database_name in let stat_mbox f = let num_msgs = ref 0 and num_good = ref 0 and num_spam = ref 0 and num_unknown = ref 0 in mbox_file_iter f (fun s -> incr num_msgs; match stat_message db s with Msg_good -> incr num_good | Msg_spam -> incr num_spam | Msg_unknown -> incr num_unknown); let percentage a b = 100.0 *. float a /. float b in if !num_msgs > 0 then printf "%s: %d (%.2f%%) good, %d (%.2f%%) unknown, %d (%.2f%%) spam\n" f !num_good (percentage !num_good !num_msgs) !num_unknown (percentage !num_unknown !num_msgs) !num_spam (percentage !num_spam !num_msgs) in List.iter stat_mbox args let words_command args = let db = Database.read_short !Config.database_name in if args = [] then wordsplit_message db (read_single_msg stdin) else List.iter (fun f -> mbox_file_iter f (fun msg -> print_string "----------------------------------------\n"; wordsplit_message db msg)) args let backup_command () = Database.dump (Database.read_full !Config.database_name) stdout let restore_command () = Database.write_full !Config.database_name (Database.restore stdin) let upgrade_command () = let db = Database.read_full !Config.database_name in Database.write_full !Config.database_name db; printf "Converted %s to version %d.\n" !Config.database_name Database.current_version let rec parse_args_1 = function "-config" :: file :: rem -> parse_config_file file; parse_args_2 rem | "-config" :: [] -> raise(Usage("Option -config requires an argument")) | rem -> if Sys.file_exists default_config_name then parse_config_file default_config_name; parse_args_2 rem and parse_args_2 = function | "-f" :: file :: rem -> Config.database_name := file; parse_args_3 rem | "-f" :: [] -> raise(Usage("Option -f requires an argument")) | rem -> parse_args_3 rem and parse_args_3 = function "mark" :: rem -> mark_command rem | "add" :: rem -> add_command rem | "list" :: rem -> list_command rem | "test" :: rem -> test_command rem | "stat" :: rem -> stat_command rem | "backup" :: rem -> backup_command () | "restore" :: rem -> restore_command () | "words" :: rem -> words_command rem | "upgrade" :: rem -> upgrade_command () | s :: rem -> raise(Usage("Unknown command " ^ s)) | [] -> raise(Usage "") let usage_string = "\ Usage: spamoracle [-config conf] [-f db] mark {mailbox}* Add 'X-Spam:' headers to messages with result of analysis {mailbox}* Mailboxes containing messages to analyze and mark If none given, read single msg from standard input spamoracle [-config conf] [-f db] add [-v] -spam {spambox}* -good {goodbox}* Create or update database with known spam or non-spam messages -v Print progress bar -spam Indicate subsequent mailboxes contain spam -good Indicate subsequent mailboxes contain good msgs (not spam) {spambox}* Mailboxes containing spam {goodbox}* Mailboxes containing good messages (not spam) If no mailbox given, read single msg from standard input spamoracle [-config conf] [-f db] test [-min prob] [-max prob] {mailbox}* Analyze messages and print summary of results for each message -min Don't print messages with result below -max Don't print messages with result above {mailbox}* Mailboxes containing messages to analyze spamoracle [-config conf] [-f db] stat {mailbox}* Analyze messages and print percentages of spam/non-spam for each mailbox {mailbox}* Mailboxes containing messages to analyze spamoracle [-config conf] [-f db] list {regexp}* Dump word statistics in database {regexp}* Regular expressions for words we are interested in spamoracle [-config conf] [-f db] backup > database.backup Dump whole database in portable text format on standard output spamoracle [-config conf] [-f db] restore < database.backup Restore database from text backup file read from standard input spamoracle [-config conf] [-f db] upgrade Convert database to the latest format spamoracle [-config conf] [-f db] words {mailbox}* Extract words from messages and print them {mailbox}* Mailboxes containing messages to scan If no mailbox given, read single msg from standard input Common options: -config Configuration file (default $HOME/.spamoracle.conf) -f Database to use (default $HOME/.spamoracle.db)" let main () = try parse_args_1 (List.tl (Array.to_list Sys.argv)) with | Usage msg -> eprintf "%s\n%s\n" msg usage_string; exit 2 | Sys_error msg -> eprintf "System error: %s\n" msg | Database.Error msg -> eprintf "%s\n" msg let _ = main() spamoracle-release16/mbox.ml000066400000000000000000000052321356223044300162740ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Reading of a mailbox file and splitting into individual messages *) type t = { ic: in_channel; zipped: bool; mutable start: string; buf: Buffer.t } let open_mbox_file filename = if Filename.check_suffix filename ".gz" then { ic = Unix.open_process_in ("gunzip -c " ^filename); zipped = true; start = ""; buf = Buffer.create 50000 } else { ic = open_in filename; zipped = false; start = ""; buf = Buffer.create 50000 } let open_mbox_channel ic = { ic = ic; zipped = false; start = ""; buf = Buffer.create 50000 } let read_msg t = Buffer.clear t.buf; Buffer.add_string t.buf t.start; let rec read () = let line = input_line t.ic in if String.length line >= 5 && String.sub line 0 5 = "From " && Buffer.length t.buf > 0 then begin t.start <- (line ^ "\n"); Buffer.contents t.buf end else begin Buffer.add_string t.buf line; Buffer.add_char t.buf '\n'; read () end in try read() with End_of_file -> if Buffer.length t.buf > 0 then begin t.start <- ""; Buffer.contents t.buf end else raise End_of_file let close_mbox t = if t.zipped then ignore(Unix.close_process_in t.ic) else close_in t.ic let mbox_file_iter filename fn = let ic = open_mbox_file filename in try while true do fn(read_msg ic) done with End_of_file -> close_mbox ic let mbox_channel_iter inchan fn = let ic = open_mbox_channel inchan in try while true do fn(read_msg ic) done with End_of_file -> close_mbox ic let read_single_msg inchan = let res = Buffer.create 10000 in let buf = Bytes.create 1024 in let rec read () = let n = input inchan buf 0 (Bytes.length buf) in if n > 0 then begin Buffer.add_subbytes res buf 0 n; read () end in read (); Buffer.contents res spamoracle-release16/mbox.mli000066400000000000000000000036751356223044300164560ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (** Reading of a mailbox file and splitting into individual messages *) type t (** The type of channels opened on a mailbox *) val open_mbox_file: string -> t (** Open the given file name as a mailbox, and return an mbox channel ready for reading. If the file name ends in [.gz], arrange for on-the-fly decompression with [zcat]. *) val open_mbox_channel: in_channel -> t (** Open the given input channel as a mailbox. *) val read_msg: t -> string (** Read the next message from the given channel, and return it as a string. Raise [End_of_file] if no message remains. *) val close_mbox: t -> unit (** Close the given mbox channel. *) val mbox_file_iter: string -> (string -> unit) -> unit (** [mbox_file_iter filename fn] reads messages from the file named [filename], and applies [fn] in turn to each message. *) val mbox_channel_iter: in_channel -> (string -> unit) -> unit (** [mbox_channel_iter ic fn] reads messages from the input channel [ic], and applies [fn] in turn to each message. *) val read_single_msg: in_channel -> string (** Read one message from the given channel, up to end of file *) spamoracle-release16/processing.ml000066400000000000000000000075351356223044300175130ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Processing messages *) open Printf open Mail open Database open Rankmsg (* Mark message with rank info *) let re_nl_nl = Str.regexp "\n\n" let mark_message db txt = let m = parse_message txt in let r = rank_message db m in try let pos_sep = Str.search_forward re_nl_nl txt 0 in output_substring stdout txt 0 pos_sep; let verdict = if r.spam_prob <= !Config.good_mail_prob && r.num_meaningful >= !Config.min_meaningful_words then "no" else if r.spam_prob >= !Config.spam_mail_prob && r.num_meaningful >= !Config.min_meaningful_words then "yes" else "unknown" in printf "\n%s: %s; %.2f; %s" !Config.spam_header verdict r.spam_prob r.explanation; if !Config.summarize_attachments then begin let att = Attachments.summarize m in if att <> "" then printf "\n%s: %s" !Config.attachments_header att; end; if !Config.summarize_referenced then begin let refh = Refhosts.summarize () in if refh <> "" then printf "\n%s: %s" !Config.referenced_header refh; end; output_substring stdout txt pos_sep (String.length txt - pos_sep) with Not_found -> print_string txt (* Add messages to database *) let record_words db is_spam txt = Wordsplit.iter (fun w -> if is_spam then add_spam db w else add_good db w) (in_full db) txt let add_message db verbose is_spam msg = if verbose then begin printf "\r%6d / %6d" db.f_num_good db.f_num_spam; flush stdout end; iter_message (record_words db is_spam) (parse_message msg); if is_spam then db.f_num_spam <- db.f_num_spam + 1 else db.f_num_good <- db.f_num_good + 1 (* Test analysis on a message *) let test_message db low high f txt = let msg = parse_message txt in let r = rank_message db msg in if r.spam_prob >= low && r.spam_prob <= high then begin printf "--------------------------------------------------\n"; printf "From: %s\n" (header "from:" msg); printf "Subject: %s\n" (header "subject:" msg); printf "Score: %.2f -- %d\n" r.spam_prob r.num_meaningful; printf "Details: %s\n" r.explanation; if !Config.summarize_attachments then begin let att = Attachments.summarize msg in if att <> "" then printf "Attachments: %s\n" att end; if !Config.summarize_referenced then begin let refh = Refhosts.summarize () in if refh <> "" then printf "Referenced hosts: %s\n" refh end; printf "File: %s\n" f; end (* Statistics *) type message_class = Msg_good | Msg_unknown | Msg_spam let stat_message db txt = let msg = parse_message txt in let r = rank_message db msg in if r.spam_prob <= 0.2 && r.num_meaningful >= 5 then Msg_good else if r.spam_prob >= 0.8 && r.num_meaningful >= 5 then Msg_spam else Msg_unknown (* Word splitting *) let wordsplit_message db txt = Format.open_hovbox 0; Mail.iter_message (fun txt -> Wordsplit.iter (fun word -> Format.print_string word; Format.print_space()) (in_short db) txt) (parse_message txt); Format.close_box(); Format.print_newline() spamoracle-release16/processing.mli000066400000000000000000000024041356223044300176520ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Processing messages *) val mark_message : Database.short -> string -> unit val record_words : Database.full -> bool -> string -> unit val add_message : Database.full -> bool -> bool -> string -> unit val test_message : Database.short -> float -> float -> string -> string -> unit type message_class = Msg_good | Msg_unknown | Msg_spam val stat_message : Database.short -> string -> message_class val wordsplit_message : Database.short -> string -> unit spamoracle-release16/rankmsg.ml000066400000000000000000000112401356223044300167650ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Message ranking *) open Mail open Database let word_count_in w res = let count = ref 0 in for i = 0 to Array.length res - 1 do if w = fst res.(i) then incr count done; !count let add_word w p res = let i = ref 0 in while !i < Array.length res && abs_float (p -. 0.5) <= abs_float(snd res.(!i) -. 0.5) do incr i done; if !i < Array.length res then begin for j = Array.length res - 1 downto !i + 1 do res.(j) <- res.(j - 1) done; res.(!i) <- (w, p) end let normalize (p : float) low high = if p > high then high else if p < low then low else p let cap (p : float) = if p > 1.0 then 1.0 else p let word_proba g b num_g num_b = let g = 2 * g in (* Graham's magic factor to bias in favor of ham *) let pgood = cap (float g /. float num_g) and pbad = cap (float b /. float num_b) in let p = pbad /. (pgood +. pbad) in if !Config.robinson_s = 0.0 then normalize p !Config.low_freq_limit !Config.high_freq_limit else begin (* Robinson's adjustement *) let n = float (g + b) in let p = (!Config.robinson_s *. !Config.robinson_x +. n *. p) /. (!Config.robinson_s +. n) in (* Result normalization *) normalize p !Config.low_freq_limit !Config.high_freq_limit end let process_word (db, res) w = try let (g, b) = Hashtbl.find db.s_freq w in if word_count_in w res < !Config.max_repetitions then begin let p = word_proba g b db.s_num_good db.s_num_spam in add_word w p res end with Not_found -> () let process_words ((db, res) as ctx) txt = Wordsplit.iter (process_word ctx) (in_short db) txt; if !Config.summarize_referenced then Refhosts.add txt let process_msg ctx m = iter_message (process_words ctx) m (* This is Graham's original approach *) let spaminess_score_graham res = let p = ref 1.0 and pexp = ref 0 and cp = ref 1.0 and cpexp = ref 0 in for i = 0 to Array.length res - 1 do let (_, x) = res.(i) in p := !p *. x; if !p <= 1e-100 then begin let (m, e) = frexp !p in p := m; pexp := !pexp + e end; cp := !cp *. (1.0 -. x); if !cp <= 1e-100 then begin let (m, e) = frexp !cp in cp := m; cpexp := !cpexp + e end done; if !cpexp < !pexp then cp := ldexp !cp (!cpexp - !pexp) else if !cpexp > !pexp then p := ldexp !p (!pexp - !cpexp); !p /. (!p +. !cp) (* This is Robinson's chi-square stuff *) let chi2_inverse m n = (* chi2 inverse of 2m with 2n degrees *) let t = ref (exp (-. m)) in let s = ref !t in for i = 1 to n do t := !t *. m /. float i; s := !s +. !t done; if !s >= 1.0 then 1.0 else !s let log2 = log 2.0 let chi2_hypothesis ps = (* Compute -2 * ln (product ps). Be careful with underflows. *) let p = ref 1.0 and pexp = ref 0 in for i = 0 to Array.length ps - 1 do p := !p *. ps.(i); if !p <= 1e-100 then begin let (x, e) = frexp !p in p := x; pexp := !pexp + e end done; chi2_inverse (-. (log !p +. log2 *. float !pexp)) (Array.length ps) let spaminess_score_robinson res = let probs = Array.map snd res in let cprobs = Array.map (fun x -> 1.0 -. x) probs in 0.5 *. (1.0 +. chi2_hypothesis probs -. chi2_hypothesis cprobs) type rank = { spam_prob: float; num_meaningful: int; explanation: string } let rank_message db msg = Refhosts.reset(); let res = Array.make !Config.num_words_retained ("", 0.5) in process_msg (db, res) msg; let p = if !Config.use_chi_square then spaminess_score_robinson res else spaminess_score_graham res in let meaningful = ref 0 in while !meaningful < Array.length res && fst res.(!meaningful) <> "" do incr meaningful done; let summary = Buffer.create 200 in for i = 0 to !meaningful - 1 do let (w, p) = res.(i) in Printf.bprintf summary "%s:%02d " w (truncate (p *. 100.0)) done; { spam_prob = p; num_meaningful = !meaningful; explanation = Buffer.contents summary } spamoracle-release16/rankmsg.mli000066400000000000000000000020241356223044300171360ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (** Message ranking *) type rank = { spam_prob: float; num_meaningful: int; explanation: string } val rank_message: Database.short -> Mail.message -> rank val word_proba: int -> int -> int -> int -> float spamoracle-release16/refhosts.ml000066400000000000000000000027701356223044300171700ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Extract hostnames or IP addresses referenced from http URLs in message bodies. *) let re_url = Str.regexp_case_fold "http://\\([^@]+@\\)?\\([a-z0-9-]+\\(\\.[a-z0-9-]+\\)+\\)" module StringSet = Set.Make(String) let hosts = ref StringSet.empty let reset() = hosts := StringSet.empty let rec add_urls txt pos = let matched = try ignore (Str.search_forward re_url txt pos); true with Not_found -> false in if matched then begin hosts := StringSet.add (Str.matched_group 2 txt) !hosts; add_urls txt (Str.match_end()) end let add txt = add_urls txt 0 let summarize () = let lst = StringSet.elements !hosts in hosts := StringSet.empty; String.concat " " lst spamoracle-release16/refhosts.mli000066400000000000000000000017421356223044300173370ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id$ *) (* Extract hostnames or IP addresses referenced from http URLs in message bodies. *) val reset: unit -> unit val add: string -> unit val summarize: unit -> string spamoracle-release16/spamoracle.1000066400000000000000000000407511356223044300172120ustar00rootroot00000000000000.TH SPAMORACLE 1 .SH NAME spamoracle \- a spam classification tool .SH SYNOPSIS .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B mark [ .I mailbox ... ] .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B add .RB [ \-v ] .B \-spam .I spambox ... .B \-good .I goodbox ... .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B test .RB [ \-min .IR prob ] .RB [ \-max .IR prob ] [ .I mailbox ... ] .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B stat [ .I mailbox ... ] .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B list .I regexp ... .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B backup .B > .I backupfile .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B restore .B < .I backupfile .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B upgrade .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B words [ .I mailbox ... ] .SH DESCRIPTION SpamOracle is a tool to help detect and filter away "spam" (unsolicited commercial e-mail). It proceeds by statistical analysis of the words that appear in the e-mail, comparing the frequencies of words with those found in a user-provided corpus of known spam and known legitimate e-mail. The classification algorithm is based on Bayes' formula, and is described in Paul Graham's paper, .IR "A plan for spam" , .BR http://www.paulgraham.com/spam.html . This program is designed to work in conjunction with .BR procmail (1). The result of the analysis is output as an additional message header .B "X-Spam:" followed by .BR "yes" , .B "no" or .BR "unknown" , plus additional details. A procmail rule can then test this .B "X-Spam:" header and deliver the e-mail to the appropriate mailbox. In addition, SpamOracle also analyses MIME attachments, extracting relevant information such as MIME type, character encoding and attached file name, and summarizing them in an additional .B "X-Attachments:" header. This allows procmail to easily reject e-mails containing suspicious attachments, e.g. Windows executables which often indicate a virus. .SH "REQUIREMENTS AND LIMITATIONS" To use SpamOracle, your mail must be delivered to a Unix machine on which you have a shell account. This machine must have .BR procmail (1) (see .BR http://www.procmail.org/ ) installed. Your .B "~/.forward" file must be set up to run all incoming e-mail through .BR procmail (1). If your mail server supports the POP or IMAP protocols, you can also use .BR fetchmail (1) to fetch your mail from the server and have it delivered to your local machine. To provide the corpus of messages from which SpamOracle "learns", an archive of about 1000 of your e-mails is needed. The archive must be manually or semi-automatically split into known spams and known good messages. Mis-classified messages in the corpus (e.g. spams mistakenly stored among the good messages) will decrease the efficiency of the classification. The archive must be in Unix mailbox format, or in "one message per file" format (a la MH). Other formats, such as Emacs' Babyl, are not supported. The notion of "word" used by SpamOracle is slanted towards Western European languages, i.e. the ISO Latin-1 and Latin-9 character sets. Preliminary support for JIS-encoded Japanese can be selected at compile-time. SpamOracle will not work well if you receive many legitimate e-mails written in other character sets, such as Chinese or Korean sets. .SH INITIALIZATION To build the database of word frequencies from the corpus, do: .RS .ft 3 .nf .sp rm ~/.spamoracle.db spamoracle add -v -good goodmails -spam spammails .ft .LP .RE .fi By default, the database is stored in the file .B ".spamoracle.db" in your home directory. This can be overriden with the .B -f option: .BI "spamoracle -f mydatabase add" " ..." The .B \-v option prints progress information during the processing of the corpus. This assumes that the good, non-spam messages from the corpus are stored in the file .BR goodmails , and the known spam messages in the file .BR spammails . You can also fetch corpus messages from several files, and/or process them via several invocations of SpamOracle: .RS .ft 3 .nf .sp spamoracle add -good goodmails1 ... goodmailsN spamoracle add -spam spammails1 ... spammailsP .ft .LP .RE .fi .SH "TESTING THE DATABASE" To check that the database was built correctly, and familiarize yourself with the statistical analysis performed by SpamOracle, invoke the "test" mode on the mailboxes that you just used for building the corpus: .RS .ft 3 .nf .sp spamoracle test goodmails | more spamoracle test spammails | more .ft .LP .RE .fi For each message in the given mailboxes, you'll see a summary like this: .RS .ft 3 .nf .sp From: bbo Subject: Check This Out Score: 1.00 -- 15 Details: refid:98 $$$$:98 surfing:98 asp:95 click:93 cable:92 instantly:90 https:88 internet:87 www:86 U4:85 isn't:14 month:81 com:75 surf:75 Attachments: cset="GB2312" type="application/octet-stream" name="Guangwen4.zip" File: inbox/314 .ft .LP .RE .fi The first two lines are just the .B From: and .B Subject: fields of the original message. The .B "Score:" line summarizes the result of the analysis. The first number (between 0.0 and 1.0) is the probability that the message is actually spam --- or, equivalently, the degree of similarity of the message with the spam messages in the corpus. The second number (an integer between 0 and 15) is the number of "interesting" words found in the message. "Interesting" words are those that occur at least 5 times in the corpus. In the example, we have 15 interesting words (the maximum) and a score of 1.00, indicating a spam with high certainty. The .B "Details:" line provides an explanation of the score. It lists the 15 most interesting words found in the message, that is, the 15 interesting words whose probability of denoting a spam is farthest away from the neutral 0.5. Each word is given with its individual score, written as a percentage (between 01 and 99) rather than as a probability so as to save space. Here, we see a number of very "spammish" words such as .B "$$$$" or .BR "click" , with probability 0.98 and 0.93 respectively, and a few "innocent" words such as .B "isn't" (probability 0.14). The .B "U4" word with probability 0.85 is actually a pseudo-word representing a 4-letter word all in uppercase -- something spammers are fond of. The .B "Attachments:" line summarizes some information about MIME attachments for this message. Here, we have one attachment of type .BR "application/octect-stream" , file name .BR "Guangwen4.zip" , and character set .B "GB2312" (an encoding for Chinese). The .B "File:" line shows the file that is being tested. Normally, when running .BR "spamoracle test goodmails" , most messages should come out with low score (0.2 or less), and when running .BR "spamoracle test spammails" , most messages should come out with a high score (0.8 or more). If not, your corpus isn't very good, or not well classified into spam and non-spam. To quickly see the outliers, you can reduce the interval of scores for which message summaries are displayed, as follows: .RS .ft 3 .nf .sp spamoracle test -min 0.2 goodmails | more # Shows only good mails with score >= 0.2 spamoracle test -max 0.8 spammails | more # Shows only spam mails with score <= 0.8 .ft .LP .RE .fi Now, for a more challenging test, take a mailbox that contains unfiltered e-mails, i.e. a mixture of spam and legitimate e-mails, and run it through SpamOracle: .RS .ft 3 .nf .sp spamoracle test mymailbox | less .ft .LP .RE .fi Marvel at how well the oracle recognizes spam from the rest! If the result isn't that marvelous to you, keep in mind that certain spams are just too short to be recognized (not enough significant words). Also, perhaps your corpus was too small, or not well categorized... .SH "MARKING AND FILTERING INCOMING E-MAIL" Once the database is built, you're ready to run incoming e-mails through SpamOracle. The command .B spamoracle mark reads one e-mail from standard input, and copies it to standard output, with two headers inserted: .B "X-Spam:" and .BR "X-Attachments:" . The .B "X-Spam:" header has one the following formats: .B " X-Spam: yes; .IB score ; .I details or .B " X-Spam: no; .IB score ; .I details or .B " X-Spam: unknown; .IB score ; .I details The .I score and .I details are as described for .BR "spamoracle test" . The .BR "yes" / "no" / "unknown" tag synthesizes the results of the analysis: .B "yes" means that the score is >= 0.8 and at least 5 interesting words were found; .B "no" means that the score is <= 0.2 and at least 5 interesting words were found; .B "unknown" is returned otherwise. The .B "unknown" case generally occurs for very short messages, where not enough interesting words were found. The .B "X-Attachments:" header contains the same information as the .B "Attachments:" output of .BR "spamoracle test" , that is, a summary of the message attachments. To process automatically your incoming e-mail through SpamOracle and act upon the results of the analysis, just insert the following "recipes" in the file ~/.procmailrc: .RS .ft 3 .nf .sp :0fw | /usr/local/bin/spamoracle mark :0 * ^X-Spam: yes; spambox .ft .LP .RE .fi What these cryptic commands mean is: - Run every mail through the .B "spamoracle mark" command. (If spamoracle wasn't installed in /usr/local/bin, adjust the path as necessary.) This adds two headers to the message: .B "X-Spam:" and .BR "X-Attachments:" , describing the results of the spam analysis and the attachment analysis. - If we have an .B "X-Spam: yes" header, deliver the message to the file .B "spambox" rather than to your regular mailbox. Presumably, you'll read .B "spambox" once in a while, but less often than your regular mailbox. Daring users can put .B "/dev/null" instead of .B "spambox" to just throw away the message, but please don't do that until you've used SpamOracle for a while and are happy with the results. SpamOracle's false positive rate (i.e. legitimate mails classified as spam) is low (0.1%) but not null. So, better save the presumed spams somewhere, and scan them quickly from time to time. If you'd like to enjoy a bit of attachment-based filtering, here are some procmail rules for that: .RS .ft 3 .nf .sp :0 * ^X-Attachments:.*name=".*\\.(pif|scr|exe|bat|com)" spambox :0 * ^X-Attachments:.*type="audio/(x-wav|x-midi) spambox :0 * ^(Content-type:.*|X-Attachments:.*cset="|^Subject:.*=\\?)(ks_c|gb2312|iso-2|euc-|big5|windows-1251) spambox .ft .LP .RE .fi The first rule treats as spam every mail that has a Windows executable as attachment. These mails are typically sent by viruses. The second rule does the same with attachments of type x-wav or x-midi. I never normally receive music by e-mail, however some popular e-mail viruses seem fond of these attachment types. The third rule treats as spam every mail that uses character encodings corresponding to Korean, Chinese, Japanese, and Cyrillic. .SH "UPDATING THE DATABASE" At any time, you can add more known spams or known legitimate messages to the database by using the .B "spamoracle add" command. For instance, if you find a spam message that was not classified as such, run it through .BR "spamoracle add -spam" , so that SpamOracle can learn from its mistake. (Without additional arguments, this command will read a single message from standard input and record it as spam.) Under .BR mutt (1) for instance, just highlight the spam message and type .RS .ft 3 .nf .sp |spamoracle add -spam .ft .LP .RE .fi Similarly, if you find a legitimate message while checking your spam box, run it through .BR "spamoracle add -good" . Another option is to collect more known spams or more known good messages into mailbox files, and once in a while do .B spamoracle add -good new_good_mails or .BR "spamoracle add -spam new_spam_mails" . .SH "QUERYING THE DATABASE" For your edification and entertainment, the contents of the database can be queried by regular expressions. The .BI "spamoracle list " regexp command lists all words in the database that match .I regexp (an Emacs-style regular expression), along with their number of occurrences in spam mail and in good mail. For instance: .RS .ft 3 .nf .sp spamoracle list '.*' # show all words -- big list! spamoracle list 'sex.*' spamoracle list 'linux.*' .ft .LP .RE .fi .SH "DATABASE BACKUPS AND UPGRADES" The database used by SpamOracle is stored in a compact, binary format that is not human-readable. Moreover, this format is subject to change in later versions of SpamOracle. To facilitate backups and upgrades, the database contents can also be manipulated in a portable, text format. The .B "spamoracle backup" command dumps the contents of the database to standard output, in a textual, portable format. The .B "spamoracle restore" command reads such a dump from standard input and rebuilds the database with this data. The recommended procedure for upgrading to a newer version of SpamOracle is: .RS .ft 3 .nf .sp # Before the upgrade: spamoracle backup > backupfile # Upgrade SpamOracle # Restore the database spamoracle restore < backupfile .ft .LP .RE .fi Alternatively, the .B "spamoracle upgrade" command converts a database created with an earlier version of SpamOracle to the native database format of the current version of SpamOracle. .SH "CONFIGURING FILTERING PARAMETERS" Many of the parameters that govern message classification can be configured via a configuration file. By default, the configuration is read from the file .B ".spamoracle.conf" in the user's home directory. A different configuration file can be specified on the command line using the .B -config option: .BI "spamoracle -config myconfigfile" " ..." The list of configurable parameters and the format of the configuration file are described in .BR spamoracle.conf (5). All parameters have reasonable defaults, but you can try to improve the quality of classification further by tweaking them. To determine the impact of your changes, use either the .B test or .B stat commands to .BR spamoracle . The .B spamoracle stat command prints a one-line summary of how many spam, non-spam, and unknown messages were found in the mailboxes given as arguments. .SH "TECHNICAL DETAILS" SpamOracle's notion of "word" is any run of 3 to 12 of the following characters: letters, single quotes, and dashes (-). If support for non-English european languages was compiled in, word characters also include the relevant accented letters for the languages in question. All words are mapped to lowercase, and accented letters are mapped to the corresponding non-accented letters. A run of 3 to 12 of the following characters also constitutes a word: digits, dots, commas, and dollar, Euro and percent signs. In addition, a run of three or more uppercase letters generates a pseudo-word .BI U n where .I n is the length of the run. Similarly, a run of three or more non-ASCII characters (code >= 128) generates a pseudo-word .BI W n where .I n is the length of the run. For instance, the following text: .RS .ft 3 .nf .sp SUMMER in English is written "été" in French ¹²³ .ft .LP .RE .fi is processed into the following words, assuming French support was selected at compile-time: .RS .ft 3 .nf .sp U5 summer english written ete french W3 .ft .LP .RE .fi and if French support was not selected: .RS .ft 3 .nf .sp U5 summer english written french W3 .ft .LP .RE .fi To see the words that are extracted from a message, issue the .B spamoracle words command. It reads either a single message from standard input, or all messages from the mailbox files given as arguments, decomposes the messages into words and prints the words. .SH RANDOM NOTES The database file can be compressed with .BR gzip (1) to save disk space, at the expense of slower .B spamoracle operations. If the database file specified with the .B -f option has the extension .BR .gz , .B spamoracle will automatically uncompress it on start-up, and re-compress it after updates. If your mail is stored in MH format, you may run into "command line too long" errors while trying to process a lot of small files with the .B spamoracle add command, e.g. when doing .br .B spamoracle add -good archives/*/* -spam spam/* .br Instead, do something like: .br .B find archives -type f -print | xargs spamoracle add -good .br .B find spam -type f -print | xargs spamoracle add -spam .SH AUTHOR Xavier Leroy .SH "SEE ALSO" .BR spamoracle.conf (5); .BR procmail (1); .BR fetchmail (1) .B http://spamoracle.forge.ocamlcore.org/ (SpamOracle distribution site) .B http://www.paulgraham.com/spam.html (Paul Graham's seminal paper) spamoracle-release16/spamoracle.conf.5000066400000000000000000000150741356223044300201420ustar00rootroot00000000000000.TH SPAMORACLE.CONF 5 .SH NAME spamoracle.conf \- SpamOracle configuration file format .SH DESCRIPTION The .B spamoracle.conf file is a configuration file governing the operation of the .BR spamoracle (1) e-mail classification tool. By default, the configuration file is searched in .IB $HOME /.spamoracle.conf but an alternate location can be specified using the .B -config flag to .BR spamoracle (1). .B Important note: most of the configuration parameters should not be modified lightly, as this may result in completely wrong e-mail classification. Familiarity with Graham's filtering algorithm, as described in the paper referenced at the end of this page, is recommended to fully understand the effect of the parameters. .SH SYNTAX The .B spamoracle.conf file is composed of lines of the form .I variable .B = .IR value . Lines starting with a # sign are treated as comments and ignored. Blank lines are ignored. Depending on the type of the variable (see the list of variables below), the .I value part takes one of the following forms: .TP .I string A sequence of characters. Blanks (spaces, tabs) at the beginning and the end of the string are ignored. Alternatively, the string can be enclosed in double quotes ("), in which case spaces are not trimmed. Inside quoted strings, blackslashes (\\) and double quotes (") must be escaped with a backslash, as in \\\\ or \\\" .TP .I boolean Either .BR on, .BR yes, .BR true, or .B 1 to activate the boolean option, or .BR off, .BR no, .BR false, or .B 0 to deactivate it. .TP .I integer A decimal integer .TP .I float A decimal floating-point number. .TP .I regexp A regular expression in .BR emacs (1) syntax. The repetition operators are .BR * , .BR + , and .BR ? . Alternation is written .B \e| and grouping is written .BR \e( ... \e) . Character classes are written between brackets .BR [ ... ] as usual. A single dot denotes any character except newline. Regular expressions are case-insensitive. .SH CONFIGURABLE PARAMETERS .TP .B database_file (type .IR string, default value .IB $HOME /.spamoracle.db ) .br The location of the file that contains the database of word frequencies used by .BR spamoracle (1). .TP .B html_retain_tags (type .IR boolean, default value .BR false ) .br In HTML-formatted e-mails and attachments, the names of HTML tags are normally not treated as words and are ignored for the word frequency calculations. If the .B html_retain_tags parameter is set to .BR true , HTML tags (such as .B img or .BR bold ) are treated as words and included in the computation of word frequencies. .TP .B html_tag_attributes (type .IR regexp , default value .br .BR a/href\e|img/src\e|img/alt\e|frame/src\e|font/face\e|font/color ) .br This regular expression matches pairs of HTML tags and HTML attributes written as .IB tag / attribute. When scanning HTML-formatted e-mails and attachments, attributes to HTML tags are normally ignored, unless the tag/attribute pair matches the regular expression .BR html_tag_attributes . If the tag/attribute pair matches this regexp, the value of the attribute (for instance, the URL for the .BR a / href attribute) is scanned for words. .TP .B mail_headers (type .IR regexp , default value .BR from:\e|subject: ) .br A regular expression determining which headers of an e-mail message are scanned for words. .TP .B alternative_favor_html (type .IR bool , default value .BR true ) .br Determine how multipart/alternative messages are treated. If this parameter is set, and one part of the alternative is of type text/html, this part is scanned and all other parts are ignored. In all other cases, all parts of the alternative are scanned. .TP .B spam_header (type .IR string , default value .BR X-Spam ) .br The name of the header that .B spamoracle mark adds to incoming e-mail messages, with the results of the spam/non-spam classification. .TP .B attachments_header (type .IR string , default value .BR X-Attachments ) .br The name of the header that .B spamoracle mark adds to incoming e-mail messages, with the one-line summary of attachment types, names and character sets. The generation of this header can be turned off with the .B summarize_attachment parameter. .TP .B summarize_attachment (type .IR boolean , default value .BR true ) .br If this parameter is set, .B spamoracle mark generates a one-line summary of the attachments of the incoming messages, and inserts this summary in the message headers. Setting this parameter to .B false disables the generation of this extra header. .TP .B num_meaningful_words (type .IR integer , default value .BR 15 ) .br Maximal number of "meaningful" words that are retained for computing the spam probability. During mail analysis, .B spamoracle extracts all words of the message, and retains those whose spam frequency (frequency of occurrence in spam messages) is closest to 1 or to 0. At most .B num_meaningful_words such "meaningful" words are retained. .TP .B max_repetitions (type .IR integer , default value .BR 2 ) .br Maximum number of times a given word can occur in the set of "meaningful" words retained for computing the spam probability. The default value of 2 means that at most 2 occurrences of the same word will be retained. .TP .B low_freq_limit (type .IR float , default value .BR 0.01 ) .TP .B high_freq_limit (type .IR float , default value .BR 0.99 ) .br The spam frequency of a word is computed as the number of occurrences in spam divided by number of occurrences in all messages. This ratio is then clipped to the interval [ .BR low_freq_limit , .B high_freq_limit ], so that words that are extremely rare or extremely common in spam do not bias the probability computation too much. The default values of 0.01 and 0.99 are adequate for a corpus of a few thousand e-mails. For larger corpora (e.g. 10000 e-mails), the values 0.001 and 0.999 may give better results. .TP .B min_meaningful_words (type .IR integer , default value .BR 5 ) .br Minimum number of "meaningful" words below which .B spamoracle mark refuses to classify the e-mail and outputs "unknown" status. This happens with very short e-mails, or e-mails that consist exclusively of links and pictures. .TP .B good_mail_prob (type .IR float , default value .BR 0.2 ) .br Spam probability below which the e-mail is classified as non-spam. .TP .B spam_mail_prob (type .IR float , default value .BR 0.8 ) .br Spam probability above which the e-mail is classified as spam. Messages whose probability falls between .B good_mail_prob and .B spam_mail_prob are classified as "unknown". .SH AUTHOR Xavier Leroy .SH "SEE ALSO" .BR spamoracle (1) .B http://www.paulgraham.com/spam.html (Paul Graham's seminal paper) spamoracle-release16/wordsplit.mli000066400000000000000000000004521356223044300175260ustar00rootroot00000000000000(** Decompose a string into words. *) val iter: (string -> unit) -> (string -> bool) -> string -> unit (** [iter fn db txt] applies [fn] to each word in [txt]. [db] is a predicate that returns [true] for known words. It is used to recognize stretched-out words, e.g. [H.e.ll.o]. *) spamoracle-release16/wordsplit.mlp000066400000000000000000000123001356223044300175300ustar00rootroot00000000000000(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2. *) (* *) (***********************************************************************) (* $Id$ *) (* Decompose a string into words. *) { (* Maximal length of a word *) let maxlen = 12 (* Map uppercase to lowercase. Remove ISO Latin 1 accents. *) let tbl = "\ \000\001\002\003\004\005\006\007\008\t\n\011\012\013\014\015\ \016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031 \ !\"#$%&'()*+,-./\ 0123456789:;<=>?\ @abcdefghijklmno\ pqrstuvwxyz[\\]^_\ `abcdefghijklmno\ pqrstuvwxyz{|}~\127\ \128\129\130\131\132\133\134\135\136\137\138\139\140\141\142\143\ \144\145\146\147\148\149\150\151\152\153\154\155\156\157\158\159\ \160¡¢£¤¥¦§¨©ª«¬­®¯\ °±²³´µ¶·¸¹º»¼½¾¿\ aaaaaaeceeeeiiii\ Ðnooooo×ouuuuypß\ aaaaaaeceeeeiiii\ ðnooooo÷ouuuuypy" let normalize s = String.map (fun c -> tbl.[Char.code c]) s (* Check if a string is all uppercase characters *) let all_uppercase s = try for i = 0 to String.length s - 1 do let c = s.[i] in if (c >= 'A' && c <= 'Z') || (c >= '\192' && c <= '\214') || (c >= '\216' && c <= '\222') then () else raise Exit done; true with Exit -> false (* Take first n chars of string *) let adjust s n = if String.length s <= n then s else String.sub s 0 n (* Reassembly of stretched-out words *) let reassembly_buffer = Bytes.create maxlen let previous_words = ref ([] : string list) let reassemble db action word = let rec reass idx = function | [] -> [] | hd :: tl -> let l = String.length hd in let idx = idx - l in if idx < 0 then [] else begin Bytes.blit_string hd 0 reassembly_buffer idx l; let w = Bytes.sub_string reassembly_buffer idx (maxlen - idx) in if db w then action w; hd :: reass idx tl end in let l = String.length word in if l >= maxlen then begin previous_words := [] end else begin let idx = maxlen - l in Bytes.blit_string word 0 reassembly_buffer idx l; previous_words := word :: reass idx !previous_words end } let letter = [ 'a'-'z' 'A'-'Z' #ifdef FRENCH 'À' 'Â' 'Ç' 'È' 'É' 'Ê' 'Ë' 'Î' 'Ï' 'Ô' 'Ù' 'Û' 'Ü' 'à' 'â' 'ç' 'è' 'é' 'ê' 'ë' 'î' 'ï' 'ô' 'ù' 'û' 'ü' 'ÿ' #endif #ifdef SPANISH 'Á' 'É' 'Í' 'Ó' 'Ú' 'Ü' 'Ñ' 'á' 'é' 'í' 'ó' 'ú' 'ü' 'ñ' #endif #ifdef ITALIAN 'É' 'Í' 'Ú' 'À' 'È' 'Ò' 'é' 'í' 'ú' 'à' 'è' 'ò' #endif #ifdef GERMAN 'Ä' 'Ö' 'Ü' 'ä' 'ö' 'ü' 'ß' #endif #ifdef PORTUGUESE 'À' 'Á' 'Â' 'Ã' 'Ç' 'É' 'Ê' 'Í' 'Ó' 'Ô' 'Õ' 'Ú' 'Ü' 'à' 'á' 'â' 'ã' 'ç' 'é' 'ê' 'í' 'ó' 'ô' 'õ' 'ú' 'ü' #endif ] let word_constituent = letter | '\'' let numeric = ['0'-'9' '.' ',' '$' '%' '\164' (* Euro *)] let weird_character = ['\127' - '\255'] #ifdef JAPANESE let jis_snd = ['\032'-'\127'] let kanji = ['\048'-'\116'] jis_snd let katakana = (['\037'] jis_snd | "\033\060") let jis_char = ['\032'-'\116'] jis_snd let jis_in = "\027$" '(' ? ['@' 'B'] let jis_out = "\027(" ['B' 'J' 'H'] #endif rule split = parse | word_constituent + { fun db action -> let s = Lexing.lexeme lexbuf in let l = String.length s in if l >= 3 && all_uppercase s then action ("U" ^ string_of_int l); let s = normalize s in if !Config.reassemble_words then reassemble db action s; if l >= 3 && l <= maxlen then action s; split lexbuf db action } | numeric numeric numeric numeric * { fun db action -> let s = Lexing.lexeme lexbuf in if String.length s <= 8 then action (normalize s); split lexbuf db action } #ifdef JAPANESE | jis_in { split_jis lexbuf } #endif | weird_character weird_character weird_character weird_character * { fun db action -> let s = Lexing.lexeme lexbuf in action ("W" ^ string_of_int (String.length s)); split lexbuf db action } | eof { fun db action -> () } | _ { split lexbuf } #ifdef JAPANESE and split_jis = parse kanji kanji * { fun db action -> let s = Lexing.lexeme lexbuf in action ("\027$B" ^ adjust s 8 ^ "\027(B"); split_jis lexbuf db action } | katakana katakana katakana katakana * { fun db action -> let s = Lexing.lexeme lexbuf in action ("\027$B" ^ adjust s 12 ^ "\027(B"); split_jis lexbuf db action } | jis_char { split_jis lexbuf } | eof { fun db action -> () } | jis_out { split lexbuf } | _ { split lexbuf } #endif { let iter action db txt = previous_words := []; split (Lexing.from_string txt) db action }