spamoracle-1.4/0040755004340400511750000000000007737253571013130 5ustar xleroyformelspamoracle-1.4/.depend0100644004340400511750000000240707723412005014350 0ustar xleroyformelattachments.cmo: mail.cmi attachments.cmi attachments.cmx: mail.cmx attachments.cmi configfile.cmo: configfile.cmi configfile.cmx: configfile.cmi config.cmo: configfile.cmi config.cmi config.cmx: configfile.cmx config.cmi database.cmo: database.cmi database.cmx: database.cmi htmlscan.cmo: config.cmi htmlscan.cmi htmlscan.cmx: config.cmx htmlscan.cmi mail.cmo: config.cmi htmlscan.cmi mail.cmi mail.cmx: config.cmx htmlscan.cmx mail.cmi main.cmo: config.cmi configfile.cmi database.cmi mbox.cmi processing.cmi main.cmx: config.cmx configfile.cmx database.cmx mbox.cmx processing.cmx mbox.cmo: mbox.cmi mbox.cmx: mbox.cmi processing.cmo: attachments.cmi config.cmi database.cmi mail.cmi rankmsg.cmi \ wordsplit.cmi processing.cmi processing.cmx: attachments.cmx config.cmx database.cmx mail.cmx rankmsg.cmx \ wordsplit.cmx processing.cmi rankmsg.cmo: config.cmi database.cmi mail.cmi wordsplit.cmi rankmsg.cmi rankmsg.cmx: config.cmx database.cmx mail.cmx wordsplit.cmx rankmsg.cmi testsplit.cmo: mail.cmi mbox.cmi wordsplit.cmi testsplit.cmx: mail.cmx mbox.cmx wordsplit.cmx wordsplit.cmo: wordsplit.cmi wordsplit.cmx: wordsplit.cmi attachments.cmi: mail.cmi config.cmi: configfile.cmi processing.cmi: database.cmi rankmsg.cmi: database.cmi mail.cmi spamoracle-1.4/Changes0100644004340400511750000000177507724406764014432 0ustar xleroyformelRelease 1.4: - More lenient rule for ignoring HTML comments. - Recognition of S P A C E D O-U-T words. - Fixed a bug causing certain text-only e-mails to be ignored. - Improved the "spamoracle test" command. - Added "spamoracle words" command for troubleshooting the word-splitting algorithm. Release 1.3: - Better scanning of words in HTML documents. - Better handling of multipart documents; now scan both text and HTML parts of a multipart/alternative. - Support for Portuguese and Japanese (JIS encoding) - Added configuration file (~/.spamoracle.conf) to allow twiddling with filtering parameters. - Fixed database corruption problem with concurrent adds. - Handles gzipped databases. - Added manual pages. Release 1.2: - When choosing part of a multipart/alternative, ignore pure text part if much smaller than HTML part. - Added commands "backup" and "restore" to facilitate future evolution towards other database formats. Release 1.1: - Fixed "usage" message. Release 1.0: - First public release. spamoracle-1.4/Makefile0100644004340400511750000000315407664107420014556 0ustar xleroyformel### Configuration section # The laguages you're interested in, besides English LANGUAGES=-DFRENCH #-DSPANISH -DITALIAN -DGERMAN -DPORTUGUESE -DJAPANESE # How to invoke the C preprocessor CPP=gcc -E -P $(LANGUAGES) - # Where to install the binary BINDIR=/usr/local/bin # Where to install the man pages MANDIR=/usr/local/man ### End of configuration section OCAMLC=ocamlc -g OCAMLLEX=ocamllex OCAMLDEP=ocamldep OCAMLOPT=ocamlopt BYTEOBJS=configfile.cmo config.cmo \ htmlscan.cmo mail.cmo database.cmo mbox.cmo wordsplit.cmo \ rankmsg.cmo attachments.cmo processing.cmo main.cmo BYTELIBS=unix.cma str.cma NATOBJS=$(BYTEOBJS:.cmo=.cmx) NATLIBS=$(BYTELIBS:.cma=.cmxa) all: spamoracle install: cp spamoracle $(BINDIR)/spamoracle cp spamoracle.1 $(MANDIR)/man1/spamoracle.1 cp spamoracle.conf.5 $(MANDIR)/man5/spamoracle.conf.5 spamoracle: $(NATOBJS) $(OCAMLOPT) -o spamoracle $(NATLIBS) $(NATOBJS) clean:: rm -f spamoracle spamoracle.byte: $(BYTEOBJS) $(OCAMLC) -o spamoracle.byte $(BYTELIBS) $(BYTEOBJS) clean:: rm -f spamoracle.byte wordsplit.mll: wordsplit.mlp $(CPP) < wordsplit.mlp > wordsplit.mll \ || { rm -f wordsplit.mll; exit 2; } clean:: rm -f wordsplit.mll wordsplit.ml: wordsplit.mll $(OCAMLLEX) wordsplit.mll clean:: rm -f wordsplit.ml beforedepend:: wordsplit.ml htmlscan.ml: htmlscan.mll $(OCAMLLEX) htmlscan.mll clean:: rm -f htmlscan.ml beforedepend:: htmlscan.ml clean:: rm -f *.cm[iox] *.o .SUFFIXES: .ml .mli .cmo .cmi .cmx .mli.cmi: $(OCAMLC) -c $< .ml.cmo: $(OCAMLC) -c $< .ml.cmx: $(OCAMLOPT) -c $< depend: beforedepend $(OCAMLDEP) *.ml *.mli > .depend include .depend spamoracle-1.4/README0100644004340400511750000003154607663635144014014 0ustar xleroyformel SpamOracle -- a spam classification tool Version 1.3 OVERVIEW: SpamOracle is a tool to help detect and filter away "spam" (unsolicited commercial e-mail). It proceeds by statistical analysis of the words that appear in the e-mail, comparing the frequencies of words with those found in a user-provided corpus of known spam and known legitimate e-mail. The classification algorithm is based on Bayes' formula, and is described in Paul Graham's paper, "A plan for spam", http://www.paulgraham.com/spam.html. This program is designed to work in conjunction with procmail. The result of the analysis is output as an additional message header "X-Spam:", followed by "yes", "no" or "unknown", plus additional details. A procmail rule can then test this "X-Spam:" header and deliver the e-mail to the appropriate mailbox. In addition, SpamOracle also also analyses MIME attachments, extracting relevant information such as MIME type, character encoding and attached file name, and summarizing them in an additional "X-Attachments:" header. This allows procmail to easily reject e-mails containing suspicious attachments, e.g. Windows executables which often indicate a virus. LICENSE: This program is distributed under the terms of the GPL (GNU Public License) version 2, available from http://www.gnu.org/licenses/gpl.txt. REQUIREMENTS AND LIMITATIONS: - To compile: Objective Caml, http://caml.inria.fr/ - To use: . Your mail must be delivered to a Unix machine. You must have a shell account on this machine. This machine must have procmail (http://www.procmail.org/) installed. Your ~/.forward file must be set up to run all incoming e-mail through procmail. . To provide the corpus of messages from which SpamOracle "learns", an archive of at least 1000 of your e-mails is needed. The archive must be manually or semi-automatically split into known spams and known good messages. Mis-classified messages in the corpus (e.g. spams mistakenly stored among the good messages) will decrease the efficiency of the classification. The archive must be in Unix mailbox format, or in "one message per file" format (a la MH). Other formats, such as Emacs' Babyl, are not supported. . The notion of "word" used by SpamOracle is slanted towards Western European languages, i.e. the ISO Latin-1 and Latin-9 character sets. Preliminary support for JIS-encoded Japanese is provided. Still, SpamOracle will not work well if you receive many legitimate e-mails written in other character sets, such as Chinese or Korean. INSTALLATION: Edit the Makefile and change the definitions of the following variables at the top of the file: LANGUAGES the languages you're interested in besides English CPP how to invoke the C preprocessor BINDIR where to install the executable Do "make" in this directory. Become superuser if necessary and do "make install". INITIALIZATION: To build the database of word frequencies from the corpus, do: rm ~/.spamoracle.db spamoracle add -v -good goodmails -spam spammails (By default, the database is stored in the file ".spamoracle.db" in your home directory. This can be overriden with the -f option: spamoracle -f mydatabase add ... ) This assumes that the good, non-spam messages from the corpus are stored in the file "goodmails", and the known spam messages in the file "spammails". You can also fetch corpus messages from several files: spamoracle add -v -good goodmails1 ... goodmailsN \ -spam spammails1 ... spammailsP To check that the database was built correctly, and familiarize yourself with the statistical analysis performed by SpamOracle, invoke the "test" mode on the mailboxes that you just used for building the corpus: spamoracle test goodmails | more spamoracle test spammails | more For each message in the given mailboxes, you'll see a summary like this: From: bbo Subject: Check This Out Score: 1.00 -- 15 Details: refid:98 $$$$:98 surfing:98 asp:95 click:93 cable:92 instantly:90 https:88 internet:87 www:86 U4:85 isn't:14 month:81 com:75 surf:75 Attachments: cset="GB2312" type="application/octet-stream" name="Guangwen4.zip" The first two lines are just the From: and Subject: fields of the original message. The "Score:" line summarizes the result of the analysis. The first number (between 0.0 and 1.0) is the probability that the message is actually spam --- or, equivalently, the degree of similarity of the message with the spam messages in the corpus. The second number (an integer between 0 and 15) is the number of "interesting" words found in the message. "Interesting" words are those that occur at least 5 times in the corpus. In the example, we have 15 interesting words (the maximum) and a score of 1.00, indicating a spam with high certainty. The "Details:" line provides an explanation of the score. It lists the 15 most interesting words found in the message, that is, the 15 interesting words whose probability of denoting a spam is farthest away from the neutral 0.5. Each word is given with its individual score, written as a percentage (between 01 and 99) rather than as a probability so as to save space. Here, we see a number of very "spammish" words such as "$$$$" or "click", with probability 0.98 and 0.93 respectively, and a few "innocent" words such as "isn't" (probability 0.14). The "U4" word with probability 0.85 is actually a pseudo-word representing a 4-letter word all in uppercase -- something spammers are fond of. The "Attachments:" line summarizes some information about MIME attachments for this message. Here, we have one attachment of type "application/octect-stream", file name "Guangwen4.zip", and character set "GB2312". The latter is an encoding for Chinese and a sure sign that this is a Chinese spam (assuming that, like me, you can't read Chinese). Normally, when running "spamoracle test goodmails", most messages should come out with low score (0.2 or less), and when running "spamoracle test spammails", most messages should come out with a high score (0.8 or more). If not, your corpus isn't very good, or not well classified into spam and non-spam. To quickly see the outliers, you can reduce the interval of scores for which message summaries are displayed, as follows: spamoracle test -min 0.2 goodmails | more # Shows only good mails with score >= 0.2 spamoracle test -max 0.8 spammails | more # Shows only spam mails with score <= 0.8 Now, for a more challenging test, take a mailbox that contains unfiltered e-mails, i.e. a mixture of spam and legitimate e-mails, and run it through "spamoracle": spamoracle test mymailbox | less Marvel at how well the oracle recognizes spam from the rest! If the result isn't that marvelous to you, keep in mind that certain spams are just too short to be recognized (not enough significant words). Also, perhaps your corpus was too small, or not well categorized... USING SPAMORACLE AND PROCMAIL TO FILTER YOUR INCOMING E-MAIL: Once the database is built, you're ready to run incoming e-mails through SpamOracle. The command "spamoracle mark" reads one e-mail from standard input, and copies it to standard output, with two headers inserted: "X-Spam:" and "X-Attachments:". The X-Spam: header is as follows: X-Spam: yes; ;
or X-Spam: no; ;
or X-Spam: unknown; ;
The and
are as described for "spamoracle test". The "yes" / "no" / "unknown" synthesizes the results of the analysis: "yes": score >= 0.8 and at least 5 interesting words found "no": score <= 0.2 and at least 5 interesting words found "unknown": otherwise The "unknown" case generally occurs for very short messages, where not enough interesting words were found. The "X-Attachments:" header contains the same information as the "Attachments:" output of "spamoracle test", that is, a summary of the message attachments. To process automatically your incoming e-mail through "spamoracle" and act upon the results of the analysis, just insert the following "recipes" in the file ~/.procmailrc: :0fw | /usr/local/bin/spamoracle mark :0 * ^X-Spam: yes; spambox What these cryptic commands mean is: - Run every mail through the "spamoracle mark" command. (If spamoracle wasn't installed in /usr/local/bin, adjust the path as necessary.) This adds two headers to the message: "X-Spam:" and "X-Attachments:", describing the results of the spam analysis and the attachment analysis. - If we have an "X-Spam: yes" header, deliver the message to the file "spambox" rather than to your regular mailbox. Presumably, you'll read "spambox" once in a while, but less often than your regular mailbox. Daring users can put "/dev/null" instead of "spambox" to just throw away the message, but please don't do that until you've used SpamOracle for a while and are happy with the results. SpamOracle's false positive rate (i.e. legitimate mails classified as spam) is low (0.1% on my mail) but not null. So, better save the presumed spams somewhere, and scan them quickly from time to time. If you'd like to enjoy a bit of attachment-based filtering, here are some procmail rules for that: :0 * ^X-Attachments:.*name=".*\.(pif|scr|exe|bat)" spambox :0 * ^X-Attachments:.*type="audio/(x-wav|x-midi) spambox :0 * ^(Content-type:.*|X-Attachments:.*cset="|^Subject:.*=\?)(ks_c|gb2312|iso-2|euc-|big5|windows-1251) spambox The first rule treats as spam every mail that has a Windows executable as attachment. These mails are typically sent by viruses. The second rule does the same with attachments of type x-wav or x-midi. I never normally receive music by e-mail, however some popular e-mail viruses seem fond of these attachment types. The third rule treats as spam every mail that uses character encodings corresponding to Korean, Chinese, Japanese, and Cyrillic. Since I don't read any of these scripts, why not get rid of the messages immediately? UPDATING THE DATABASE: At any time, you can add more known spams or known legitimate messages to the database by using the "spamoracle add" command. For instance, if you find a spam message that was not classified as such, run it through "spamoracle add -spam", so that SpamOracle can learn from its mistake. (Without additional arguments, this command will read a single message from standard input and record it as spam.) Under mutt for instance, just highlight the spam message and type |spamoracle add -spam Similarly, if you find a legitimate message while checking your spam box, run it through "spamoracle add -good". Another option is to collect more known spams or more known good messages into mailbox files, and once in a while do spamoracle add -good new_good_mails or spamoracle add -spam new_spam_mails TECHNICAL DETAILS: SpamOracle's notion of "word" is the following: - any run of 3 to 12 letters, single quotes, dashes (-) - any run of 3 to 8 digits, dots, commas, dollar, euro and percent signs. If support for non-English european languages was compiled in, letters also include the relevant accented letters for the languages in question. All words are mapped to lowercase, and accented letters are mapped to the corresponding non-accented letters. In addition, a run of three or more uppercase letters generates a pseudo-word "Un" where n is the length of the run. Similarly, a run of three or more non-ASCII characters (code >= 128) generates a pseudo-word "Wn" where n is the length of the run. For instance, the following text: SUMMER in English is written "été" in French ¹²³ is processed into the following words, assuming French support was selected: U5 summer english written ete french W3 and if French support was not selected: U5 summer english written french W3 For your edification and entertainment, the contents of the database can be dumped with the "spamoracle list " command, where is an Emacs-style regexp specifying the words you're interested in, e.g.: spamoracle list '.*' # show all words -- big list! spamoracle list 'sex.*' spamoracle list 'caml.*' It is possible to tweak many of the parameters that govern filtering via the configuration file ~/.spamoracle.conf. The configurable parameters are listed and explained in the man page spamoracle.conf (5). All parameters have reasonable default values, but you may try to tweak them to get better filtering. spamoracle-1.4/attachments.ml0100644004340400511750000000535207532373133015765 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: attachments.ml,v 1.3 2002/08/26 09:35:23 xleroy Exp $ *) (* Summarize the attachments of a message as one line that can be put in the header of the message. Allows procmail to filter suspicious attachments without looking at the message body. *) open Printf open Mail let re_content_type = Str.regexp "\\([/a-zA-Z0-9-]+\\)" let re_innocuous_content_types = Str.regexp_case_fold "text/plain\\|text/html\\|text/x-vcard\\|multipart/\\|message/rfc822\\|message/delivery-status" let re_charset = Str.regexp_case_fold "charset=\\(\"\\([^\"]+\\)\"\\|[^ \t;]+\\)" let re_innocuous_charsets = Str.regexp_case_fold "us-ascii\\|iso[-_]8859[-_]1$\\|iso[-_]8859[-_]15\\|windows-1252" let re_name = Str.regexp_case_fold "name=\\(\"\\([^\"]+\\)\"\\|[^ \t;]+\\)" let match_anchored re s = Str.string_match re s 0 let match_unanchored re s = try ignore (Str.search_forward re s 0); true with Not_found -> false let summarize msg = let res = Buffer.create 200 in let rec summ m = let h = header "content-type:" m in if match_anchored re_content_type h then begin let c = Str.matched_group 1 h in if not (Str.string_match re_innocuous_content_types c 0) then bprintf res "type=\"%s\" " c end; if match_unanchored re_charset h then begin let c = try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in if not (Str.string_match re_innocuous_charsets c 0) then bprintf res "cset=\"%s\" " c end; if match_unanchored re_name h then begin let c = try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in bprintf res "name=\"%s\" " c end; let h = header "content-disposition:" m in if match_unanchored re_name h then begin let c = try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in bprintf res "name=\"%s\" " c end; List.iter summ m.parts in List.iter summ msg.parts; Buffer.contents res spamoracle-1.4/attachments.mli0100644004340400511750000000213507532373133016132 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: attachments.mli,v 1.2 2002/08/26 09:35:23 xleroy Exp $ *) (* Summarize the attachments of a message as one line that can be put in the header of the message. Allows procmail to filter suspicious attachments without looking at the message body. *) val summarize: Mail.message -> string spamoracle-1.4/config.ml0100644004340400511750000000253007637275342014723 0ustar xleroyformel(* Configurable parameters *) let database_name = ref (try Filename.concat (Sys.getenv "HOME") ".spamoracle.db" with Not_found -> ".spamoracle.db") let html_add_tags = ref false let html_tag_attr = ref (Str.regexp_case_fold "a/href\\|img/src\\|img/alt\\|frame/src\\|font/face\\|font/color") let mail_headers = ref (Str.regexp_case_fold "from:\\|subject:") let spam_header = ref "X-Spam" let attachments_header = ref "X-Attachments" let summarize_attachments = ref true let num_words_retained = ref 15 let max_repetitions = ref 2 let low_freq_limit = ref 0.01 let high_freq_limit = ref 0.99 let good_mail_prob = ref 0.2 let spam_mail_prob = ref 0.8 let min_meaningful_words = ref 5 open Configfile let options = [ "database_file", String database_name; "html_retain_tags", Bool html_add_tags; "html_tag_attributes", Regexp html_tag_attr; "mail_headers", Regexp mail_headers; "spam_header", String spam_header; "attachments_header", String attachments_header; "summarize_attachments", Bool summarize_attachments; "num_meaningful_words", Int num_words_retained; "max_repetitions", Int max_repetitions; "low_freq_limit", Float low_freq_limit; "high_freq_limit", Float high_freq_limit; "min_meaningful_words", Int min_meaningful_words; "good_mail_prob", Float good_mail_prob; "spam_mail_prob", Float spam_mail_prob ] spamoracle-1.4/config.mli0100644004340400511750000000313307637275342015074 0ustar xleroyformel(* Configurable parameters *) val database_name : string ref (** Name of database file *) val html_add_tags : bool ref (** Whether to treat HTML tag names as words *) val html_tag_attr : Str.regexp ref (** Regexp matching [tag/attr] strings denoting pairs of HTML tag and attribute names. If a tag and attribute pair matches, the associated value is added to the text. *) val mail_headers : Str.regexp ref (** Regexp matching names of e-mail headers that must be analyzed. *) val spam_header : string ref (** Name of header added with spam / not-spam info (default: "X-Spam") *) val attachments_header : string ref (** Name of header added with attachment summary (default: "X-Attachments") *) val summarize_attachments : bool ref (** Whether to generate the attachment summary *) val num_words_retained : int ref (** Number of meaningful words to retain for computing final prob. *) val max_repetitions : int ref (** Among the meaningful words, max number of time a given word can appear. *) val low_freq_limit : float ref (** Lower limit for word frequencies. Default is 0.01. *) val high_freq_limit : float ref (** Upper limit for word frequencies. Default is 0.99. *) val min_meaningful_words : int ref (** Number of meaningful words below which mails are classified as unknown *) val good_mail_prob : float ref (** Spam probability below which mails are classified as good *) val spam_mail_prob : float ref (** Spam probability below which mails are classified as spam *) val options : (string * Configfile.value) list (** List of configurable parameters *) spamoracle-1.4/configfile.ml0100644004340400511750000000537407677607503015576 0ustar xleroyformel(* Parsing configuration files *) type value = | Bool of bool ref | String of string ref | Int of int ref | Float of float ref | Regexp of Str.regexp ref | OptBool of bool option ref | OptString of string option ref | OptInt of int option ref | OptFloat of float option ref | OptRegexp of Str.regexp option ref exception Error of string let re_bool_yes = Str.regexp_case_fold "\\(on\\|yes\\|true\\|1\\)[ \t\r]*$" let re_bool_no = Str.regexp_case_fold "\\(off\\|no\\|false\\|0\\)[ \t\r]*$" let parse_bool data = if Str.string_match re_bool_yes data 0 then true else if Str.string_match re_bool_no data 0 then false else raise (Error "invalid boolean value") let trim_spaces s = let i = ref (String.length s - 1) in while !i >= 0 && (let c = s.[!i] in c = ' ' || c = '\t' || c = '\r') do decr i done; String.sub s 0 (!i + 1) let parse_string data = try Scanf.sscanf data "%S" (fun s -> s) with Scanf.Scan_failure _ -> trim_spaces data let parse_int data = try Scanf.sscanf data "%i" (fun n -> n) with Scanf.Scan_failure _ | Failure _ -> raise (Error ("invalid integer value")) let parse_float data = try Scanf.sscanf data "%f" (fun n -> n) with Scanf.Scan_failure _ | Failure _ -> raise (Error ("invalid floating-point value")) let parse_regexp data = try Str.regexp_case_fold (trim_spaces data) with Failure msg -> raise (Error ("invalid regular expression: " ^ msg)) let parse_data valuedesc data = match valuedesc with | Bool r -> r := parse_bool data | String r -> r := parse_string data | Int r -> r := parse_int data | Float r -> r := parse_float data | Regexp r -> r := parse_regexp data | OptBool r -> r := Some(parse_bool data) | OptString r -> r := Some(parse_string data) | OptInt r -> r := Some(parse_int data) | OptFloat r -> r := Some(parse_float data) | OptRegexp r -> r := Some(parse_regexp data) let re_line = Str.regexp "[ \t]*\\([A-Za-z][A-Za-z0-9_]*\\)[ \t]*=[ \t]*\\(.*\\)" let re_skip = Str.regexp "#\\|[ \t\r]*$" let parse_line opts s = if Str.string_match re_line s 0 then begin let key = Str.matched_group 1 s and data = Str.matched_group 2 s in try parse_data (List.assoc key opts) data with Not_found -> raise (Error ("unknown variable " ^ key)) end else if not (Str.string_match re_skip s 0) then raise (Error "ill-formed line") let parse opts filename = let ic = open_in filename in let lineno = ref 1 in let errors = ref [] in begin try while true do let s = input_line ic in begin try parse_line opts s with Error msg -> errors := (!lineno, msg) :: !errors end; incr lineno done with End_of_file -> close_in ic end; !errors spamoracle-1.4/configfile.mli0100644004340400511750000000061707637275342015740 0ustar xleroyformel(* Parsing configuration files *) type value = | Bool of bool ref | String of string ref | Int of int ref | Float of float ref | Regexp of Str.regexp ref | OptBool of bool option ref | OptString of string option ref | OptInt of int option ref | OptFloat of float option ref | OptRegexp of Str.regexp option ref val parse: (string * value) list -> string -> (int * string) list spamoracle-1.4/database.ml0100644004340400511750000001363507665640432015227 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: database.ml,v 1.8 2003/05/30 11:31:38 xleroy Exp $ *) (* Word frequency database *) exception Error of string type short = { s_num_good: int; s_num_spam: int; s_freq: (string, int * int) Hashtbl.t } type full = { mutable f_num_good: int; mutable f_num_spam: int; f_high_freq: (string, int * int) Hashtbl.t; f_low_freq: (string, int * int) Hashtbl.t } let magic = "Mailscrubber0001" let check_magic filename ic = let buf = String.create (String.length magic) in really_input ic buf 0 (String.length magic); if buf <> magic then raise(Error(filename ^ ": bad magic number")) type db_chan = {zipped : bool ; ic : in_channel} let open_db filename = if Filename.check_suffix filename ".gz" then { ic = Unix.open_process_in ("gunzip -c " ^ filename); zipped = true; } else { ic = open_in_bin filename ; zipped = false } let close_db {zipped = zipped ; ic = ic } = if zipped then ignore(Unix.close_process_in ic) else close_in ic let marshal_from_channel filename ic = try Marshal.from_channel ic with Failure _ -> raise (Error(filename ^ ": database is corrupted")) let read_short filename = let {ic=ic ; zipped=zipped} as db_ic = open_db filename in check_magic filename ic; let ng = input_binary_int ic in let ns = input_binary_int ic in let freq = marshal_from_channel filename ic in close_db db_ic; { s_num_good = ng; s_num_spam = ns; s_freq = freq } let read_full filename = let {ic=ic ; zipped=zipped} as db_ic = open_db filename in check_magic filename ic; let ng = input_binary_int ic in let ns = input_binary_int ic in let high_freq = marshal_from_channel filename ic in let low_freq = marshal_from_channel filename ic in close_db db_ic; { f_num_good = ng; f_num_spam = ns; f_low_freq = low_freq; f_high_freq = high_freq } let temp_file basename = let pid = Unix.getpid() in let rec tmpfile counter = if counter > 10000 then raise (Error "cannot create temporary database"); let filename = basename ^ string_of_int (pid + counter) in try (filename, open_out_gen [Open_wronly; Open_creat; Open_excl; Open_binary] 0o600 filename) with Sys_error _ -> tmpfile (counter + 1) in tmpfile 0 let write_full filename db = let basename, zip = if Filename.check_suffix filename ".gz" then Filename.chop_suffix filename ".gz", true else filename, false in let (tempname, oc) = temp_file (basename ^ ".tmp") in output_string oc magic; output_binary_int oc db.f_num_good; output_binary_int oc db.f_num_spam; Marshal.to_channel oc db.f_high_freq [Marshal.No_sharing]; Marshal.to_channel oc db.f_low_freq [Marshal.No_sharing]; close_out oc; if zip then begin let r = Sys.command ("gzip -best " ^ tempname) in if r = 0 then Sys.rename (tempname ^ ".gz") filename else Sys.rename tempname basename end else Sys.rename tempname filename let create sz = { f_num_good = 0; f_num_spam = 0; f_high_freq = Hashtbl.create sz; f_low_freq = Hashtbl.create sz } let add_good db w = begin try let (g, s as f) = Hashtbl.find db.f_high_freq w in Hashtbl.replace db.f_high_freq w (g+1, s) with Not_found -> try let (g, s as f) = Hashtbl.find db.f_low_freq w in let g' = g + 1 in if 2 * g' + s >= 5 then begin Hashtbl.remove db.f_low_freq w; Hashtbl.add db.f_high_freq w (g', s) end else Hashtbl.replace db.f_low_freq w (g', s) with Not_found -> Hashtbl.add db.f_low_freq w (1, 0) end let add_spam db w = begin try let (g, s) = Hashtbl.find db.f_high_freq w in Hashtbl.replace db.f_high_freq w (g, s+1) with Not_found -> try let (g, s) = Hashtbl.find db.f_low_freq w in let s' = s + 1 in if 2 * g + s' >= 5 then begin Hashtbl.remove db.f_low_freq w; Hashtbl.add db.f_high_freq w (g, s') end else Hashtbl.replace db.f_low_freq w (g, s') with Not_found -> Hashtbl.add db.f_low_freq w (0, 1) end open Printf let dump db oc = let dump_entry w (g, s) = fprintf oc "%s %d %d\n" w g s in fprintf oc "SPAMORACLE/1 %d %d\n" db.f_num_good db.f_num_spam; Hashtbl.iter dump_entry db.f_high_freq; Hashtbl.iter dump_entry db.f_low_freq let split s = try let i = String.index s ' ' in let j = String.index_from s (i + 1) ' ' in (String.sub s 0 i, int_of_string (String.sub s (i + 1) (j - i - 1)), int_of_string (String.sub s (j + 1) (String.length s - j - 1))) with Not_found -> raise(Error("Database restoration: ill-formed line `" ^ String.escaped s ^ "'")) let restore ic = let db = create 997 in begin try let (w, ng, ns) = split (input_line ic) in if w <> "SPAMORACLE/1" then raise (Error("Database restoration: wrong version")); db.f_num_good <- ng; db.f_num_spam <- ns with End_of_file -> raise (Error("Database restoration: first line missing")); end; begin try while true do let (w, g, s) = split (input_line ic) in if 2 * g + s >= 5 then Hashtbl.add db.f_high_freq w (g, s) else Hashtbl.add db.f_low_freq w (g, s) done with End_of_file -> () end; db spamoracle-1.4/database.mli0100644004340400511750000000267007664101014015360 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: database.mli,v 1.4 2003/05/25 08:42:52 xleroy Exp $ *) (** Word frequency database *) type short = { s_num_good: int; s_num_spam: int; s_freq: (string, int * int) Hashtbl.t } type full = { mutable f_num_good: int; mutable f_num_spam: int; f_high_freq: (string, int * int) Hashtbl.t; f_low_freq: (string, int * int) Hashtbl.t } val read_short: string -> short val read_full: string -> full val write_full: string -> full -> unit val create: int -> full val add_good: full -> string -> unit val add_spam: full -> string -> unit val dump: full -> out_channel -> unit val restore: in_channel -> full exception Error of string spamoracle-1.4/htmlscan.mli0100644004340400511750000000201307604776677015447 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: htmlscan.mli,v 1.1 2003/01/02 08:52:15 xleroy Exp $ *) (** Approximate HTML scanner. Extracts words and certain parameters of certain tags (e.g. URLs) from HTML text. *) val extract_text: string -> string spamoracle-1.4/htmlscan.mll0100644004340400511750000001666007723412006015436 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: htmlscan.mll,v 1.7 2003/08/28 14:39:02 xleroy Exp $ *) (** Approximate HTML scanner. Extracts words from HTML text, as well as certain parameters of certain tags (e.g. URLs). *) { module StringSet = Set.Make(String) module StringMap = Map.Make(String) let re_url_encoding = Str.regexp "%\\([0-9A-Fa-f][0-9A-Fa-f]\\)" let decode_url_percent s = let n = int_of_string ("0x" ^ Str.matched_group 1 s) in String.make 1 (Char.chr n) let decode_url s = Str.global_substitute re_url_encoding decode_url_percent s let entity_table = List.fold_left (fun t (s, c) -> StringMap.add s c t) StringMap.empty ["amp", '&'; "lt", '<'; "gt", '>'; "nbsp", '\160'; "Agrave", '\192'; "Aacute", '\193'; "Acirc", '\194'; "Atilde", '\195'; "Auml", '\196'; "Aring", '\197'; "AElig", '\198'; "Ccedil", '\199'; "Egrave", '\200'; "Eacute", '\201'; "Ecirc", '\202'; "Euml", '\203'; "Igrave", '\204'; "Iacute", '\205'; "Icirc", '\206'; "Iuml", '\207'; "ETH", '\208'; "Ntilde", '\209'; "Ograve", '\210'; "Oacute", '\211'; "Ocirc", '\212'; "Otilde", '\213'; "Ouml", '\214'; "times", '\215'; "Oslash", '\216'; "Ugrave", '\217'; "Uacute", '\218'; "Ucirc", '\219'; "Uuml", '\220'; "Yacute", '\221'; "THORN", '\222'; "szlig", '\223'; "agrave", '\224'; "aacute", '\225'; "acirc", '\226'; "atilde", '\227'; "auml", '\228'; "aring", '\229'; "aelig", '\230'; "ccedil", '\231'; "egrave", '\232'; "eacute", '\233'; "ecirc", '\234'; "euml", '\235'; "igrave", '\236'; "iacute", '\237'; "icirc", '\238'; "iuml", '\239'; "eth", '\240'; "ntilde", '\241'; "ograve", '\242'; "oacute", '\243'; "ocirc", '\244'; "otilde", '\245'; "ouml", '\246'; "divide", '\247'; "oslash", '\248'; "ugrave", '\249'; "uacute", '\250'; "ucirc", '\251'; "uuml", '\252'; "yacute", '\253'; "thorn", '\254'; "yuml", '\255'] let word_breaking_tags = List.fold_right StringSet.add [ "p"; "br"; "ul"; "ol"; "dt"; "li"; "dd"; "table"; "tr"; "th"; "td"; "img"; "div"; "blockquote"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "address" ] StringSet.empty module Output = struct type t = { txt: Buffer.t; extra: Buffer.t } let create() = { txt = Buffer.create 2048; extra = Buffer.create 256 } let clear ob = Buffer.clear ob.txt; Buffer.clear ob.extra let contents ob = Buffer.add_char ob.txt '\n'; Buffer.add_buffer ob.txt ob.extra; Buffer.contents ob.txt let char ob c = Buffer.add_char ob.txt c let string ob s = Buffer.add_string ob.txt s let add_extra ob s = Buffer.add_string ob.extra s; Buffer.add_char ob.extra '\n' let tag ob t = if StringSet.mem t word_breaking_tags then char ob ' '; if !Config.html_add_tags then add_extra ob t let tag_attr ob t n s = let n = String.lowercase n in if Str.string_match !Config.html_tag_attr (t ^ "/" ^ n) 0 then if n = "href" || n = "src" then add_extra ob (decode_url s) else add_extra ob s end let ob = Output.create() let tag = ref "" let attr_name = ref "" let attr_value = Buffer.create 128 } let ws = [' ' '\n' '\r' '\t'] let name = ['A'-'Z' 'a'-'z'] ['A'-'Z' 'a'-'z' '0'-'9' '.' '-']* let unquotedattrib = [^ '\'' '\"' '>' ' ' '\n' '\r' '\t'] [^ '>' ' ' '\n' '\r' '\t']* rule main = parse "" (* tolerance *) { () } | _ (* tolerance *) { comment lexbuf } | eof (* tolerance *) { () } and tagbody = parse ">" { Output.tag ob !tag } | name { attr_name := Lexing.lexeme lexbuf; tagattrib lexbuf; tagbody lexbuf } | _ (* tolerance -- should be ws *) { tagbody lexbuf } | eof (* tolerance *) { Output.tag ob !tag } and tagattrib = parse ws* '=' ws* { tagvalue lexbuf } | "" { Output.tag_attr ob !tag !attr_name "" } and tagvalue = parse "'" { Buffer.clear attr_value; singlequoted lexbuf } | "\"" { Buffer.clear attr_value; doublequoted lexbuf } | unquotedattrib { Output.tag_attr ob !tag !attr_name (Lexing.lexeme lexbuf) } | "" (* tolerance *) { Output.tag_attr ob !tag !attr_name "" } and singlequoted = parse "'" | eof (* eof is tolerance *) { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) } | "&" { Buffer.add_char attr_value (entity lexbuf); singlequoted lexbuf } | _ { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0); singlequoted lexbuf } and doublequoted = parse "\"" | eof (* eof is tolerance *) { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) } | "&" { Buffer.add_char attr_value (entity lexbuf); doublequoted lexbuf } | _ { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0); doublequoted lexbuf } and entity = parse '#' ['0'-'9']+ { let s = Lexing.lexeme lexbuf in let n = int_of_string (String.sub s 1 (String.length s - 1)) in entity_end lexbuf; if n >= 0 && n <= 255 then Char.chr n else '\255' } | name { let s = Lexing.lexeme lexbuf in entity_end lexbuf; try StringMap.find s entity_table with Not_found -> '\255' } | "" (* tolerance *) { '&' } and entity_end = parse ";" ? { () } { let extract_text s = Output.clear ob; main (Lexing.from_string s) } spamoracle-1.4/mail.ml0100644004340400511750000001540207723411756014377 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: mail.ml,v 1.8 2003/08/28 14:38:38 xleroy Exp $ *) (** Parsing of e-mail messages, including attachments *) type message = { headers: (string * string) list; body: string; parts: message list } let base64_decode_char c = match c with 'A' .. 'Z' -> Char.code c - 65 | 'a' .. 'z' -> Char.code c - 97 + 26 | '0' .. '9' -> Char.code c - 48 + 52 | '+' -> 62 | '/' -> 63 | _ -> -1 let decode_base64 s = let d = Buffer.create (String.length s * 3 / 4) in let buf = Array.create 4 0 in let pos = ref 0 in for i = 0 to String.length s - 1 do let n = base64_decode_char s.[i] in if n >= 0 then begin buf.(!pos) <- n; incr pos; if !pos = 4 then begin Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4)); Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2)); Buffer.add_char d (Char.chr((buf.(2) land 3) lsl 6 + buf.(3))); pos := 0 end end done; begin match !pos with 2 -> Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4)) | 3 -> Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4)); Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2)) | _ -> () end; Buffer.contents d let hexa_digit c = if c >= '0' && c <= '9' then Char.code c - 48 else if c >= 'A' && c <= 'F' then Char.code c - 65 + 10 else if c >= 'a' && c <= 'f' then Char.code c - 97 + 10 else raise Not_found let decode_qp s = let len = String.length s in let d = Buffer.create (String.length s) in let pos = ref 0 in while !pos < len do let c = s.[!pos] in if c = '=' && !pos + 1 < len && s.[!pos + 1] = '\n' then begin pos := !pos + 2 end else if c = '=' && !pos + 2 < len then begin try let h1 = hexa_digit s.[!pos + 1] and h2 = hexa_digit s.[!pos + 2] in Buffer.add_char d (Char.chr(h1 lsl 4 + h2)); pos := !pos + 3 with Not_found -> Buffer.add_char d c; incr pos end else begin Buffer.add_char d c; incr pos end done; Buffer.contents d let re_base64 = Str.regexp_case_fold "base64" let re_qp = Str.regexp_case_fold "quoted-printable" let decode encoding s = if Str.string_match re_base64 encoding 0 then decode_base64 s else if Str.string_match re_qp encoding 0 then decode_qp s else s let re_encoded_header = Str.regexp "=\\?[_A-Za-z0-9-]+\\?\\([BbQq]\\)\\?\\([^?]*\\)\\?=" let decode_header s = let decode_group s = let enc = Str.matched_group 1 s and txt = Str.matched_group 2 s in match enc with "B" | "b" -> decode_base64 txt | "Q" | "q" -> decode_qp txt | _ -> assert false in Str.global_substitute re_encoded_header decode_group s let re_continuation = Str.regexp "\n[ \t]+" let re_nl = Str.regexp "\n" let re_field = Str.regexp "\\([A-Za-z-]+[: ]\\)[ \t]*\\(.*\\)" let parse_header s = let rec parse_field accu = function [] -> List.rev accu | line :: rem -> if Str.string_match re_field line 0 then begin let field_name = String.lowercase (Str.matched_group 1 line) and field_val = Str.matched_group 2 line in parse_field ((field_name, decode_header field_val) :: accu) rem end else parse_field accu rem in parse_field [] (Str.split re_nl (Str.global_replace re_continuation " " s)) let find_header name headers = try List.assoc name headers with Not_found -> "" let re_nl_nl = Str.regexp "\n\n" let re_multipart = Str.regexp_case_fold "multipart/.*boundary *= *\\(\"\\([^\"]+\\)\"\\|\\([^ \t]+\\)\\)" let rec parse_message s = try let pos_sep = Str.search_forward re_nl_nl s 0 in let headers = parse_header (String.sub s 0 pos_sep) in let body = String.sub s (pos_sep + 2) (String.length s - pos_sep - 2) in let encoding = find_header "content-transfer-encoding:" headers in let ctype = find_header "content-type:" headers in if Str.string_match re_multipart ctype 0 then begin let boundary = try Str.matched_group 2 ctype with Not_found -> try Str.matched_group 3 ctype with Not_found -> assert false in let re_bound = Str.regexp ("--" ^ Str.quote boundary ^ "[ \t\n]*") in match Str.split_delim re_bound body with [] -> { headers = headers; body = decode encoding body; parts = [] } | blurb :: parts -> { headers = headers; body = decode encoding blurb; parts = List.map parse_message parts } end else { headers = headers; body = decode encoding body; parts = [] } with Not_found -> { headers = []; body = s; parts = [] } let header s msg = let rec hdr = function [] -> [] | (h,v) :: rem -> if h = s then v :: hdr rem else hdr rem in String.concat "\n" (hdr msg.headers) let header_matches s re msg = let rec hmatch = function [] -> false | (h,v) :: rem -> (h = s && Str.string_match re v 0) || hmatch rem in hmatch msg.headers let re_content_text = Str.regexp_case_fold "text/plain\\|text/enriched\\|text;\\|text$" let re_content_html = Str.regexp_case_fold "text/html" let re_content_message_rfc822 = Str.regexp_case_fold "message/rfc822" let re_content_multipart = Str.regexp_case_fold "multipart/" let rec iter_text_parts fn m = if header_matches "content-type:" re_content_text m || not(List.mem_assoc "content-type:" m.headers) then fn m else if header_matches "content-type:" re_content_html m then fn {m with body = Htmlscan.extract_text m.body} else if header_matches "content-type:" re_content_multipart m then begin fn m; List.iter (iter_text_parts fn) m.parts end else if header_matches "content-type:" re_content_message_rfc822 m then iter_text_parts fn (parse_message m.body) else () let iter_message fn msg = List.iter (fun (h, v) -> if Str.string_match !Config.mail_headers h 0 then fn v) msg.headers; iter_text_parts (fun m -> fn m.body) msg spamoracle-1.4/mail.mli0100644004340400511750000000447507637275342014563 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: mail.mli,v 1.3 2003/03/23 09:14:10 xleroy Exp $ *) (** Parsing of e-mail messages, including attachments *) type message = { headers: (string * string) list; body: string; parts: message list } (** The type of parsed e-mail messages. - [headers] is an A-list of pairs [(header-name, header-content)]. [header-name] is lowercased and includes [:], e.g. [subject:]. - [body] is the body of the message. Base64 and quoted-printable encodings are already decoded. For multipart messages, [body] is the initial blurb before the first part. - [parts] is empty except for multipart messages, in which case it lists all parts, recursively represented as messages. *) val parse_message: string -> message (** Parse the given textual message and return its structure. *) val header: string -> message -> string (** [header h msg] returns the contents of header named [h] in message [msg], or the empty string if this header is missing. Remember that header names are lowercased and include the final [:], e.g. [subject:]. *) val iter_text_parts: (message -> unit) -> message -> unit (** [iter_text_parts fn msg] applies [fn] to every (sub-)message contained in [msg] that is of type text. *) val iter_message: (string -> unit) -> message -> unit (** [iter_message fn msg] applies [fn] to the following parts of message [msg]: - the headers that match [!Config.mail_headers]; - the body of every sub-message of [msg] that is of type text. *) spamoracle-1.4/main.ml0100644004340400511750000002171007724121443014370 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: main.ml,v 1.14 2003/08/30 13:14:11 xleroy Exp $ *) (* Argument parsing and main program *) open Printf open Mbox open Database open Processing exception Usage of string let default_config_name = try Filename.concat (Sys.getenv "HOME") ".spamoracle.conf" with Not_found -> ".spamoracle.conf" let parse_config_file file = try let errs = Configfile.parse Config.options file in if errs <> [] then begin eprintf "Error while reading configuration file %s:\n" file; List.iter (fun (line, msg) -> eprintf "Line %d: %s\n" line msg) errs; exit 2 end with Sys_error msg -> eprintf "Cannot read configuration file %s:\n%s\n" file msg let mark_command args = let db = Database.read_short !Config.database_name in if args = [] then mark_message db (read_single_msg stdin) else List.iter (fun f -> mbox_file_iter f (mark_message db)) args let add_command args = let db = try Database.read_full !Config.database_name with Sys_error _ -> Database.create 997 in let processed = ref false and is_spam = ref true and verbose = ref false in let rec parse_args = function | "-v" :: rem -> verbose := true; parse_args rem | "-spam" :: rem -> is_spam := true; parse_args rem | "-good" :: rem -> is_spam := false; parse_args rem | f :: rem -> mbox_file_iter f (add_message db !verbose !is_spam); processed := true; parse_args rem | [] -> if not !processed then add_message db !verbose !is_spam (read_single_msg stdin); if !verbose then printf "\r%6d / %6d good / spam messages\n" db.f_num_good db.f_num_spam in parse_args args; Database.write_full !Config.database_name db let list_command args = let db = Database.read_full !Config.database_name in let res = ref [] in List.iter (fun s -> let re = Str.regexp (s ^ "$") in let match_word w (g, s) = if Str.string_match re w 0 then begin let p = if 2 * g + s < 5 then -1.0 else begin let pgood = float (2 * g) /. float db.f_num_good and pspam = float s /. float db.f_num_spam in max !Config.low_freq_limit (min !Config.high_freq_limit (pspam /. (pgood +. pspam))) end in res := (w, p, g, s) :: !res end in Hashtbl.iter match_word db.f_high_freq; Hashtbl.iter match_word db.f_low_freq) args; if !res = [] then Printf.printf "No matching word found in database.\n" else begin Printf.printf " Word Proba #good #spam\n"; List.iter (fun (w, p, g, s) -> if p >= 0.0 then Printf.printf "%-15s%8.2f%8d%8d\n" w p g s else Printf.printf "%-15s ----%8d%8d\n" w g s) (List.sort (fun (_, p1, _, _) (_, p2, _, _) -> compare p2 p1) !res) end let test_command args = let db = Database.read_short !Config.database_name in let low = ref 0.0 and high = ref 1.0 in let rec parse_args = function | "-min" :: s :: rem -> begin try low := float_of_string s with Failure _ -> raise(Usage("bad argument to -min")) end; parse_args rem | "-min" :: [] -> raise(Usage("no argument to -min")) | "-max" :: s :: rem -> begin try high := float_of_string s with Failure _ -> raise(Usage("bad argument to -max")) end; parse_args rem | "-max" :: [] -> raise(Usage("no argument to -max")) | f :: rem -> mbox_file_iter f (test_message db !low !high f); parse_args rem | [] -> () in parse_args args let stat_command args = let db = Database.read_short !Config.database_name in let stat_mbox f = let num_msgs = ref 0 and num_good = ref 0 and num_spam = ref 0 and num_unknown = ref 0 in mbox_file_iter f (fun s -> incr num_msgs; match stat_message db s with Msg_good -> incr num_good | Msg_spam -> incr num_spam | Msg_unknown -> incr num_unknown); let percentage a b = 100.0 *. float a /. float b in if !num_msgs > 0 then printf "%s: %.2f%% good, %.2f%% unknown, %.2f%% spam\n" f (percentage !num_good !num_msgs) (percentage !num_unknown !num_msgs) (percentage !num_spam !num_msgs) in List.iter stat_mbox args let words_command args = if args = [] then wordsplit_message (read_single_msg stdin) else List.iter (fun f -> mbox_file_iter f (fun msg -> print_string "----------------------------------------\n"; wordsplit_message msg)) args let backup_command () = Database.dump (Database.read_full !Config.database_name) stdout let restore_command () = Database.write_full !Config.database_name (Database.restore stdin) let rec parse_args_1 = function "-config" :: file :: rem -> parse_config_file file; parse_args_2 rem | "-config" :: [] -> raise(Usage("Option -config requires an argument")) | rem -> if Sys.file_exists default_config_name then parse_config_file default_config_name; parse_args_2 rem and parse_args_2 = function | "-f" :: file :: rem -> Config.database_name := file; parse_args_3 rem | "-f" :: [] -> raise(Usage("Option -f requires an argument")) | rem -> parse_args_3 rem and parse_args_3 = function "mark" :: rem -> mark_command rem | "add" :: rem -> add_command rem | "list" :: rem -> list_command rem | "test" :: rem -> test_command rem | "stat" :: rem -> stat_command rem | "backup" :: rem -> backup_command () | "restore" :: rem -> restore_command () | "words" :: rem -> words_command rem | s :: rem -> raise(Usage("Unknown command " ^ s)) | [] -> raise(Usage "") let usage_string = "\ Usage: spamoracle [-config conf] [-f db] mark {mailbox}* Add 'X-Spam:' headers to messages with result of analysis {mailbox}* Mailboxes containing messages to analyze and mark If none given, read single msg from standard input spamoracle [-config conf] [-f db] add [-v] -spam {spambox}* -good {goodbox}* Create or update database with known spam or non-spam messages -v Print progress bar -spam Indicate subsequent mailboxes contain spam -good Indicate subsequent mailboxes contain good msgs (not spam) {spambox}* Mailboxes containing spam {goodbox}* Mailboxes containing good messages (not spam) If no mailbox given, read single msg from standard input spamoracle [-config conf] [-f db] test [-min prob] [-max prob] {mailbox}* Analyze messages and print summary of results for each message -min Don't print messages with result below -max Don't print messages with result above {mailbox}* Mailboxes containing messages to analyze spamoracle [-config conf] [-f db] stat {mailbox}* Analyze messages and print percentages of spam/non-spam for each mailbox {mailbox}* Mailboxes containing messages to analyze spamoracle [-config conf] [-f db] list {regexp}* Dump word statistics in database {regexp}* Regular expressions for words we are interested in spamoracle [-config conf] [-f db] backup > database.backup Dump whole database in portable text format on standard output spamoracle [-config conf] [-f db] restore < database.backup Restore database from text backup file read from standard input spamoracle [-config conf] [-f db] words {mailbox}* Extract words from messages and print them {mailbox}* Mailboxes containing messages to scan If no mailbox given, read single msg from standard input Common options: -config Configuration file (default $HOME/.spamoracle.conf) -f Database to use (default $HOME/.spamoracle.db)" let main () = try parse_args_1 (List.tl (Array.to_list Sys.argv)) with | Usage msg -> eprintf "%s\n%s\n" msg usage_string; exit 2 | Sys_error msg -> eprintf "System error: %s\n" msg | Database.Error msg -> eprintf "%s\n" msg let _ = main() spamoracle-1.4/mbox.ml0100644004340400511750000000531407532373135014417 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: mbox.ml,v 1.4 2002/08/26 09:35:25 xleroy Exp $ *) (* Reading of a mailbox file and splitting into individual messages *) type t = { ic: in_channel; zipped: bool; mutable start: string; buf: Buffer.t } let open_mbox_file filename = if Filename.check_suffix filename ".gz" then { ic = Unix.open_process_in ("gunzip -c " ^filename); zipped = true; start = ""; buf = Buffer.create 50000 } else { ic = open_in filename; zipped = false; start = ""; buf = Buffer.create 50000 } let open_mbox_channel ic = { ic = ic; zipped = false; start = ""; buf = Buffer.create 50000 } let read_msg t = Buffer.clear t.buf; Buffer.add_string t.buf t.start; let rec read () = let line = input_line t.ic in if String.length line >= 5 && String.sub line 0 5 = "From " && Buffer.length t.buf > 0 then begin t.start <- (line ^ "\n"); Buffer.contents t.buf end else begin Buffer.add_string t.buf line; Buffer.add_char t.buf '\n'; read () end in try read() with End_of_file -> if Buffer.length t.buf > 0 then begin t.start <- ""; Buffer.contents t.buf end else raise End_of_file let close_mbox t = if t.zipped then ignore(Unix.close_process_in t.ic) else close_in t.ic let mbox_file_iter filename fn = let ic = open_mbox_file filename in try while true do fn(read_msg ic) done with End_of_file -> close_mbox ic let mbox_channel_iter inchan fn = let ic = open_mbox_channel inchan in try while true do fn(read_msg ic) done with End_of_file -> close_mbox ic let read_single_msg inchan = let res = Buffer.create 10000 in let buf = String.create 1024 in let rec read () = let n = input inchan buf 0 (String.length buf) in if n > 0 then begin Buffer.add_substring res buf 0 n; read () end in read (); Buffer.contents res spamoracle-1.4/mbox.mli0100644004340400511750000000375507532373135014577 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: mbox.mli,v 1.3 2002/08/26 09:35:25 xleroy Exp $ *) (** Reading of a mailbox file and splitting into individual messages *) type t (** The type of channels opened on a mailbox *) val open_mbox_file: string -> t (** Open the given file name as a mailbox, and return an mbox channel ready for reading. If the file name ends in [.gz], arrange for on-the-fly decompression with [zcat]. *) val open_mbox_channel: in_channel -> t (** Open the given input channel as a mailbox. *) val read_msg: t -> string (** Read the next message from the given channel, and return it as a string. Raise [End_of_file] if no message remains. *) val close_mbox: t -> unit (** Close the given mbox channel. *) val mbox_file_iter: string -> (string -> unit) -> unit (** [mbox_file_iter filename fn] reads messages from the file named [filename], and applies [fn] in turn to each message. *) val mbox_channel_iter: in_channel -> (string -> unit) -> unit (** [mbox_channel_iter ic fn] reads messages from the input channel [ic], and applies [fn] in turn to each message. *) val read_single_msg: in_channel -> string (** Read one message from the given channel, up to end of file *) spamoracle-1.4/processing.ml0100644004340400511750000000701107724121443015616 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: processing.ml,v 1.8 2003/08/30 13:14:11 xleroy Exp $ *) (* Processing messages *) open Printf open Mail open Database open Rankmsg (* Mark message with rank info *) let re_nl_nl = Str.regexp "\n\n" let mark_message db txt = let m = parse_message txt in let r = rank_message db m in try let pos_sep = Str.search_forward re_nl_nl txt 0 in output stdout txt 0 pos_sep; let verdict = if r.spam_prob <= !Config.good_mail_prob && r.num_meaningful >= !Config.min_meaningful_words then "no" else if r.spam_prob >= !Config.spam_mail_prob && r.num_meaningful >= !Config.min_meaningful_words then "yes" else "unknown" in printf "\n%s: %s; %.2f; %s" !Config.spam_header verdict r.spam_prob r.explanation; if !Config.summarize_attachments then begin let att = Attachments.summarize m in if att <> "" then printf "\n%s: %s" !Config.attachments_header att; end; output stdout txt pos_sep (String.length txt - pos_sep) with Not_found -> print_string txt (* Add messages to database *) let record_words db is_spam txt = Wordsplit.iter (fun w -> if is_spam then add_spam db w else add_good db w) txt let add_message db verbose is_spam msg = if verbose then begin printf "\r%6d / %6d" db.f_num_good db.f_num_spam; flush stdout end; iter_message (record_words db is_spam) (parse_message msg); if is_spam then db.f_num_spam <- db.f_num_spam + 1 else db.f_num_good <- db.f_num_good + 1 (* Test analysis on a message *) let test_message db low high f txt = let msg = parse_message txt in let r = rank_message db msg in if r.spam_prob >= low && r.spam_prob <= high then begin printf "--------------------------------------------------\n"; printf "From: %s\n" (header "from:" msg); printf "Subject: %s\n" (header "subject:" msg); printf "Score: %.2f -- %d\n" r.spam_prob r.num_meaningful; printf "Details: %s\n" r.explanation; if !Config.summarize_attachments then begin let att = Attachments.summarize msg in if att <> "" then printf "Attachments: %s\n" att end; printf "File: %s\n" f; end (* Statistics *) type message_class = Msg_good | Msg_unknown | Msg_spam let stat_message db txt = let msg = parse_message txt in let r = rank_message db msg in if r.spam_prob <= 0.2 && r.num_meaningful >= 5 then Msg_good else if r.spam_prob >= 0.8 && r.num_meaningful >= 5 then Msg_spam else Msg_unknown (* Word splitting *) let wordsplit_message txt = Format.open_hovbox 0; Mail.iter_message (fun txt -> Wordsplit.iter (fun word -> Format.print_string word; Format.print_space()) txt) (parse_message txt); Format.close_box(); Format.print_newline() spamoracle-1.4/processing.mli0100644004340400511750000000245007724121443015771 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: processing.mli,v 1.4 2003/08/30 13:14:11 xleroy Exp $ *) (* Processing messages *) val mark_message : Database.short -> string -> unit val record_words : Database.full -> bool -> string -> unit val add_message : Database.full -> bool -> bool -> string -> unit val test_message : Database.short -> float -> float -> string -> string -> unit type message_class = Msg_good | Msg_unknown | Msg_spam val stat_message : Database.short -> string -> message_class val wordsplit_message : string -> unit spamoracle-1.4/rankmsg.ml0100644004340400511750000000554407637275343015131 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: rankmsg.ml,v 1.4 2003/03/23 09:14:11 xleroy Exp $ *) (* Message ranking *) open Mail open Database let word_count_in w res = let count = ref 0 in for i = 0 to Array.length res - 1 do if w = fst res.(i) then incr count done; !count let add_word w p res = let i = ref 0 in while !i < Array.length res && abs_float (p -. 0.5) <= abs_float(snd res.(!i) -. 0.5) do incr i done; if !i < Array.length res then begin for j = Array.length res - 1 downto !i + 1 do res.(j) <- res.(j - 1) done; res.(!i) <- (w, p) end let normalize (p : float) low high = if p > high then high else if p < low then low else p let process_word (db, res) w = try let (g, b) = Hashtbl.find db.s_freq w in if word_count_in w res < !Config.max_repetitions then begin let g = 2 * g in let pgood = float g /. float db.s_num_good and pbad = float b /. float db.s_num_spam in let p = normalize (pbad /. (pgood +. pbad)) !Config.low_freq_limit !Config.high_freq_limit in add_word w p res end with Not_found -> () let process_words ctx txt = Wordsplit.iter (process_word ctx) txt let process_msg ctx m = iter_message (process_words ctx) m let bayes_rule res = let probs = List.map snd (Array.to_list res) in let prod = List.fold_left ( *. ) 1.0 probs and cprod = List.fold_left ( *. ) 1.0 (List.map (fun x -> 1.0 -. x) probs) in prod /. (prod +. cprod) type rank = { spam_prob: float; num_meaningful: int; explanation: string } let rank_message db msg = let res = Array.make !Config.num_words_retained ("", 0.5) in process_msg (db, res) msg; let p = bayes_rule res in let meaningful = ref 0 in while !meaningful < Array.length res && fst res.(!meaningful) <> "" do incr meaningful done; let summary = Buffer.create 200 in for i = 0 to !meaningful - 1 do let (w, p) = res.(i) in Printf.bprintf summary "%s:%02d " w (truncate (p *. 100.0)) done; { spam_prob = p; num_meaningful = !meaningful; explanation = Buffer.contents summary } spamoracle-1.4/rankmsg.mli0100644004340400511750000000202607532373136015263 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) (* *) (***********************************************************************) (* $Id: rankmsg.mli,v 1.2 2002/08/26 09:35:26 xleroy Exp $ *) (** Message ranking *) type rank = { spam_prob: float; num_meaningful: int; explanation: string } val rank_message: Database.short -> Mail.message -> rank spamoracle-1.4/spamoracle.10100644004340400511750000004033007737253476015341 0ustar xleroyformel.TH SPAMORACLE 1 .SH NAME spamoracle \- a spam classification tool .SH SYNOPSIS .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B mark [ .I mailbox ... ] .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B add .RB [ \-v ] .B \-spam .I spambox ... .B \-good .I goodbox ... .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B test .RB [ \-min .IR prob ] .RB [ \-max .IR prob ] [ .I mailbox ... ] .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B stat [ .I mailbox ... ] .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B list .I regexp ... .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B backup .B > .I backupfile .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B restore .B < .I backupfile .B spamoracle .RB [ \-config .IR conf ] .RB [ \-f .IR database ] .B words [ .I mailbox ... ] .SH DESCRIPTION SpamOracle is a tool to help detect and filter away "spam" (unsolicited commercial e-mail). It proceeds by statistical analysis of the words that appear in the e-mail, comparing the frequencies of words with those found in a user-provided corpus of known spam and known legitimate e-mail. The classification algorithm is based on Bayes' formula, and is described in Paul Graham's paper, .IR "A plan for spam" , .BR http://www.paulgraham.com/spam.html . This program is designed to work in conjunction with .BR procmail (1). The result of the analysis is output as an additional message header .B "X-Spam:" followed by .BR "yes" , .B "no" or .BR "unknown" , plus additional details. A procmail rule can then test this .B "X-Spam:" header and deliver the e-mail to the appropriate mailbox. In addition, SpamOracle also analyses MIME attachments, extracting relevant information such as MIME type, character encoding and attached file name, and summarizing them in an additional .B "X-Attachments:" header. This allows procmail to easily reject e-mails containing suspicious attachments, e.g. Windows executables which often indicate a virus. .SH "REQUIREMENTS AND LIMITATIONS" To use SpamOracle, your mail must be delivered to a Unix machine on which you have a shell account. This machine must have .BR procmail (1) (see .BR http://www.procmail.org/ ) installed. Your .B "~/.forward" file must be set up to run all incoming e-mail through .BR procmail (1). If your mail server supports the POP or IMAP protocols, you can also use .BR fetchmail (1) to fetch your mail from the server and have it delivered to your local machine. To provide the corpus of messages from which SpamOracle "learns", an archive of about 1000 of your e-mails is needed. The archive must be manually or semi-automatically split into known spams and known good messages. Mis-classified messages in the corpus (e.g. spams mistakenly stored among the good messages) will decrease the efficiency of the classification. The archive must be in Unix mailbox format, or in "one message per file" format (a la MH). Other formats, such as Emacs' Babyl, are not supported. The notion of "word" used by SpamOracle is slanted towards Western European languages, i.e. the ISO Latin-1 and Latin-9 character sets. Preliminary support for JIS-encoded Japanese can be selected at compile-time. SpamOracle will not work well if you receive many legitimate e-mails written in other character sets, such as Chinese or Korean sets. .SH INITIALIZATION To build the database of word frequencies from the corpus, do: .RS .ft 3 .nf .sp rm ~/.spamoracle.db spamoracle add -v -good goodmails -spam spammails .ft .LP .RE .fi By default, the database is stored in the file .B ".spamoracle.db" in your home directory. This can be overriden with the .B -f option: .BI "spamoracle -f mydatabase add" " ..." The .B \-v option prints progress information during the processing of the corpus. This assumes that the good, non-spam messages from the corpus are stored in the file .BR goodmails , and the known spam messages in the file .BR spammails . You can also fetch corpus messages from several files, and/or process them via several invocations of SpamOracle: .RS .ft 3 .nf .sp spamoracle add -good goodmails1 ... goodmailsN spamoracle add -spam spammails1 ... spammailsP .ft .LP .RE .fi .SH "TESTING THE DATABASE" To check that the database was built correctly, and familiarize yourself with the statistical analysis performed by SpamOracle, invoke the "test" mode on the mailboxes that you just used for building the corpus: .RS .ft 3 .nf .sp spamoracle test goodmails | more spamoracle test spammails | more .ft .LP .RE .fi For each message in the given mailboxes, you'll see a summary like this: .RS .ft 3 .nf .sp From: bbo Subject: Check This Out Score: 1.00 -- 15 Details: refid:98 $$$$:98 surfing:98 asp:95 click:93 cable:92 instantly:90 https:88 internet:87 www:86 U4:85 isn't:14 month:81 com:75 surf:75 Attachments: cset="GB2312" type="application/octet-stream" name="Guangwen4.zip" File: inbox/314 .ft .LP .RE .fi The first two lines are just the .B From: and .B Subject: fields of the original message. The .B "Score:" line summarizes the result of the analysis. The first number (between 0.0 and 1.0) is the probability that the message is actually spam --- or, equivalently, the degree of similarity of the message with the spam messages in the corpus. The second number (an integer between 0 and 15) is the number of "interesting" words found in the message. "Interesting" words are those that occur at least 5 times in the corpus. In the example, we have 15 interesting words (the maximum) and a score of 1.00, indicating a spam with high certainty. The .B "Details:" line provides an explanation of the score. It lists the 15 most interesting words found in the message, that is, the 15 interesting words whose probability of denoting a spam is farthest away from the neutral 0.5. Each word is given with its individual score, written as a percentage (between 01 and 99) rather than as a probability so as to save space. Here, we see a number of very "spammish" words such as .B "$$$$" or .BR "click" , with probability 0.98 and 0.93 respectively, and a few "innocent" words such as .B "isn't" (probability 0.14). The .B "U4" word with probability 0.85 is actually a pseudo-word representing a 4-letter word all in uppercase -- something spammers are fond of. The .B "Attachments:" line summarizes some information about MIME attachments for this message. Here, we have one attachment of type .BR "application/octect-stream" , file name .BR "Guangwen4.zip" , and character set .B "GB2312" (an encoding for Chinese). The .B "File:" line shows the file that is being tested. Normally, when running .BR "spamoracle test goodmails" , most messages should come out with low score (0.2 or less), and when running .BR "spamoracle test spammails" , most messages should come out with a high score (0.8 or more). If not, your corpus isn't very good, or not well classified into spam and non-spam. To quickly see the outliers, you can reduce the interval of scores for which message summaries are displayed, as follows: .RS .ft 3 .nf .sp spamoracle test -min 0.2 goodmails | more # Shows only good mails with score >= 0.2 spamoracle test -max 0.8 spammails | more # Shows only spam mails with score <= 0.8 .ft .LP .RE .fi Now, for a more challenging test, take a mailbox that contains unfiltered e-mails, i.e. a mixture of spam and legitimate e-mails, and run it through SpamOracle: .RS .ft 3 .nf .sp spamoracle test mymailbox | less .ft .LP .RE .fi Marvel at how well the oracle recognizes spam from the rest! If the result isn't that marvelous to you, keep in mind that certain spams are just too short to be recognized (not enough significant words). Also, perhaps your corpus was too small, or not well categorized... .SH "MARKING AND FILTERING INCOMING E-MAIL" Once the database is built, you're ready to run incoming e-mails through SpamOracle. The command .B spamoracle mark reads one e-mail from standard input, and copies it to standard output, with two headers inserted: .B "X-Spam:" and .BR "X-Attachments:" . The .B "X-Spam:" header has one the following formats: .B " X-Spam: yes; .IB score ; .I details or .B " X-Spam: no; .IB score ; .I details or .B " X-Spam: unknown; .IB score ; .I details The .I score and .I details are as described for .BR "spamoracle test" . The .BR "yes" / "no" / "unknown" tag synthesizes the results of the analysis: .B "yes" means that the score is >= 0.8 and at least 5 interesting words were found; .B "no" means that the score is <= 0.2 and at least 5 interesting words were found; .B "unknown" is returned otherwise. The .B "unknown" case generally occurs for very short messages, where not enough interesting words were found. The .B "X-Attachments:" header contains the same information as the .B "Attachments:" output of .BR "spamoracle test" , that is, a summary of the message attachments. To process automatically your incoming e-mail through SpamOracle and act upon the results of the analysis, just insert the following "recipes" in the file ~/.procmailrc: .RS .ft 3 .nf .sp :0fw | /usr/local/bin/spamoracle mark :0 * ^X-Spam: yes; spambox .ft .LP .RE .fi What these cryptic commands mean is: - Run every mail through the .B "spamoracle mark" command. (If spamoracle wasn't installed in /usr/local/bin, adjust the path as necessary.) This adds two headers to the message: .B "X-Spam:" and .BR "X-Attachments:" , describing the results of the spam analysis and the attachment analysis. - If we have an .B "X-Spam: yes" header, deliver the message to the file .B "spambox" rather than to your regular mailbox. Presumably, you'll read .B "spambox" once in a while, but less often than your regular mailbox. Daring users can put .B "/dev/null" instead of .B "spambox" to just throw away the message, but please don't do that until you've used SpamOracle for a while and are happy with the results. SpamOracle's false positive rate (i.e. legitimate mails classified as spam) is low (0.1%) but not null. So, better save the presumed spams somewhere, and scan them quickly from time to time. If you'd like to enjoy a bit of attachment-based filtering, here are some procmail rules for that: .RS .ft 3 .nf .sp :0 * ^X-Attachments:.*name=".*\\.(pif|scr|exe|bat|com)" spambox :0 * ^X-Attachments:.*type="audio/(x-wav|x-midi) spambox :0 * ^(Content-type:.*|X-Attachments:.*cset="|^Subject:.*=\\?)(ks_c|gb2312|iso-2|euc-|big5|windows-1251) spambox .ft .LP .RE .fi The first rule treats as spam every mail that has a Windows executable as attachment. These mails are typically sent by viruses. The second rule does the same with attachments of type x-wav or x-midi. I never normally receive music by e-mail, however some popular e-mail viruses seem fond of these attachment types. The third rule treats as spam every mail that uses character encodings corresponding to Korean, Chinese, Japanese, and Cyrillic. .SH "UPDATING THE DATABASE" At any time, you can add more known spams or known legitimate messages to the database by using the .B "spamoracle add" command. For instance, if you find a spam message that was not classified as such, run it through .BR "spamoracle add -spam" , so that SpamOracle can learn from its mistake. (Without additional arguments, this command will read a single message from standard input and record it as spam.) Under .BR mutt (1) for instance, just highlight the spam message and type .RS .ft 3 .nf .sp |spamoracle add -spam .ft .LP .RE .fi Similarly, if you find a legitimate message while checking your spam box, run it through .BR "spamoracle add -good" . Another option is to collect more known spams or more known good messages into mailbox files, and once in a while do .B spamoracle add -good new_good_mails or .BR "spamoracle add -spam new_spam_mails" . .SH "QUERYING THE DATABASE" For your edification and entertainment, the contents of the database can be queried by regular expressions. The .BI "spamoracle list " regexp command lists all words in the database that match .I regexp (an Emacs-style regular expression), along with their number of occurrences in spam mail and in good mail. For instance: .RS .ft 3 .nf .sp spamoracle list '.*' # show all words -- big list! spamoracle list 'sex.*' spamoracle list 'linux.*' .ft .LP .RE .fi .SH "DATABASE BACKUPS" The database used by SpamOracle is stored in a compact, binary format that is not humanly readable. Moreover, this format is subject to change in later versions of SpamOracle. To facilitate backups and upgrades, the database contents can also be manipulated in a portable, text format. The .B "spamoracle backup" command dumps the contents of the database to standard output, in a textual, portable format. The .B "spamoracle restore" command reads such a dump from standard input and rebuilds the database with this data. The recommended procedure for upgrading to a newer version of SpamOracle is: .RS .ft 3 .nf .sp # Before the upgrade: spamoracle backup > backupfile # Upgrade SpamOracle # Restore the database spamoracle restore < backupfile .ft .LP .RE .fi .SH "CONFIGURING FILTERING PARAMETERS" Many of the parameters that govern message classification can be configured via a configuration file. By default, the configuration is read from the file .B ".spamoracle.conf" in the user's home directory. A different configuration file can be specified on the command line using the .B -config option: .BI "spamoracle -config myconfigfile" " ..." The list of configurable parameters and the format of the configuration file are described in .BR spamoracle.conf (5). All parameters have reasonable defaults, but you can try to improve the quality of classification further by tweaking them. To determine the impact of your changes, use either the .B test or .B stat commands to .BR spamoracle . The .B spamoracle stat command prints a one-line summary of how many spam, non-spam, and unknown messages were found in the mailboxes given as arguments. .SH "TECHNICAL DETAILS" SpamOracle's notion of "word" is any run of 3 to 12 of the following characters: letters, single quotes, and dashes (-). If support for non-English european languages was compiled in, word characters also include the relevant accented letters for the languages in question. All words are mapped to lowercase, and accented letters are mapped to the corresponding non-accented letters. A run of 3 to 12 of the following characters also constitutes a word: digits, dots, commas, and dollar, Euro and percent signs. In addition, a run of three or more uppercase letters generates a pseudo-word .BI U n where .I n is the length of the run. Similarly, a run of three or more non-ASCII characters (code >= 128) generates a pseudo-word .BI W n where .I n is the length of the run. For instance, the following text: .RS .ft 3 .nf .sp SUMMER in English is written "été" in French ¹²³ .ft .LP .RE .fi is processed into the following words, assuming French support was selected at compile-time: .RS .ft 3 .nf .sp U5 summer english written ete french W3 .ft .LP .RE .fi and if French support was not selected: .RS .ft 3 .nf .sp U5 summer english written french W3 .ft .LP .RE .fi To see the words that are extracted from a message, issue the .B spamoracle words command. It reads either a single message from standard input, or all messages from the mailbox files given as arguments, decomposes the messages into words and prints the words. .SH RANDOM NOTES The database file can be compressed with .BR gzip (1) to save disk space, at the expense of slower .B spamoracle operations. If the database file specified with the .B -f option has the extension .BR .gz , .B spamoracle will automatically uncompress it on start-up, and re-compress it after updates. If your mail is stored in MH format, you may run into "command line too long" errors while trying to process a lot of small files with the .B spamoracle add command, e.g. when doing .br .B spamoracle add -good archives/*/* -spam spam/* .br Instead, do something like: .br .B find archives -type f -print | xargs spamoracle add -good .br .B find spam -type f -print | xargs spamoracle add -spam .SH AUTHOR Xavier Leroy .SH "SEE ALSO" .BR spamoracle.conf (5); .BR procmail (1); .BR fetchmail (1) .B http://cristal.inria.fr/~xleroy/software/ (SpamOracle distribution site) .B http://www.paulgraham.com/spam.html (Paul Graham's seminal paper) spamoracle-1.4/spamoracle.conf.50100644004340400511750000001435107724406743016266 0ustar xleroyformel.TH SPAMORACLE.CONF 5 .SH NAME spamoracle.conf \- SpamOracle configuration file format .SH DESCRIPTION The .B spamoracle.conf file is a configuration file governing the operation of the .BR spamoracle (1) e-mail classification tool. By default, the configuration file is searched in .IB $HOME /.spamoracle.conf but an alternate location can be specified using the .B -config flag to .BR spamoracle (1). .B Important note: most of the configuration parameters should not be modified lightly, as this may result in completely wrong e-mail classification. Familiarity with Graham's filtering algorithm, as described in the paper referenced at the end of this page, is required to really understand the effect of the parameters. .SH SYNTAX The .B spamoracle.conf file is composed of lines of the form .I variable .B = .IR value . Lines starting with a hash sign (#) are treated as comments and ignored. Blank lines are ignored. Depending on the type of the variable (see the list of variables below), the .I value part is of the following forms: .TP .I string A sequence of characters. Blanks (spaces, tabs) at the beginning and the end of the string are ignored. Alternatively, the string can be enclosed in double quotes ("), in which case spaces are not trimmed. Inside quoted strings, blackslashes (\) and double quotes (") must be escaped with a backslash, as in \\ or \" .TP .I boolean Either .BR on, .BR yes, .BR true, or .B 1 to activate the boolean option, or .BR off, .BR no, .BR false, or .B 0 to deactivate it. .TP .I integer A decimal integer .TP .I float A decimal floating-point number. .TP .I regexp A regular expression in .BR emacs (1) syntax. The repetition operators are .BR * , .BR + , and .BR ? . Alternation is written .B \e| and grouping is written .BR \e( ... \e) . Character classes are written between brackets .BR [ ... ] as usual. A single dot denotes any character except newline. Regular expressions are case-insensitive. .SH CONFIGURABLE PARAMETERS .TP .B database_file (type .IR string, default value .IB $HOME /.spamoracle.db ) .br The location of the file that contains the database of word frequencies used by .BR spamoracle (1). .TP .B html_retain_tags (type .IR boolean, default value .BR false ) .br In HTML-formatted e-mails and attachments, the names of HTML tags are normally not treated as words and are ignored for the word frequency calculations. If the .B html_retain_tags parameter is set to .BR true , HTML tags (such as .B img or .BR bold ) are treated as words and included in the computation of word frequencies. .TP .B html_tag_attributes (type .IR regexp , default value .br .BR a/href\e|img/src\e|img/alt\e|frame/src\e|font/face\e|font/color ) .br This regular expression matches pairs of HTML tags and HTML attributes written as .IB tag / attribute. When scanning HTML-formatted e-mails and attachments, attributes to HTML tags are normally ignored, unless the tag/attribute pair matches the regular expression .BR html_tag_attributes . If the tag/attribute pair matches this regexp, the value of the attribute (for instance, the URL for the .BR a / href attribute) is scanned for words. .TP .B mail_headers (type .IR regexp , default value .BR from:\e|subject: ) .br A regular expression determining which headers of an e-mail message are scanned for words. .TP .B spam_header (type .IR string , default value .BR X-Spam ) .br The name of the header that .B spamoracle mark adds to incoming e-mail messages, with the results of the spam/non-spam classification. .TP .B attachments_header (type .IR string , default value .BR X-Attachments ) .br The name of the header that .B spamoracle mark adds to incoming e-mail messages, with the one-line summary of attachment types, names and character sets. The generation of this header can be turned off with the .B summarize_attachment parameter. .TP .B summarize_attachment (type .IR boolean , default value .BR true ) .br If this parameter is set, .B spamoracle mark generates a one-line summary of the attachments of the incoming messages, and inserts this summary in the message headers. Setting this parameter to .B false disables the generation of this extra header. .TP .B num_meaningful_words (type .IR integer , default value .BR 15 ) .br Maximal number of "meaningful" words that are retained for computing the spam probability. During mail analysis, .B spamoracle extracts all words of the message, and retains those whose spam frequency (frequency of occurrence in spam messages) is closest to 1 or to 0. At most .B num_meaningful_words such "meaningful" words are retained. .TP .B max_repetitions (type .IR integer , default value .BR 2 ) .br Maximum number of times a given word can occur in the set of "meaningful" words retained for computing the spam probability. The default value of 2 means that at most 2 occurrences of the same word will be retained. .TP .B low_freq_limit (type .IR float , default value .BR 0.01 ) .TP .B high_freq_limit (type .IR float , default value .BR 0.99 ) .br The spam frequency of a word is computed as the number of occurrences in spam divided by number of occurrences in all messages. This ratio is then clipped to the interval [ .BR low_freq_limit , .B high_freq_limit ], so that words that are extremely rare or extremely common in spam do not bias the probability computation too much. The default values of 0.01 and 0.99 are adequate for a corpus of a few thousand e-mails. For larger corpora (e.g. 10000 e-mails), the values 0.001 and 0.999 may give better results. .TP .B min_meaningful_words (type .IR integer , default value .BR 5 ) .br Minimum number of "meaningful" words below which .B spamoracle mark refuses to classify the e-mail and outputs "unknown" status. This happens with very short e-mails, or e-mails that consist exclusively of links and pictures. .TP .B good_mail_prob (type .IR float , default value .BR 0.2 ) .br Spam probability below which the e-mail is classified as non-spam. .TP .B spam_mail_prob (type .IR float , default value .BR 0.8 ) .br Spam probability above which the e-mail is classified as spam. Messages whose probability falls between .B good_mail_prob and .B spam_mail_prob are classified as "unknown". .SH AUTHOR Xavier Leroy .SH "SEE ALSO" .BR spamoracle (1) .B http://www.paulgraham.com/spam.html (Paul Graham's seminal paper) spamoracle-1.4/wordsplit.mli0100644004340400511750000000136307531433166015652 0ustar xleroyformel(** Decompose a string into words. *) val iter: (string -> unit) -> string -> unit (** [iter fn txt] applies [fn] to each word in [txt]. A word is a sequence of 3 to 12 word characters. Word characters include letters, numbers, French accented letters, dashes, single quotes, dollar and percent signs. Words are converted to lowercase and accents on letters are removed before they are passed to [fn]. In addition, for each run of 3 or more uppercase letters, [fn] is applied to ["Ux"] where [x] is the length of the run, e.g. ["U8"] for 8 uppercase letters in a row. Similarly, for each run of 3 or more characters above 127, [fn] is applied to ["Wx"] where [x] is the length of the run. *) spamoracle-1.4/wordsplit.mlp0100644004340400511750000001211707724424415015661 0ustar xleroyformel(***********************************************************************) (* *) (* SpamOracle -- a Bayesian spam filter *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. This file is distributed under the terms of the *) (* GNU Public License version 2. *) (* *) (***********************************************************************) (* $Id: wordsplit.mlp,v 1.8 2003/08/31 16:57:49 xleroy Exp $ *) (* Decompose a string into words. *) { (* Map uppercase to lowercase. Remove ISO Latin 1 accents. *) let tbl = "\ \000\001\002\003\004\005\006\007\008\t\n\011\012\013\014\015\ \016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031 \ !\"#$%&'()*+,-./\ 0123456789:;<=>?\ @abcdefghijklmno\ pqrstuvwxyz[\\]^_\ `abcdefghijklmno\ pqrstuvwxyz{|}~\127\ \128\129\130\131\132\133\134\135\136\137\138\139\140\141\142\143\ \144\145\146\147\148\149\150\151\152\153\154\155\156\157\158\159\ \160¡¢£¤¥¦§¨©ª«¬­®¯\ °±²³´µ¶·¸¹º»¼½¾¿\ aaaaaaeceeeeiiii\ Ðnooooo×ouuuuypß\ aaaaaaeceeeeiiii\ ðnooooo÷ouuuuypy" let normalize s = for i = 0 to String.length s - 1 do s.[i] <- tbl.[Char.code s.[i]] done let all_uppercase s = try for i = 0 to String.length s - 1 do let c = s.[i] in if (c >= 'A' && c <= 'Z') || (c >= '\192' && c <= '\214') || (c >= '\216' && c <= '\222') then () else raise Exit done; true with Exit -> false let adjust s n = if String.length s <= n then s else String.sub s 0 n let unstretch s = let t = String.create ((String.length s + 1) / 2) in for i = 0 to String.length t - 1 do t.[i] <- s.[i + i] done; t } let letter = [ 'a'-'z' 'A'-'Z' #ifdef FRENCH 'À' 'Â' 'Ç' 'È' 'É' 'Ê' 'Ë' 'Î' 'Ï' 'Ô' 'Ù' 'Û' 'Ü' 'à' 'â' 'ç' 'è' 'é' 'ê' 'ë' 'î' 'ï' 'ô' 'ù' 'û' 'ü' 'ÿ' #endif #ifdef SPANISH 'Á' 'É' 'Í' 'Ó' 'Ú' 'Ü' 'Ñ' 'á' 'é' 'í' 'ó' 'ú' 'ü' 'ñ' #endif #ifdef ITALIAN 'É' 'Í' 'Ú' 'À' 'È' 'Ò' 'é' 'í' 'ú' 'à' 'è' 'ò' #endif #ifdef GERMAN 'Ä' 'Ö' 'Ü' 'ä' 'ö' 'ü' 'ß' #endif #ifdef PORTUGUESE 'À' 'Á' 'Â' 'Ã' 'Ç' 'É' 'Ê' 'Í' 'Ó' 'Ô' 'Õ' 'Ú' 'Ü' 'à' 'á' 'â' 'ã' 'ç' 'é' 'ê' 'í' 'ó' 'ô' 'õ' 'ú' 'ü' #endif ] let word_constituent = letter | ['-' '\''] let stretched_word = letter ' ' letter ' ' letter ' ' letter (' ' letter) * | letter '-' letter '-' letter '-' letter ('-' letter) * | letter '_' letter '_' letter '_' letter ('_' letter) * | letter '.' letter '.' letter '.' letter ('.' letter) * | letter '*' letter '*' letter '*' letter ('*' letter) * | letter '\'' letter '\'' letter '\'' letter ('\'' letter) * let numeric = ['0'-'9' '.' ',' '$' '%' '\164' (* Euro *)] let weird_character = ['\127' - '\255'] #ifdef JAPANESE let jis_snd = ['\032'-'\127'] let kanji = ['\048'-'\116'] jis_snd let katakana = (['\037'] jis_snd | "\033\060") let jis_char = ['\032'-'\116'] jis_snd let jis_in = "\027$" '(' ? ['@' 'B'] let jis_out = "\027(" ['B' 'J' 'H'] #endif rule split = parse stretched_word { fun action -> let s = unstretch (Lexing.lexeme lexbuf) in if all_uppercase s then action ("U" ^ string_of_int (String.length s)); if String.length s <= 12 then (normalize s; action s); split lexbuf action } | word_constituent word_constituent word_constituent word_constituent * { fun action -> let s = Lexing.lexeme lexbuf in if all_uppercase s then action ("U" ^ string_of_int (String.length s)); if String.length s <= 12 then (normalize s; action s); split lexbuf action } | numeric numeric numeric numeric * { fun action -> let s = Lexing.lexeme lexbuf in if String.length s <= 8 then (normalize s; action s); split lexbuf action } #ifdef JAPANESE | jis_in { split_jis lexbuf } #endif | weird_character weird_character weird_character weird_character * { fun action -> let s = Lexing.lexeme lexbuf in action ("W" ^ string_of_int (String.length s)); split lexbuf action } | eof { fun action -> () } | _ { split lexbuf } #ifdef JAPANESE and split_jis = parse kanji kanji * { fun action -> let s = Lexing.lexeme lexbuf in action ("\027$B" ^ adjust s 8 ^ "\027(B"); split_jis lexbuf action } | katakana katakana katakana katakana * { fun action -> let s = Lexing.lexeme lexbuf in action ("\027$B" ^ adjust s 12 ^ "\027(B"); split_jis lexbuf action } | jis_char { split_jis lexbuf } | eof { fun action -> () } | jis_out { split lexbuf } | _ { split lexbuf } #endif { let iter action txt = split (Lexing.from_string txt) action }