sa-learn-cyrus-0.3.5/0000755000175000017500000000000011657051715012464 5ustar hjbhjbsa-learn-cyrus-0.3.5/INSTALL0000644000175000017500000000107311657051414013512 0ustar hjbhjbINSTALL - sa-learn-cyrus (1) Copy sa-learn-cyrus to directory within root's path, e.g. /usr/local/sbin (2) Copy the sample configuration file sa-learn-cyrus.conf. A good place may be the configuration directory of SpamAssassin, e.g. /etc/spamassassin (3) Customize the configuration file according to your needs. [The following steps are optional] (4) make-doc.sh (start it from this directory) will create the documentation files in ./doc/ (5) Copy the documentation files into the appropriate directories on your system. Good luck! hjb :-? sa-learn-cyrus-0.3.5/make-doc.sh0000755000175000017500000000135011657051414014476 0ustar hjbhjb#!/bin/sh # # make documentation (manpage) from pod # # hjb -- 2008-03-22 name=sa-learn-cyrus source=./$name version=`$source --version` man_section=8 if [ ! -x $source ]; then echo Cannot find $source echo Start this script in the same directory where $name is located. exit 1 fi echo making documentation for $name-$version # text documentation text_manpage=doc/$name.txt echo manpage as plain text: $text_manpage $source --man-text > $text_manpage # html documentation html_manpage=doc/$name.html echo manpage in html format: $html_manpage $source --man-html > $html_manpage # manpage manpage=doc/$name.$man_section echo manpage: $manpage $source --man-manpage=$man_section > $manpage gzip=`which gzip` $gzip -f $manpage sa-learn-cyrus-0.3.5/sa-learn-cyrus0000755000175000017500000012362711657051414015266 0ustar hjbhjb#!/usr/bin/perl # # sa-learn-cyrus feeds spam and non-spam (ham) messages to sa-learn. # It's main purpose is to train SA's bayes database with spam/ham # messages sorted by the mailbox owner into special subfolders. 
#
# Copyright (C) 2004-2011 Hans-Juergen Beie
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the Artistic License 2.0 or the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the license,
# or (at your option) any later version.
#
# -------------------------------------------------------------

my $me      = 'sa-learn-cyrus';
my $VERSION = '0.3.5';
my $lupdate = '2011-11-02';
my $author  = 'hjb';
my $header  = "$me-$VERSION ($author -- $lupdate)";

my $basename = `basename $0`;
chomp($basename);

# -------------------------------------------------------------

use strict qw(vars);
use Getopt::Long;
use Pod::Usage;
use File::Spec;
use File::Temp;

# -------------------------------------------------------------

my $my_full_name = File::Spec->rel2abs($0);
my ($vol, $dir, $name) = File::Spec->splitpath($my_full_name);
my $my_path = File::Spec->catpath($vol, $dir);
$my_path =~ s/(.*)\/$/$1/;

# -------------------------------------------------------------
# Default configuration
#
# Keys of the form 'section:param' are overridden by the corresponding
# 'param = value' entries of section [section] in the configuration file.

my %conf = (
    # - base -
    'basename'      => $basename,
    'pid'           => $$,
    'me'            => $me,
    'version'       => $VERSION,    # read by make_txtdoc() / make_manpage()
    'header'        => $header,
    'config_file'   => '/etc/spamassassin/sa-learn-cyrus.conf',
    'my_full_name'  => $my_full_name,
    'my_path'       => $my_path,

    # [global]
    'global:verbose'        => 1,
    'global:simulate'       => 'no',
    'global:tmp_dir'        => '',
    'global:tmp_file'       => '',
    'global:lock_file'      => "/var/lock/$me\.lock",
    'global:log_with_tag'   => 'yes',

    # [mailbox]
    'mailbox:include_list'      => '',
    'mailbox:include_regexp'    => '',
    'mailbox:exclude_list'      => '',
    'mailbox:exclude_regexp'    => '',
    'mailbox:spam_folder'       => 'Spam',
    'mailbox:remove_spam'       => 'no',
    'mailbox:ham_folder'        => 'NoSpam',
    'mailbox:remove_ham'        => 'no',

    # [sa]
    'sa:site_config_path'   => '/etc/spamassassin',
    'sa:prefs_file'         => '/etc/spamassassin/local.cf',
    'sa:learn_cmd'          => '/usr/bin/sa-learn',
    'sa:user'               => 'mail',
    'sa:group'              => 'mail',
    'sa:tokens'             => '_toks',
    'sa:debug'              => 'no',
    'sa:bayes_storage'      => 'berkely',
    'sa:fix_db_permissions' => 'yes',
    'sa:bayes_path'         => '~/.spamassassin/bayes',
    'sa:virtual_config_dir' => '',
    'sa:sync_once'          => 'no',

    # [imap]
    'imap:domains'          => '',
    'imap:base_dir'         => '/var/spool/cyrus/mail',
    'imap:initial_letter'   => 'yes',
    'imap:purge_cmd'        => '/usr/sbin/ipurge',
    'imap:user'             => 'cyrus',
    'imap:unixhierarchysep' => 'no',
);

# -------------------------------------------------------------
# exit codes
#
# bye() negates these values: negative entries are errors and become a
# positive exit status, everything else is treated as a regular termination.

my %EX = (
    'ex_OK'                 => 0,
    'ex_NoMailboxesFound'   => 1,   # nothing to do
    'ex_SyntaxError'        => -1,  # cmd line error
    'ex_NoConfigFile'       => -2,  # config_file not readable
    'ex_ConfigError'        => -3,  # error in config_file
    'ex_TmpDirNotFound'     => -4,  # tmp_dir not found
    'ex_MissingTmpDir'      => -5,  # tmp_dir not specified
    'ex_ProcLocked'         => -6,  # there is another sa-learn-cyrus running
    'ex_SAConfNotFound'     => -7,  # SA configuration not found
    'ex_PermissionDenied'   => -8,  # only root is allowed to execute this program
    'ex_LockFailed'         => -9,  # lock file could not be created
);

$| = 1; # flush output immediately

#
# parse cmd line
#
my $args = join(' ', @ARGV);
my %OPT = ();
Getopt::Long::Configure("bundling");
unless (
    GetOptions(\%OPT,
        'help|h',
        'man',
        'version|V',
        'verbose|v=i',
        'config|c=s',
        'simulate|s',
        'sa-debug|d',
        'imap-domains|D=s',
        'man-manpage=i',
        'man-html',
        'man-text',
    )
) {
    bye('ex_SyntaxError', "Error reading arguments: '$args', try --help for help");
}

my @user_names = @ARGV;

#
# check cmd options
#
if ( $OPT{'help'} ) {
    my $sections = "USAGE";
    pod2usage(
        {
            -message    => $header,
            -exitval    => 0,
            -verbose    => 99,
            -sections   => $sections,
            #-noperldoc => 1,
            -output     => \*STDOUT
        }
    );
    bye('ex_OK', '');
};

if ( $OPT{'man'} ) {
    my $sections = "NAME|USAGE|FUNCTION|DESCRIPTION|ARGUMENTS|OPTIONS|CONFIGURATION|FILES|SEE ALSO|PREREQUISITES|AUTHOR|COPYRIGHT AND LICENSE|DISCLAIMER|ACKNOWLEDGMENTS";
    pod2usage(
        {
            -exitval    => 0,
            -verbose    => 99,
            -sections   => $sections,
            -output     => \*STDOUT,
        }
    );
    bye('ex_OK', '');
};

if ( $OPT{'version'} ) {
    print "$VERSION\n";
    bye('ex_OK', '');
};

if ( $OPT{'man-manpage'} ) {
    make_manpage(\%conf, $OPT{'man-manpage'});
    bye('ex_OK', '');
};

if ( $OPT{'man-html'} ) {
    make_htmldoc(\%conf);
    bye('ex_OK', '');
};

if ( $OPT{'man-text'} ) {
    make_txtdoc(\%conf);
    bye('ex_OK', '');
};

if ( $OPT{'config'} ) {
    if ( -r $OPT{'config'} ) {
        $conf{'config_file'} = $OPT{'config'};
    }
    else {
        bye('ex_NoConfigFile', "Configuration file '$OPT{'config'}' not readable: $!");
    }
};

log_msg('info', $header);

bye('ex_PermissionDenied', "Only root can run $me") if $< != 0;

read_config(\%conf)
    or bye('ex_ConfigError', "Error reading config file '$conf{config_file}'");

# check bayes_storage
# (anchored at both ends: the value is compared with 'eq' further down)
$conf{'sa:bayes_storage'} = lc($conf{'sa:bayes_storage'});
unless ( $conf{'sa:bayes_storage'} =~ /^(?:berkely|sql)$/ ) {
    bye('ex_ConfigError', "Unknown value of option bayes_storage = $conf{'sa:bayes_storage'}. Known values are 'berkely' and 'sql'.");
}

# verbosity
# 'defined' rather than truth, so that '-v 0' overrides the config file
unless ( defined $OPT{'verbose'} ) {
    $OPT{'verbose'} = $conf{'global:verbose'} if defined $conf{'global:verbose'};
}
$OPT{'verbose'} = 0 unless defined $OPT{'verbose'};
log_msg('info', "Verbose level: $OPT{'verbose'}") if $OPT{'verbose'};

# mailboxes
if ( @user_names ) {
    $conf{'mailbox:include_list'} = join(' ', @user_names);
    # disable include/exclude settings from config file
    $conf{'mailbox:include_regexp'} = '';
    $conf{'mailbox:exclude_list'} = '';
    $conf{'mailbox:exclude_regexp'} = '.*';
}

# global:tmp_file (deprecated)
if ( $conf{'global:tmp_file'} ) {
    log_msg('warn', "Parameter 'global:tmp_file' is deprecated. Use 'global:tmp_dir' instead!");
    unless ( $conf{'global:tmp_dir'} ) {
        # take tmp_dir from tmp_file for backward compatibility
        my ($vol, $dir, $file) = File::Spec->splitpath($conf{'global:tmp_file'});
        $conf{'global:tmp_dir'} = File::Spec->catpath($vol, $dir, '');
        log_msg('warn', "Setting global:tmp_dir = $conf{'global:tmp_dir'}");
    }
}

# global:tmp_dir
if ( $conf{'global:tmp_dir'} ) {
    bye('ex_TmpDirNotFound', "Cannot find tmp_dir $conf{'global:tmp_dir'}")
        unless -d $conf{'global:tmp_dir'};
}
else {
    bye('ex_MissingTmpDir', "Missing tmp_dir.");
}

# sa:debug
if ( $OPT{'sa-debug'} ) {
    $conf{'sa:debug'} = 'yes';
    log_msg('info', "SA debug mode enabled");
}

if ( $OPT{'simulate'} ) {
    $conf{'global:simulate'} = 'yes';
}
else {
    $conf{'global:simulate'} = lc($conf{'global:simulate'});
}
log_msg('info', "Running in simulation mode") if $conf{'global:simulate'} eq 'yes';

# imap:domains
# GetOptions() stores the value under the primary option name, not under 'D'
if ( $OPT{'imap-domains'} ) {
    $conf{'imap:domains'} = $OPT{'imap-domains'};
}
log_msg('info', "Scanning mailboxes in domains '$conf{'imap:domains'}'")
    if ( $conf{'imap:domains'} ne '' ) && $OPT{'verbose'};

# check if we are locked
if ( my $pid = we_are_locked(\%conf) ) {
    bye('ex_ProcLocked', "There is another $me (pid = $pid) running.");
}
create_lock_file(\%conf);

# read SA configuration
if ( $conf{'sa:bayes_storage'} eq 'berkely' && !$conf{'sa:virtual_config_dir'} ) {
    read_sa_config(\%conf) or bye('ex_SAConfNotFound', "Check your configuration!");
}

if ( $conf{'sa:virtual_config_dir'} && $conf{'sa:fix_db_permissions'} =~ /^[yY]/ ) {
    log_msg('warn', 'virtual_config_dir is incompatible with fix_db_permissions. Automatically disabling fix_db_permissions.');
    $conf{'sa:fix_db_permissions'} = 'no';
}

my %mailboxes = find_mailboxes(\%conf);
unless ( keys %mailboxes ) {
    bye('ex_NoMailboxesFound', "No matching mailboxes found. There's nothing to do for me.");
}

my $simulate = ( $conf{'global:simulate'} eq 'yes' );
my $sync_once = ( $conf{'sa:sync_once'} =~ /^[yY]/ );
my $mails_learned = 0; # total count of mails found in the learn folders

foreach my $key (sort keys %mailboxes) {
    my $imap_mail_path = $mailboxes{$key};
    my ($domain, $user) = split(/:/, $key);
    my $user_domain = $user;
    $user_domain .= '@' . $domain unless $domain eq '';
    my $imap_unixhierarchysep = ( $conf{'imap:unixhierarchysep'} =~ /^[yY]/ ) ? "/" : ".";
    my $imap_mail_box = 'user' . $imap_unixhierarchysep . $user;

    unless ( -d $imap_mail_path ) {
        log_msg('info', "No mailbox '$user_domain' found in '$imap_mail_path'") if $OPT{'verbose'} > 0;
        next;
    }

    log_msg('info', "Mailbox '$user_domain'") if $OPT{'verbose'} > 0;

    for my $learn ('spam', 'ham') {
        my $learn_folder = $conf{"mailbox:$learn" . '_folder'};
        # if unixhierarchy is set, verify folder string [FM]
        $learn_folder =~ s/\./\//g if $conf{'imap:unixhierarchysep'} =~ /^[yY]/;
        # file system path: folder separators always map to '/'
        my $learn_path = $learn_folder;
        $learn_path =~ s/\./\//g;
        $learn_folder = "$imap_mail_box$imap_unixhierarchysep$learn_folder";
        $learn_path = $imap_mail_path . '/' . $learn_path;

        my $mail_cnt = 0;
        if ( -d $learn_path ) {
            my @mails = find_mails($learn_path);
            $mail_cnt = @mails;
            $mails_learned += $mail_cnt;
            log_msg('info', " $mail_cnt mails found in $learn folder '$learn_folder'") if $OPT{'verbose'} > 0;
        }
        else {
            log_msg('info', " No $learn folder '$learn_folder'") if $OPT{'verbose'} > 0;
        }
        next unless $mail_cnt;

        my $virtual_config_dir;
        if ( $conf{'sa:virtual_config_dir'} ) {
            $virtual_config_dir = $conf{'sa:virtual_config_dir'};
            $virtual_config_dir =~ s/%%/%/g;
            $virtual_config_dir =~ s/%l/$user/g;
            $virtual_config_dir =~ s/%d/$domain/g;
            $virtual_config_dir =~ s/%u/$user_domain/g;
        }

        log_msg('info', " Learning $learn from folder '$learn_folder' in path '$learn_path'.") if $OPT{'verbose'} > 0;

        # sa-learn command line
        my $learn_cmd = $conf{'sa:learn_cmd'};
        $learn_cmd .= " --debug" if $conf{'sa:debug'} =~ /^[yY]/;
        $learn_cmd .= " --siteconfigpath=$conf{'sa:site_config_path'}";
        $learn_cmd .= " --prefspath=$conf{'sa:prefs_file'}";
        $learn_cmd .= " --no-sync" if $sync_once;
        $learn_cmd .= ' --' . $learn;
        $learn_cmd .= " --dir $learn_path";
        $learn_cmd .= " --username='$user'" if $conf{'sa:virtual_config_dir'};
        $learn_cmd .= " --dbpath=$virtual_config_dir/bayes" if $conf{'sa:virtual_config_dir'};

        my ($result, $full_cmd) = run_and_log($learn_cmd, 'sa-learn');
        unless ( $result == 0 ) {
            log_msg('error', " Learning $learn using command '$full_cmd' failed: $result");
            next;
        }

        if ( $conf{"mailbox:remove_$learn"} =~ /^[yY]/ ) {
            # delete learned mails
            log_msg('info', " Purging learned $learn mails from folder '$learn_folder'") if $OPT{'verbose'} > 0;
            my $purge = "$conf{'imap:purge_cmd'} -f -b 0 $learn_folder";
            $purge .= '@' . $domain unless $domain eq '';
            my ($purge_result, $purge_cmd) = run_and_log(
                "su $conf{'imap:user'} -s /bin/sh -c '$purge'",
                'ipurge',
                qr/Permission denied/,
            );
            log_msg('error', " Purging learned $learn mails using command '$purge_cmd' failed: $purge_result")
                unless $purge_result == 0;
        }
    }
}

if ( ($conf{'sa:bayes_storage'} eq 'berkely') && ($conf{'sa:fix_db_permissions'} =~ /^[yY]/) ) {
    # set uid/gid of bayes tokens file
    # this may prevent permission problems for spamd
    my $tokens = $conf{'sa:bayes_path'} . $conf{'sa:tokens'};
    if ( -e $tokens ) {
        log_msg('info', "Tokens in '$tokens'") if $OPT{'verbose'} > 1;
        my $owner_group = "$conf{'sa:user'}:$conf{'sa:group'}";
        unless ( $owner_group eq ':' ) {
            # chown user:group tokens-file
            log_msg('info', "Changing ownership of '$tokens' to '$owner_group'") if $OPT{'verbose'} > 0;
            my ($result, $full_cmd) = run_and_log("chown $owner_group $tokens", 'chown');
            unless ( $result == 0 ) {
                log_msg('error', "Changing ownership of '$tokens' using command '$full_cmd' failed: $result");
            }
        }
    }
    else {
        log_msg('info', "No tokens '$tokens' found.") if $OPT{'verbose'} > 1;
    }
}

if ( $sync_once && $mails_learned ) {
    # sync database (the learn runs above were started with --no-sync)
    log_msg('info', "Synchronizing Bayes database...") if $OPT{'verbose'} > 0;
    my $sync_cmd = $conf{'sa:learn_cmd'};
    $sync_cmd .= " --debug" if $conf{'sa:debug'} =~ /^[yY]/;
    $sync_cmd .= " --sync";
    my ($result, $full_cmd) = run_and_log($sync_cmd, 'sa-learn');
    log_msg('error', "Synchronizing using command '$full_cmd' failed: $result") unless $result == 0;
}

bye('ex_OK', 'done.');

#---------------------------------------------------------------
#
# run a shell command and relay its output to the log
#
#   $cmd      shell command line, without output redirection
#   $tag      prefix for the relayed output lines, e.g. 'sa-learn'
#   $skip_re  optional regexp; matching output lines are not logged
#
# stdout and stderr of the command are captured in a temporary file.
# In simulation mode the command is only displayed, not executed.
#
# returns (status, command line as executed); status is the return
# value of system(), or 0 in simulation mode
#
sub run_and_log {
    my ($cmd, $tag, $skip_re) = @_;

    my ($tmp, $tmp_file) = new_temp_file(\%conf);
    my $full_cmd = "$cmd 1>$tmp_file 2>&1";
    log_msg('info', " Executing '$full_cmd'") if $OPT{'verbose'} > 2;

    my $result = 0;
    if ( $conf{'global:simulate'} eq 'yes' ) {
        log_msg('info', " Simulate: '$full_cmd'");
    }
    else {
        $result = system($full_cmd);
        if ( open(my $out, '<', $tmp_file) ) {
            while ( my $line = <$out> ) {
                chomp($line);
                next if defined $skip_re && $line =~ $skip_re;
                log_msg('info', " $tag> $line");
            }
            close($out);
        }
        else {
            log_msg('err', " Cannot read from '$tmp_file': $!");
        }
    }
    unlink($tmp_file);
    return ($result, $full_cmd);
}

#---------------------------------------------------------------
#
# terminate program
#
#   $exit_code  key of %EX
#   $msg        optional message to log before exiting
#
# Negative %EX values are errors; their absolute value becomes the exit
# status. A lock file created by this process is removed in any case.
#
sub bye {
    my ($exit_code, $msg) = @_;

    # an unknown key must not silently turn into exit status 0
    my $code = exists $EX{$exit_code} ? $EX{$exit_code} : -1;
    my $err = - $code;

    # 'lock_created' is only set if *we* created the lock, so the lock
    # of another running instance is never touched here
    if ( $conf{'lock_created'} ) {
        log_msg('info', "Removing lock file '$conf{'global:lock_file'}'") if $OPT{'verbose'} > 2;
        unlink($conf{'global:lock_file'});
        $conf{'lock_created'} = 0;
    }

    if ( $err > 0 ) {
        log_msg('error', "$msg") if $msg;
        log_msg('error', '*** aborted ***');
    }
    else {
        log_msg('info', "$msg") if $msg;
    }
    File::Temp::cleanup();
    exit $err;
}

#---------------------------------------------------------------
#
# log messages
#
# Levels matching 'err' or 'warn' go to STDERR, all others to STDOUT.
# Unless global:log_with_tag is disabled, each line is prefixed with
# date/time, program name and pid.
#
sub log_msg {
    my ($level, $msg) = @_;

    $level = lc($level);
    my $log_with_tag = $conf{'global:log_with_tag'} =~ /^[yY]/;
    my $label = iso_dts() . ' ' . $conf{'me'} . '[' . $conf{'pid'} . ']';

    if ( $level =~ /err|warn/ ) {
        if ( $log_with_tag ) {
            warn "$label $level: $msg\n";
        }
        else {
            warn "$level: $msg\n";
        }
    }
    else {
        if ( $log_with_tag ) {
            print "$label: $msg\n";
        }
        else {
            print "$msg\n";
        }
    }
}

#---------------------------------------------------------------
#
# Convert the current local time to a date/time string
# (YYYY-MM-DD hh:mm:ss)
#
sub iso_dts {
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime;
    return sprintf("%04d-%02d-%02d %02d:%02d:%02d",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
};

#---------------------------------------------------------------
#
# Trim leading and trailing white space
#
sub trim {
    my ($string) = @_;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
};

#---------------------------------------------------------------
#
# create lock file
#
# Writes our pid to global:lock_file and sets $conf->{'lock_created'}.
# No lock is created in simulation mode.
#
sub create_lock_file {
    my ($conf) = @_;

    $conf->{'lock_created'} = 0;
    return if $conf->{'global:simulate'} eq 'yes';

    my $lock_file = $conf->{'global:lock_file'};
    log_msg('info', "Creating lock file '$lock_file'") if $OPT{'verbose'} > 1;
    open(my $lf, '>', $lock_file)
        or bye('ex_LockFailed', "Cannot create lock file '$lock_file': $!");
    print {$lf} "$conf->{'pid'}\n";
    close($lf);
    $conf->{'lock_created'} = 1;
}

#---------------------------------------------------------------
#
# check if we are locked
#
# returns nothing if there is no lock file, otherwise the pid found in
# the lock file ('???' if it could not be read)
#
sub we_are_locked {
    my ($conf) = @_;

    my $pid = '???';
    my $lock_file = $conf->{'global:lock_file'};

    return unless -f $lock_file;

    #
    # we are locked, get the pid ...
    #
    if ( open(my $lf, '<', $lock_file) ) {
        while ( my $line = <$lf> ) {
            chomp($line);
            # keep '???' rather than a false value for blank lines
            $pid = $line if $line ne '';
        }
        close($lf);
    }
    else {
        log_msg('warn', "Cannot open lock file $lock_file: $!");
    }
    return $pid;
}

#---------------------------------------------------------------
#
# create new temp file object
#
# returns (File::Temp object, file name); the file is created in
# global:tmp_dir
#
sub new_temp_file {
    my ($conf) = @_;

    my $tmp = File::Temp->new(
        TEMPLATE => $conf->{'me'} . '-XXXXX',
        DIR      => $conf->{'global:tmp_dir'},
        SUFFIX   => '.' . $conf->{'pid'},
    );
    my $tmp_file = $tmp->filename;
    log_msg('info', "New tmp_file '$tmp_file'") if $OPT{'verbose'} > 2;
    return ($tmp, $tmp_file);
}

#---------------------------------------------------------------
#
# read config params from a Windows like ini file
#
# Each 'param = value' line of section [section] is stored in the hash
# passed in as 'section:param'.
#
# returns the hash ref if successful, otherwise returns nothing
#
sub read_config {
    my ($config) = @_;

    my $conf_file = $config->{config_file};

    open(my $cnf, '<', $conf_file) or do {
        log_msg('err', "Cannot open config file '$conf_file': $!");
        return;
    };

    log_msg('info', "Reading config file '$conf_file'") if $OPT{'verbose'} > 1;

    my $section = '';
    while ( my $raw = <$cnf> ) {
        chomp($raw);
        my $line = trim($raw);          # trim white space
        next if $line eq '';            # ignore empty lines
        next if $line =~ /^[#;]/;       # ignore comment lines

        my $sec = get_section($line);
        if ( $sec ) {
            # it's a section, something like '[foobar]'
            $section = $sec;
            log_msg('info', "[$section]") if $OPT{'verbose'} > 2;
            next;
        }
        next if $section eq '';

        # check for 'param = value' pairs; test the parameter itself,
        # the truth of a list assignment would also accept (undef)
        my ($param, $value) = get_param_value($line);
        next unless defined $param;

        # it's a line like 'param = value'
        $value = unquote($value);
        $config->{"$section:$param"} = $value;
        log_msg('info', " $section:$param >$value<") if $OPT{'verbose'} > 2;
    }
    close($cnf);
    return $config;
}

#---------------------------------------------------------------
#
# Get a section name, something like '[foobar]'?
# Section names may contain alphanumerical characters including '_' and '-'.
#
# returns the trimmed section name, or nothing if $string is not a
# section header
#
sub get_section {
    my ($string) = @_;
    if ( $string =~ /^\[(.+)\]$/ ) {
        return trim($1);
    }
    return;
};

#---------------------------------------------------------------
#
# Get a 'param = value' pair
#
# Only the first '=' separates parameter and value, so values may
# contain '=' themselves (e.g. regular expressions).
#
# returns ($param, $value), or an empty list if $string is not a pair
# with a non-empty value
#
sub get_param_value {
    my ($string) = @_;

    my ($param, $value) = split(/\s*=\s*/, $string, 2);
    if ( $param and defined $value and ($value ne '') ) {
        return ($param, $value);
    }
    # bare return: 'return undef' would be a one-element list for callers
    # that assign the result to ($param, $value)
    return;
};

#---------------------------------------------------------------
#
# unquote
#
# removes one pair of enclosing single or double quotes
#
sub unquote {
    my ($string) = @_;
    $string =~ s/^\'(.*)\'$/$1/;
    $string =~ s/^\"(.*)\"$/$1/;
    return $string;
};

#---------------------------------------------------------------
#
# read SA config
#
# Looks for 'bayes_path' in the file named by sa:prefs_file and stores
# its value in $conf->{'sa:bayes_path'}.
#
# returns 1 if bayes_path was found, otherwise nothing
#
sub read_sa_config {
    my ($conf) = @_;

    my $conf_file = $conf->{'sa:prefs_file'};
    open(my $sac, '<', $conf_file)
        or bye('ex_SAConfNotFound', "Cannot open '$conf_file': $!");

    log_msg('info', "Reading SA config '$conf_file'") if $OPT{'verbose'} > 0;

    my $found = 0;
    while ( my $raw = <$sac> ) {
        chomp($raw);
        my $line = trim($raw);
        next if $line eq '';
        next if $line =~ /^#/;
        my ($param, $value) = split(/\s+/, $line);
        if ( $param eq 'bayes_path' ) {
            $conf->{'sa:bayes_path'} = $value;
            log_msg('info', "Bayes path is '$value'") if $OPT{'verbose'} > 0;
            $found = 1;
            last;
        }
    }
    # single exit point, so the handle is closed on every path
    close($sac);
    return 1 if $found;

    log_msg('warn', "'bayes_path' not found in SA's config '$conf_file'. Assuming '$conf->{'sa:bayes_path'}'");
    return;
}

#---------------------------------------------------------------
#
# find mails in mailbox
#
# file_wanted() is the File::Find callback; it collects the full path
# names of all plain files except Cyrus' own 'cyrus.*' files in @MAILS.
#
my @MAILS;
sub file_wanted {
    my $file = $_;                          # file
    my $file_name = $File::Find::name;      # full path name
    return unless -f $file_name;            # plain files only
    return if $file =~ /^cyrus\./;          # no mail
    push(@MAILS, $file_name);
}

sub find_mails {
    use File::Find;
    my ($mbox_dir) = @_;
    @MAILS = ();
    find({ wanted => \&file_wanted, no_chdir => 0 }, $mbox_dir);
    return @MAILS;
}

#---------------------------------------------------------------
#
# search mailboxes in imap spool
#
# returns a hash 'domain:user' => path of the mailbox directory
#
sub find_mailboxes {
    my ($conf) = @_;

    my %mailbox;

    log_msg('info', "Searching for mailboxes in path $conf->{'imap:base_dir'} ...") if $OPT{'verbose'} > 1;

    # use domains?
    my @domains = ('');
    if ( $conf->{'imap:domains'} ne '' ) {
        # accept white space and/or commas as separators
        @domains = grep { $_ ne '' } split(/[\s,]+/, $conf->{'imap:domains'});
        log_msg('info', " domains to search in: '@domains'") if $OPT{'verbose'} > 1;
    }

    foreach my $domain (sort @domains) {
        my $base_dir = $conf->{'imap:base_dir'};
        $domain = lc($domain);
        unless ( $domain eq '' ) {
            $base_dir .= '/domain';
            $base_dir .= '/' . substr($domain, 0, 1) if $conf->{'imap:initial_letter'} =~ /^[yY]/;
            $base_dir .= '/' . $domain;
            log_msg('info', " searching in domain '$domain', path is '$base_dir'") if $OPT{'verbose'} > 2;
            unless ( -d $base_dir ) {
                log_msg('warn', " no such directory '$base_dir'");
                next;
            }
        }

        if ( $conf->{'imap:initial_letter'} =~ /^[yY]/ ) {
            for my $first_char ('a' .. 'z') {
                my $path = $base_dir . '/' . $first_char . '/user';
                log_msg('info', " searching in path '$path'") if $OPT{'verbose'} > 2;
                next unless (-d $path);
                parse_mailbox_dir($conf, $path, $domain, \%mailbox);
            }
        }
        else {
            # version 0.3.3: bugfix by Andreas Czerniak
            # my $path = $conf->{'imap:base_dir'} . '/user';
            my $path = $base_dir . '/user';
            parse_mailbox_dir($conf, $path, $domain, \%mailbox);
        }
    }

    my $cnt = keys %mailbox;
    log_msg('info', "$cnt mailboxes found.") if $OPT{'verbose'} > 1;

    return %mailbox;
}

#---------------------------------------------------------------
#
# parse mailbox directory
#
# Adds every sub-directory of $path that passes the include/exclude
# settings to the hash referenced by $mailbox ('domain:name' => path).
# A non-empty include_list takes precedence over all other settings.
#
# returns $mailbox, or nothing if $path cannot be read
#
sub parse_mailbox_dir {
    my ($conf, $path, $domain, $mailbox) = @_;

    # parse directory
    my @items;
    if ( opendir(my $dh, $path) ) {
        @items = readdir($dh);
        closedir($dh);
    }
    else {
        log_msg('error', " Cannot parse path '$path': $!");
        return;
    }

    #
    # search for mailbox directories
    #
    # split(' ', ...) skips leading and repeated white space, which would
    # otherwise produce empty names
    my @include_list = split(' ', $conf->{'mailbox:include_list'});
    my @exclude_list = split(' ', $conf->{'mailbox:exclude_list'});
    my $include_regexp = $conf->{'mailbox:include_regexp'};
    my $exclude_regexp = $conf->{'mailbox:exclude_regexp'};
    $include_regexp = '' if @include_list;  # explicit list: no regexp
    $exclude_regexp = '' if @exclude_list;  # explicit list: no regexp

    LOOP_DIR:
    foreach my $name (sort @items) {
        next if $name =~ /^\.{1,2}$/;       # ignore '.' and '..'
        next unless -d "$path/$name";       # must be a directory

        my $user_domain = $name;
        $user_domain .= '@' . $domain unless $domain eq '';

        # check include_list
        if ( @include_list ) {
            log_msg('info', " checking '$name' with include_list '@include_list'") if $OPT{'verbose'} > 2;
            foreach my $user (@include_list) {
                if ( $user eq $name ) {
                    log_msg('info', " '$user_domain' matches include_list '@include_list'") if $OPT{'verbose'} > 1;
                    $mailbox->{"$domain:$name"} = $path . '/' . $name;
                    next LOOP_DIR;
                }
            }
            log_msg('info', " '$user_domain' doesn't match include_list '@include_list'") if $OPT{'verbose'} > 2;
            next LOOP_DIR;
        }

        # check exclude_list
        if ( @exclude_list ) {
            log_msg('info', " checking '$user_domain' with exclude_list '@exclude_list'") if $OPT{'verbose'} > 2;
            foreach my $user (@exclude_list) {
                if ( $user eq $name ) {
                    log_msg('info', " ignored '$user_domain', matches exclude_list '@exclude_list'") if $OPT{'verbose'} > 1;
                    next LOOP_DIR;
                }
            }
        }

        # check include_regexp
        if ( $include_regexp ne '' ) {
            log_msg('info', " checking '$user_domain' with include_regexp '$include_regexp'") if $OPT{'verbose'} > 2;
            unless ( $name =~ /$include_regexp/ ) {
                log_msg('info', " ignored '$user_domain', doesn't match include_regexp '$include_regexp'") if $OPT{'verbose'} > 1;
                next LOOP_DIR;
            }
        }

        # check exclude_regexp
        if ( $exclude_regexp ne '' ) {
            log_msg('info', " checking '$user_domain' with exclude_regexp '$exclude_regexp'") if $OPT{'verbose'} > 2;
            if ( $name =~ /$exclude_regexp/ ) {
                log_msg('info', " ignored '$user_domain', matches exclude_regexp '$exclude_regexp'") if $OPT{'verbose'} > 1;
                next LOOP_DIR;
            }
        }

        log_msg('info', " found '$user_domain'") if $OPT{'verbose'} > 1;
        $mailbox->{"$domain:$name"} = $path . '/' . $name;
    }
    return $mailbox;
}

#---------------------------------------------------------------
#
# make text doc
#
# renders the POD of this script as plain text (to STDOUT)
#
sub make_txtdoc {
    use Pod::Text;
    my ($conf) = @_;
    # fall back to the script's $VERSION if the hash carries no version
    my $release = defined $conf->{'version'} ? $conf->{'version'} : $VERSION;
    my $parser = Pod::Text->new(release => $release, section => 8);
    $parser->parse_from_file($conf->{'my_full_name'});
}

#---------------------------------------------------------------
#
# make manpage
#
# renders the POD of this script as a man page of the given section
# (to STDOUT)
#
sub make_manpage {
    use Pod::Man;
    my ($conf, $section) = @_;
    # fall back to the script's $VERSION if the hash carries no version
    my $release = defined $conf->{'version'} ? $conf->{'version'} : $VERSION;
    my $parser = Pod::Man->new(release => $release, section => $section);
    $parser->parse_from_file($conf->{'my_full_name'});
}

#---------------------------------------------------------------
#
# make html doc
#
# renders the POD of this script as HTML (to STDOUT)
#
sub make_htmldoc {
    use Pod::Html;
    use Cwd ();
    my ($conf) = @_;
    my $tmp_dir = File::Spec->tmpdir();
    # seems only to work if current dir is the same as the source is located
    # (File::Spec->curdir() is just '.', so remember the real directory)
    my $cur_dir = Cwd::getcwd();
    chdir($conf->{'my_path'});
    pod2html($conf->{'me'},
        "--title=$conf->{'header'}",
        "--htmldir=$conf->{'my_path'}",
        "--backlink=Back to Top",
        '--index',
        "--cachedir=$tmp_dir",
        '--flush',
        "--infile=$conf->{'my_full_name'}"
    );
    chdir($cur_dir) if defined $cur_dir;
}

__END__

################################################################

=head1 NAME

sa-learn-cyrus - Train Spamassassin with spam/ham from user's imap mailboxes

=head1 USAGE

  sa-learn-cyrus [ options ] user-name(s)

  user-name(s)              One or more user/mailbox name(s).

  options:
    --help                  Prints a brief help message and exits.
    -h
    --man                   Prints the manual page and exits.
    --verbose level         Be verbose if level > 0
    -v level
    --config file           Use a configuration file other than the default
    -c file                 one.
    --sa-debug              Run sa-learn in debug mode.
    -d
    --simulate              Run in simulation mode (show commands only).
    -s
    --imap-domains domains  Search mailboxes in list of domains.
    -D domains

=head1 DESCRIPTION

B<sa-learn-cyrus> feeds spam and non-spam (ham) messages to Spamassassin's
database. Its main purpose is to train SA's bayes database with spam/ham
messages sorted by the mailbox owners into special subfolders.
It is intended to be used on small mail systems (e.g. home office) with a
single server-wide SA configuration.

Launching B<sa-learn-cyrus> at regular intervals (cron job) may improve SA's
hit rate considerably, provided that the users are well instructed what to
move to their ham/spam folders and what not.

=head1 FUNCTION

B<sa-learn-cyrus> scans local mail spools as used by Cyrus IMAPd for special
subfolders. These subfolders are supposed to contain mails which have been
classified as spam or ham by the mailbox owners.

Example: The users move spam mails which have not been tagged as spam by
SpamAssassin (false negatives) to a subfolder F<Spam>. Other mails, which may
be classified by SA as spam in the future because of certain characteristics,
are copied to a subfolder F<NoSpam>.

B<sa-learn-cyrus> feeds the content of these spam/ham folders to SA's Bayes
database using the B<sa-learn> tool which is shipped with the Spamassassin
package.

Afterwards these mails are deleted (optionally) by means of B<ipurge>, which
is a helper tool coming along with the Cyrus IMAPd package.

=head1 ARGUMENTS

B<sa-learn-cyrus> optionally takes a list of mailbox/user names as arguments:

    sa-learn-cyrus fred wilma fritz hjb

If not supplied, all mailboxes found will be handled.

=head1 OPTIONS

All options supplied on the command line will override corresponding
parameters given in the configuration file.

Please note that the basic parameters of sa-learn-cyrus have to be defined in
a configuration file. sa-learn-cyrus cannot be controlled solely by means of
command line options.

=over 4

=item B<--config file, -c file>

Use a configuration file other than the default one.

Always adapt the configuration file to your needs before using sa-learn-cyrus
on a live system. Otherwise you may lose data or corrupt your SA database!

=item B<--verbose level, -v level>

Specify level of verbosity. (Default = 0)

=item B<--sa-debug, -d>

Run sa-learn in debug mode. This may be useful to examine problems with
sa-learn.

=item B<--simulate, -s>

Run B<sa-learn-cyrus> in simulation mode.
This is useful for first tests after initial configuration or if problems are
encountered. In simulation mode B<sa-learn-cyrus> doesn't execute any system
commands nor does it touch any data. It just displays what it would do.

=item B<--imap-domains list-of-domains, -D list-of-domains>

If your Cyrus installation uses the "domain support" you may use this option
to tell what domains you want to be searched.

    --imap-domains "example.com another.org"

is equivalent to

    [imap]
    ...
    domains = example.com another.org
    ...

in the configuration file.

=back

=head1 CONFIGURATION

By default B<sa-learn-cyrus> expects its configuration file as
F</etc/spamassassin/sa-learn-cyrus.conf>. One has to change this setting in
the code, if another default file is wanted. A file other than the default
can always be chosen with the C<--config> option.

A sample configuration file is shipped with sa-learn-cyrus.

=head2 Format

The configuration file has a format as known from rsync or samba, which is
very similar to the format of Windows ini files.

The file consists of a sequence of sections. The beginning of each section is
designated with a section name, a word in square brackets, e.g. C<[global]>.
The section entries consist of parameters, which are key/value pairs each on
a single line. Key and value are separated by an equal sign like

    key = value

The value is a single word or a list of words, each of them representing a
number or a string. Words may be surrounded by any number of spaces for
better readability.

Empty lines and lines with a leading hash character C<#> are ignored.

=head2 Section [global]

The [global] section contains all global control parameters.

=over 4

=item B<tmp_dir>

B<sa-learn-cyrus> creates some temporary files during each run. This is the
directory where these files are created.

=item B<lock_file>

To avoid race conditions, B<sa-learn-cyrus> uses a simple file locking
mechanism. Each new sa-learn-cyrus process looks for this file before it
really does anything. If this file exists, the process exits with a warning,
assuming that another sa-learn-cyrus process is running.

=item B<verbose>

The level of verbosity.
Values range from 0 (low) to 3 (high). A reasonable level to start with is 1.

=item B<simulate>

B<sa-learn-cyrus> should be run in simulation mode (C<simulate = yes>) after
the first customization of the configuration to avoid loss of data or
corruption of SA's database in case of wrongly configured parameters.

=item B<log_with_tag>

Prepend the output (log) with a tag (date, time, pid). Set to C<no> to avoid
additional tagging when piped to syslog. Default is C<yes>.

=back

=head2 Section [mailbox]

Section [mailbox] contains all parameters to select the mailboxes, to specify
the special subfolders, and to define the actions to apply.

=over 4

=item B<include_list>

Only spam/ham mails of these mailboxes are fed to Spamassassin's database. If
this list is empty, all mailboxes will be used. C<include_list> may be used
instead of the list on the command line.

Example:

    include_list = fred wilma fritz hjb

=item B<include_regexp>

If include_list is empty, a regular expression given here is applied to all
mailbox names to select mailboxes. This parameter is ignored if include_list
is not empty.

Example: Include all mailboxes beginning with 'knf-'.

    include_regexp = ^knf-

=item B<exclude_list>

A list of mailboxes which will be excluded. If include_list is not empty,
this parameter is ignored.

=item B<exclude_regexp>

Mailbox names which match this regular expression are excluded from
processing.

Example: Ignore all mailboxes ending with '.beie'

    exclude_regexp = \.beie$

=item B<spam_folder>

The name of the special subfolder in each mailbox which contains spam. The
name should be a complete folder path relative to the root folder INBOX. The
Cyrus nomenclature is applied (same as with cyradm).

Example:

    spam_folder = Learn.Spam

This is a subfolder in a folder tree like this:

    INBOX
      +--Drafts
      +--Templates
      +--Sent
      +--Learn
      |    +--Ham
      |    +--Spam    <-- spam subfolder
      |

=item B<ham_folder>

The name of the special subfolder in each mailbox which contains ham. (Same
naming scheme as with C<spam_folder>, see above.)

=item B<remove_spam>

Are the spam messages in the C<spam_folder> to be removed after feeding them
to the SA database or not?

=item B<remove_ham> Are the ham messages in the C<ham_folder> to be removed after feeding them to the SA database or not? =back =head2 Section [sa] Spamassassin (SA) configuration items. =over 4 =item B<site_config_path> Path to system-wide SA preferences. Example: site_config_path = /etc/spamassassin =item B<bayes_storage> Bayes storage mechanism (berkely|sql) berkely: Berkeley DB (default) sql: SQL Database =item B<prefs_file> Path of the system-wide SA configuration file. Example: prefs_file = /etc/spamassassin/local.cf =item B<learn_cmd> Path to the sa-learn utility. Example: learn_cmd = /usr/bin/sa-learn =item B<fix_db_permissions> Should permissions of DB files be fixed? Ignored unless C<bayes_storage = berkely>. =item B<user> The user id SA runs with. Required if C<fix_db_permissions = yes>. Example: user = mail =item B<group> The group id SA runs with. Required if C<fix_db_permissions = yes>. Example: group = mail =item B<sync_once> Skip synchronization after every change of database, but sync once after all messages have been learned. May speed up learning from many folders. Default is C. =item B<virtual_config_dir> Use this if you use the C<--virtual-config-dir> option of C<spamd> (it needs to match exactly). See the C<spamd> man page for more information. =item B<debug> Run sa-learn in debug mode or not. C<debug = yes> may be useful to examine problems. =back =head2 Section [imap] The section [imap] contains the necessary configuration parameters to locate and manage the (Cyrus) IMAPd spool files. =over 4 =item B<base_dir> The root of the base directory of the IMAP spool (below that the mailboxes are located). =item B<initial_letter> If base_dir is divided in subdirectories named with the initial letters of mailbox names set C<initial_letter = yes> (default), otherwise choose no. Examples for joe's mailbox: /j/user/joe/ : initial_letter = yes /user/joe/ : initial_letter = no =item B<domains> If your Cyrus spool uses domain hierarchy supply a list of domains. If domain support is not used leave this entry empty. The C<initial_letter> option (see above) is applied to domains, too.
Example for mailboxes fritz@bar.org and joe@foo.com : The mail files within the Cyrus spool are located at /domain/b/bar.org/f/fritz /domain/f/foo.com/j/joe List the domains as domains = foo.com bar.org =item B<unixhierarchysep> Choose C<unixhierarchysep = yes> if Cyrus is configured to accept usernames like 'hans.mueller.somedomain.tld'. Otherwise set C<unixhierarchysep = no>. =item B<purge_cmd> The path to the Cyrus B<ipurge> utility for purging mail messages. Example: purge_cmd = /usr/sbin/ipurge =item B<user> The user Cyrus-IMAPd runs as. Example: user = cyrus =back =head1 FILES F</etc/spamassassin/sa-learn-cyrus.conf> =head1 SEE ALSO C, C, C, C, C, C The current version of this script is available at L =head1 PREREQUISITES B<sa-learn> (part of the SpamAssassin package), B<ipurge> (part of Cyrus IMAPd) =head1 AUTHOR Hans-Juergen Beie E<lt>hjb@pollux.franken.deE<gt> =head1 COPYRIGHT AND LICENSE Copyright 2004-2011 by Hans-Juergen Beie. This program is free software; you can redistribute it and/or modify it under the terms of the Artistic License 2.0 (L) or the GNU General Public License as published by the Free Software Foundation; either version 2 of the license (L), or (at your option) any later version. =head1 DISCLAIMER This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. =head1 ACKNOWLEDGMENTS Thanks to Robert Carnecky and Jan Hauke Rahm for testing and suggestions for the implementation of the domain support. David Caldwell contributed the virtual_config_dir feature. Some other contributors are listed in the CHANGELOG. Many thanks to them for their help and suggestions. =cut sa-learn-cyrus-0.3.5/doc/0000755000175000017500000000000011657051653013232 5ustar hjbhjbsa-learn-cyrus-0.3.5/doc/CHANGELOG0000644000175000017500000001176011657051413014443 0ustar hjbhjbChangelog -- sa-learn-cyrus Version 0.3.5 (2011-11-02) o Applied a patch from David Caldwell to use the --virtual-config-dir option of spamd. New parameter sa:virtual_config_dir o POD / man page updated.
Version 0.3.4 (2011-10-03) o To avoid learning from spam/ham directories containing no mails every directory is checked before being handed over to sa-learn. Learning on larger mail systems is now a lot faster. Thanks to Timo Veith for this suggestion. o Added support for Bayes storage in SQL database. See config parameter sa:bayes_storage = berkely|sql o sa:prefs_file is used to get the Berkeley DB path in case of sa:bayes_storage = berkely. A missing entry there will no longer cause an error. The default DB path is now '~/.spamassassin/bayes'. o New config parameter sa:fix_db_permissions to enable/disable fixing of file permissions in case of sa:bayes_storage = berkely. o Added option '-s /bin/sh' to ipurge in order to work even if imap:user has no valid shell. Thanks to Albert Siersema o Synchronization of the database may be performed only once after all mails have been treated by sa-learn. See new parameter sa:sync_once = yes|no Thanks to Timo Veith for this suggestion. o Bugfix: Spam/Ham folders more than two levels deep are now supported. Thanks to David Caldwell. o Prepending the output (log) with a tag (date, time, pid) can now be switched off to avoid additional tagging when piped to syslog. See parameter global:log_with_tag. o Some minor bug fixes. Version 0.3.3 (2008-06-18) o Bugfix: Base directory of imap spool corrected for the case where "domain support" is used without "initial letters". Thanks to Andreas Czerniak Version 0.3.2 (2008-04-26) o Bugfix: With imap:domains not empty (domain support activated) the parser didn't find all mailboxes having the same local part. o Log output now includes the domain names along with mailboxes.
Version 0.3.1 (2008-03-22) o Changed the routines to make documentation files, to make them usable on Debian/etch, which uses an ancient Pod::Text module Version 0.3.0 (2008-03-21) o Getopt::Long: added --long --options support o use Pod::Usage: to use man page and --help with one single documentation o added option --man to print man page o added documentation (POD) in the code o added options --man-text, --man-html, --man-manpage=section to create man page documentation files Version 0.3.0rc1 (2008-02-12) o temp files are now managed by File::Temp o use a lock file to avoid race conditions o command option -D for domain lists added Version 0.2.6 (2008-02-06) o Bugfixes in evaluation of parameters include_list, include_regexp, exclude_list, exclude_regexp Version 0.2.5 (2008-02-04) o Cyrus's domain support is now honored. To use this option, choose [imap] domains = one.domain.tld another.domain.tld Thanks to Robert Carnecky for suggestions and testing. o New option to simulate learning and purging of mails. This may be handy for testing the setup. This can be activated in the configuration file [global] simulate = no|yes or with the command option -s. Version 0.2.4 (2007-07-29) o Cyrus's 'unixhierarchysep' feature is now supported. To activate this option, choose [imap] unixhierarchysep = yes in the configuration file. Thanks to Franz Mueller for this contribution. o Copyright statement added Version 0.2.3 (2005-05-22) o Added a new option -d to run sa-learn in debug mode. There's also a configuration parameter for this purpose in section [sa] of the configuration file: debug = no|yes Version 0.2.2 (2004-12-15) o The function parse_dir (from module File::Listing) has to evaluate the output of the `ls -l` command, which in turn may depend on the system configuration. To avoid these problems, the parse_dir function has been replaced with classical opendir/closedir to search for mailbox files.
o There's a new configuration parameter initial_letter in the section [imap] of the configuration file. If the imap spool has subdivisions with the initial letters of the mailbox names (default) this should be set to initial_letter = yes Otherwise set initial_letter = no o syslog stuff (which had never worked) has been removed. Version 0.2.1 (2004-11-18) o First published version sa-learn-cyrus-0.3.5/sa-learn-cyrus.conf0000644000175000017500000000672011657051414016201 0ustar hjbhjb# Configuration for sa-learn-cyrus # # hjb -- 2011-11-02 # # ------------------------------------------------------- # global parameters # [global] # Directory to store output of sa-learn and ipurge temporarily tmp_dir = /tmp # To avoid race conditions, we use a lock file. lock_file = /var/lock/sa-learn-cyrus.lock # level of verbosity (0 .. 3)? verbose = 1 # Don't execute commands, show only what would be executed. # Change this to 'no' after testing. simulate = yes # Prepend log output with a tag (date, time, pid)? # Choose 'no' if you prefer to pipe the output to syslog # (default is 'yes') log_with_tag = yes # ------------------------------------------------------- # Mailbox # [mailbox] # List of mailboxes/users which will be considered. # If this list is empty all mailboxes will be searched. # include_list = '' # If include_list is empty, only mailboxes matching this pattern will be considered include_regexp = '.*' # List of mailboxes/users which will be ignored exclude_list = '' # If exclude_list is empty, mailboxes matching this pattern will be ignored exclude_regexp = '' # Spam folder relative to INBOX (cyrus nomenclature: e.g. 'junk.Spam') spam_folder = 'Learn.Spam' # Ham folder relative to INBOX (cyrus nomenclature: e.g.
'junk.Ham') ham_folder = 'Learn.Ham' # Remove spam after feeding it to SA remove_spam = yes # Remove ham after feeding it to SA remove_ham = yes # ------------------------------------------------------- # Spamassassin # [sa] # run sa-learn in debug mode (useful to examine problems) debug = no # Path with system-wide SA preferences site_config_path = /etc/spamassassin # Path to sa-learn learn_cmd = /usr/bin/sa-learn # Bayes storage mechanism (berkely|sql) # berkely: Berkeley DB (default) # sql: SQL Database bayes_storage = berkely # SA configuration file. # Used to get the Bayes database path if bayes_storage = berkely # Required to run sa-learn. prefs_file = /etc/spamassassin/local.cf # Should permissions of DB files be fixed? # Ignored unless bayes_storage = berkely fix_db_permissions = yes # SA user and group (required if fix_db_permissions = yes) user = mail group = mail # Skip synchronization after every change of database, but sync once # after all messages have been learned. # May speed up learning from many folders. sync_once = yes # Use this if you use the --virtual-config-dir option of spamd (it # needs to match exactly). See the spamd man page for more # information. virtual_config_dir = '' # ------------------------------------------------------- # IMAP # [imap] # Base directory of IMAP spool (below that mailboxes are located) base_dir = /var/spool/cyrus/mail # If base_dir has subdivisions with initial letters of mailbox names # set initial_letter = yes (default), otherwise choose no. # Example for joe's mailbox: # yes: /j/user/joe/ # no: /user/joe/ initial_letter = yes # If your cyrus spool uses domain hierarchy give a list of domains # Example for mailbox fritz@bar.org and joe@foo.com # /domain/b/bar.org/f/fritz # /domain/f/foo.com/j/joe # domains = foo.com bar.org # # If you don't use Cyrus's domain support leave the entry empty. # The initial_letter option (see above) is applied to domains, too.
domains = '' # Choose 'unixhierarchysep = yes' if Cyrus is configured to accept usernames # like 'hans.mueller.somedomain.tld' unixhierarchysep = no # imap command to purge mail messages purge_cmd = /usr/sbin/ipurge # Cyrus-IMAPd user user = cyrus