#!/usr/bin/perl -w -T
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use strict;
use bytes;

use Getopt::Long;
use Pod::Usage;
use File::Spec;

use vars qw(
  $spamtest %opt $isspam $forget $messagecount $learnedcount $messagelimit
  $synconly $learnprob @targets $bayes_override_path
);

my $PREFIX          = '@@PREFIX@@';             # substituted at 'make' time
my $DEF_RULES_DIR   = '@@DEF_RULES_DIR@@';      # substituted at 'make' time
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@';    # substituted at 'make' time

use lib '@@INSTALLSITELIB@@';                   # substituted at 'make' time

BEGIN {
  # Locate locally installed SA libraries *without* using FindBin, which generates
  # warnings and causes more trouble than its worth.  We don't need to be too
  # smart about this BTW.
  my @bin = File::Spec->splitpath($0);
  my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1]) : $bin[1])  # /home/jm/foo -> /home/jm
            || File::Spec->curdir;                                 # foo          -> .

  # check to make sure it wasn't just installed in the normal way.
  # note that ./lib/Mail/SpamAssassin.pm takes precedence, for
  # building SpamAssassin on a machine where an old version is installed.
  if (-e $bin.'/lib/Mail/SpamAssassin.pm'
        || !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm')
  {
    # Firstly, are we running "make test" in the "t" dir?  the test files
    # *need* to use 'blib', so that 'use bytes' is removed for pre-5.6 perls
    # beforehand by the preproc.  However, ./spamassassin does not, as the
    # preproc will have stripped out the "use rule files from cwd" code from
    # Mail::SpamAssassin.  So we want to use blib just for the t scripts.
    if ( $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm' ) {
      unshift(@INC, '../blib/lib');
    }
    else {
      # These are common paths where the SA libs might be found.
      foreach (qw(lib ../lib/site_perl ../lib/spamassassin ../share/spamassassin/lib)) {
        my $dir = File::Spec->catdir($bin, split('/', $_));
        if(-f File::Spec->catfile($dir, "Mail", "SpamAssassin.pm")) {
          unshift(@INC, $dir);
          last;
        }
      }
    }
  }
}

use Mail::SpamAssassin;
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin::Message;
use Mail::SpamAssassin::PerMsgLearner;

###########################################################################

# used to be CmdLearn::cmd_run() ...

# Defaults for the options that are later tested for truth rather than
# definedness.
%opt = (
  'force-expire' => 0,
  'use-ignores'  => 0,
  'nosync'       => 0,
);

Getopt::Long::Configure(
  qw(bundling no_getopt_compat permute no_auto_abbrev no_ignore_case)
);

# Non-option arguments are routed through target() (the '<>' handler), so
# each one is tagged with whatever --ham/--spam/--mbox/--mbx setting has been
# seen on the command line up to that point.
GetOptions(
  'forget'      => \$forget,
  'ham|nonspam' => sub { $isspam = 0; },
  'spam'        => sub { $isspam = 1; },
  'sync'        => \$synconly,
  'rebuild'     => sub { $synconly = 1; warn "The --rebuild option has been deprecated. Please use --sync instead.\n" },

  'username|u=s'                            => \$opt{'username'},
  'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
  'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
  'siteconfigpath=s'                        => \$opt{'siteconfigpath'},

  'folders|f=s'          => \$opt{'folders'},
  'force-expire|expire'  => \$opt{'force-expire'},
  'local|L'              => \$opt{'local'},
  'no-sync|nosync'       => \$opt{'nosync'},
  'showdots'             => \$opt{'showdots'},
  'use-ignores'          => \$opt{'use-ignores'},
  'no-rebuild|norebuild' => sub { $opt{'nosync'} = 1; warn "The --no-rebuild option has been deprecated. Please use --no-sync instead.\n" },

  'learnprob=f' => \$opt{'learnprob'},
  'randseed=i'  => \$opt{'randseed'},
  'stopafter=i' => \$opt{'stopafter'},

  'debug-level|D:s' => \$opt{'debug-level'},
  'help|h|?'        => \$opt{'help'},
  'version|V'       => \$opt{'version'},

  'dump:s'    => \$opt{'dump'},
  'import'    => \$opt{'import'},

  'backup'    => \$opt{'backup'},
  'clear'     => \$opt{'clear'},
  'restore=s' => \$opt{'restore'},

  'dir'    => sub { $opt{'old_format'} = 'dir'; },
  'file'   => sub { $opt{'old_format'} = 'file'; },
  'mbox'   => sub { $opt{'format'}     = 'mbox'; },
  'mbx'    => sub { $opt{'format'}     = 'mbx'; },
  'single' => sub { $opt{'old_format'} = 'single'; },

  'db|dbpath=s' => \$bayes_override_path,
  're|regexp=s' => \$opt{'regexp'},

  '<>' => \&target,
  )
  or usage( 0, "Unknown option!" );

if ( defined $opt{'help'} ) {
  usage( 0, "For more information read the manual page" );
}
if ( defined $opt{'version'} ) {
  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  exit 0;
}

# --force-expire implies a sync-only run.
if ( $opt{'force-expire'} ) {
  $synconly = 1;
}

# At least one mode of operation has to be selected.
if ( !defined $isspam
  && !defined $synconly
  && !defined $forget
  && !defined $opt{'dump'}
  && !defined $opt{'import'}
  && !defined $opt{'clear'}
  && !defined $opt{'backup'}
  && !defined $opt{'restore'}
  && !defined $opt{'folders'} )
{
  usage( 0,
"Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
  );
}

# We need to make sure the journal syncs pre-forget...
if ( defined $forget && $opt{'nosync'} ) {
  $opt{'nosync'} = 0;
  warn
"sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
}

if ( defined $opt{'old_format'} ) {
  #Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or --single.
  #Convert it to the new behavior:
  if ( $opt{'old_format'} eq 'single' ) {
    push ( @ARGV, '-' );
  }
}

# create the tester factory
$spamtest = Mail::SpamAssassin->new(
  {
    rules_filename      => $opt{'configpath'},
    site_rules_filename => $opt{'siteconfigpath'},
    userprefs_filename  => $opt{'prefspath'},
    username            => $opt{'username'},
    debug               => defined( $opt{'debug-level'} ),
    local_tests_only    => 1,
    dont_copy_prefs     => 1,
    PREFIX              => $PREFIX,
    DEF_RULES_DIR       => $DEF_RULES_DIR,
    LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
  }
);

$spamtest->init(1);

# kluge to support old check_bayes_db operation
if ( defined $bayes_override_path ) {
  # Add a default prefix if the path is a directory
  if ( -d $bayes_override_path ) {
    $bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' );
  }

  # init() above ties to the db r/o and leaves it that way
  # so we need to untie before dumping (it'll reopen)
  $spamtest->finish_learner();
  $spamtest->{conf}->{bayes_path} = $bayes_override_path;
}

# --dump: print the database contents and exit.
if ( defined $opt{'dump'} ) {
  my ( $magic, $toks );

  if ( $opt{'dump'} eq 'all' || $opt{'dump'} eq '' ) {    # show us all tokens!
    ( $magic, $toks ) = ( 1, 1 );
  }
  elsif ( $opt{'dump'} eq 'magic' ) {    # show us magic tokens only
    ( $magic, $toks ) = ( 1, 0 );
  }
  elsif ( $opt{'dump'} eq 'data' ) {     # show us data tokens only
    ( $magic, $toks ) = ( 0, 1 );
  }
  else {                                 # unknown option
    warn "Unknown dump option '" . $opt{'dump'} . "'\n";
    $spamtest->finish_learner();
    exit 1;
  }

  if (!$spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'}) ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes dump returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  exit 0;
}

# --import: the exit status is the negation of perform_upgrade()'s result.
if ( defined $opt{'import'} ) {
  my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
  $spamtest->finish_learner();
  exit( !$ret );
}

if (defined $opt{'clear'}) {
  unless ($spamtest->{bayes_scanner}->{store}->clear_database()) {
    $spamtest->finish_learner();
    die "ERROR: Bayes clear returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  exit 0;
}

if (defined $opt{'backup'}) {
  unless ($spamtest->{bayes_scanner}->{store}->backup_database()) {
    $spamtest->finish_learner();
    die "ERROR: Bayes backup returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  exit 0;
}

if (defined $opt{'restore'}) {

  my $filename = $opt{'restore'};

  unless ($filename) {
    $spamtest->finish_learner();
    die "ERROR: You must specify a filename to restore.\n";
  }

  unless ($spamtest->{bayes_scanner}->{store}->restore_database($filename, $opt{'showdots'})) {
    $spamtest->finish_learner();
    die "ERROR: Bayes restore returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  exit 0;
}

if ( !$spamtest->{conf}->{use_bayes} ) {
  warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
  exit 1;
}

$spamtest->init_learner(
  {
    force_expire      => $opt{'force-expire'},
    learn_to_journal  => $opt{'nosync'},
    wait_for_lock     => 1,
    caller_will_untie => 1
  }
);

$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};

# --sync / --force-expire: rebuild the caches and exit without learning.
if ($synconly) {
  $spamtest->rebuild_learner_caches(
    {
      verbose  => 1,
      showdots => $opt{'showdots'}
    }
  );
  $spamtest->finish_learner();
  exit 0;
}

$messagelimit = $opt{'stopafter'};
$learnprob    = $opt{'learnprob'};

if ( defined $opt{'randseed'} ) {
  srand( $opt{'randseed'} );
}

# sync the journal first if we're going to go r/w so we make sure to
# learn everything before doing anything else.
#
if ( !$opt{nosync} ) {
  $spamtest->rebuild_learner_caches();
}

# run this lot in an eval block, so we can catch die's and clear
# up the dbs.
eval {
  $SIG{INT}  = \&killed;
  $SIG{TERM} = \&killed;

  if ( $opt{folders} ) {
    # Three-argument open on a lexical handle: the file name is taken
    # literally, so it can never be interpreted as an open mode or a pipe.
    open( my $folders_fh, '<', $opt{folders} )
      or die "sa-learn: cannot open folders file '$opt{folders}': $!\n";

    while ( my $line = <$folders_fh> ) {
      chomp $line;
      next unless ($line);

      if ( $line =~ /^(?:ham|spam):/ ) {
        # Already carries its own class prefix; pass it through untouched.
        push ( @targets, $line );
      }
      else {
        target($line);
      }
    }
    close($folders_fh);
  }

  # add leftover args as targets
  foreach (@ARGV) { target($_); }

  #No arguments means they want stdin:
  if ( $#targets < 0 ) {
    target('-');
  }

  # mbox and mbx doesn't deal with STDIN, so make a temp file if they want STDIN.
  # do it here since they may specify "-" on the commandline
  #
  my $tempfile;
  if ( $targets[0] =~ /:mbo?x:-$/ ) {
    my $handle;

    local $/ = undef;    # go into slurp mode
    ( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
    print {$handle} <STDIN>;
    close $handle;

    # re-aim the targets at the tempfile instead of STDIN
    $targets[0] =~ s/:-$/:$tempfile/;
  }

  my $iter = Mail::SpamAssassin::ArchiveIterator->new(
    {
      'opt_j'   => 0,
      'opt_n'   => 1,
      'opt_all' => 1,
    }
  );

  $iter->set_functions( \&wanted, sub { } );
  $messagecount = 0;
  $learnedcount = 0;

  eval { $iter->run(@targets); };

  # Capture the error straight away so that nothing executed between here
  # and the check below can overwrite it.
  my $run_error = $@;

  print STDERR "\n" if ( $opt{showdots} );
  print
"Learned from $learnedcount message(s) ($messagecount message(s) examined).\n";

  # If we needed to make a tempfile, go delete it.
  if ( defined $tempfile ) {
    unlink $tempfile;
  }

  # HITLIMIT is how wanted() stops the run for --stopafter; it is not a
  # failure.  Anything else is re-thrown to the outer eval.
  if ($run_error) { die $run_error unless ( $run_error =~ /HITLIMIT/ ); }
};

if ($@) {
  my $failure = $@;
  $spamtest->finish_learner();
  die $failure;
}

$spamtest->finish_learner();
exit 0;

###########################################################################

# Signal handler for INT and TERM: releases the learner, then dies so the
# enclosing eval unwinds.
sub killed {
  $spamtest->finish_learner();
  die "interrupted";
}

# Queues one message source in @targets as "class:format:location".
# The class is "spam" when $isspam is true and "ham" otherwise; the format
# is $opt{'format'} when set, else "detect".
sub target {
  my ($target) = @_;

  my $class  = ( $isspam ? "spam" : "ham" );
  my $format = ( defined( $opt{'format'} ) ? $opt{'format'} : "detect" );

  push ( @targets, "$class:$format:$target" );
}

###########################################################################

# Per-message callback registered with the ArchiveIterator.
#   $class   - "s" marks the message as spam; anything else is ham
#   $id      - unused here
#   $time    - unused here
#   $dataref - message data, handed to parse()
# Dies with 'HITLIMIT' to stop the run, or with an ERROR message when
# did_learn() returns undef.
sub wanted {
  my ( $class, $id, $time, $dataref ) = @_;

  my $spam = $class eq "s" ? 1 : 0;

  # --learnprob: skip the message unless the random draw comes out as 0.
  if ( defined($learnprob) ) {
    if ( int( rand( 1 / $learnprob ) ) != 0 ) {
      print STDERR '_' if ( $opt{showdots} );
      return;
    }
  }

  # --stopafter: checked before learning, so the run ends on the first
  # message seen after the learned count has exceeded the limit.
  if ( defined($messagelimit) && $learnedcount > $messagelimit ) {
    die 'HITLIMIT';
  }

  $messagecount++;
  my $ma = Mail::SpamAssassin->parse($dataref);

  # Messages that already carry an X-Spam-Checker-Version header are
  # re-parsed with the SpamAssassin markup removed.
  if ( $ma->get_header("X-Spam-Checker-Version") ) {
    my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
    $ma->finish();
    $ma = $new_ma;
  }

  my $status = $spamtest->learn( $ma, undef, $spam, $forget );
  my $learned = $status->did_learn();

  if ( !defined $learned ) {    # undef=learning unavailable
    die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
  }
  elsif ( $learned == 1 ) {     # 1=message was learned.  0=message wasn't learned
    $learnedcount++;
  }

  # Do cleanup ...
  $status->finish();
  undef $status;

  $ma->finish();
  undef $ma;

  print STDERR '.' if ( $opt{showdots} );
}

###########################################################################

# Prints the version banner, then hands over to pod2usage() with the given
# verbosity and message and an exit value of 64.
sub usage {
  my ( $verbose, $message ) = @_;
  my $ver = Mail::SpamAssassin::Version();
  print "SpamAssassin version $ver\n";
  pod2usage( -verbose => $verbose, -message => $message, -exitval => 64 );
}

# ---------------------------------------------------------------------------

=head1 NAME

sa-learn - train SpamAssassin's Bayesian classifier

=head1 SYNOPSIS

B<sa-learn> [options] [file]...

B<sa-learn> [options] --dump [ all | data | magic ]

Options:

 --ham                             Learn messages as ham (non-spam)
 --spam                            Learn messages as spam
 --forget                          Forget a message
 --use-ignores                     Use bayes_ignore_from and bayes_ignore_to
 --sync                            Synchronize the database and the journal if needed
 --force-expire                    Force a database sync and expiry run
 --dbpath <path>                   Allows commandline override (in bayes_path form)
                                   for where to read the Bayes DB from
 --dump [all|data|magic]           Display the contents of the Bayes database
                                   Takes optional argument for what to display
  --regexp <re>                    For dump only, specifies which tokens to
                                   dump based on a regular expression.
 -f file, --folders=file           Read list of files/directories from file
 --dir                             Ignored; historical compatibility
 --file                            Ignored; historical compatibility
 --mbox                            Input sources are in mbox format
 --mbx                             Input sources are in mbx format
 --showdots                        Show progress using dots
 --no-sync                         Skip synchronizing the database and journal
                                   after learning
 -L, --local                       Operate locally, no network accesses
 --import                          Migrate data from older version/non DB_File
                                   based databases
 --clear                           Wipe out existing database
 --backup                          Backup, to STDOUT, existing database
 --restore <filename>              Restore a database from filename
 -u username, --username=username
                                   Override username taken from the runtime
                                   environment
 -C path, --configpath=path, --config-file=path
                                   Path to standard configuration dir
 -p prefs, --prefspath=file, --prefs-file=file
                                   Set user preferences file
 --siteconfigpath=path             Path for site configs
                                   (def: /etc/mail/spamassassin)
 -D, --debug-level                 Print debugging messages
 -V, --version                     Print version
 -h, --help                        Print usage message

=head1 DESCRIPTION

Given a typical selection of your incoming mail classified as spam or ham
(non-spam), this tool will feed each mail to SpamAssassin, allowing it
to 'learn' what signs are likely to mean spam, and which are likely to
mean ham.

Simply run this command once for each of your mail folders, and it will
''learn'' from the mail therein.

Note that I<globbing> in the mail folder names is supported;
in other words, listing a folder name as C<*> will scan every folder
that matches.

SpamAssassin remembers which mail messages it has learnt already, and will not
re-learn those messages again, unless you use the B<--forget> option. Messages
learnt as spam will have SpamAssassin markup removed, on the fly.

If you make a mistake and scan a mail as ham when it is spam, or vice
versa, simply rerun this command with the correct classification, and the
mistake will be corrected. SpamAssassin will automatically 'forget' the
previous indications.

=head1 OPTIONS

=over 4

=item B<--ham>

Learn the input message(s) as ham. If you have previously learnt any of the
messages as spam, SpamAssassin will forget them first, then re-learn them as
ham. Alternatively, if you have previously learnt them as ham, it'll skip them
this time around. If the messages have already been filtered through
SpamAssassin, the learner will ignore any modifications SpamAssassin may have
made.

=item B<--spam>

Learn the input message(s) as spam. If you have previously learnt any of the
messages as ham, SpamAssassin will forget them first, then re-learn them as
spam. Alternatively, if you have previously learnt them as spam, it'll skip
them this time around. If the messages have already been filtered through
SpamAssassin, the learner will ignore any modifications SpamAssassin may have
made.

=item B<--folders>=I<filename>, B<-f> I<filename>

sa-learn will read in the list of folders from the specified file, one folder
per line in the file. If the folder is prefixed with C<ham:type:> or
C<spam:type:>, sa-learn will learn that folder appropriately, otherwise the
folders will be assumed to be of the type specified by B<--ham> or B<--spam>.

=item B<--use-ignores>

Don't learn the message if a from address matches configuration file
item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>.
The option might be used when learning from a large file of messages from
which the hammy spam messages or spammy ham messages have not been removed.

=item B<--sync>

Synchronize the journal and databases. Upon successfully syncing the
database with the entries in the journal, the journal file is removed.

=item B<--force-expire>

Forces an expiry attempt, regardless of whether it may be necessary
or not. Note: This doesn't mean any tokens will actually expire.
Please see the EXPIRATION section below.

Note: C<--force-expire> also causes the journal data to be synchronized
into the Bayes databases.

=item B<--forget>

Forget a given message previously learnt.

=item B<--dbpath>

Allows a commandline override of the I<bayes_path> configuration option.

=item B<--dump> I