#!/usr/bin/perl -w -T

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use strict;
use bytes;

use Getopt::Long;
use Pod::Usage;
use File::Spec;

use vars qw(
  $spamtest %opt $isspam $forget
  $messagecount $learnedcount $messagelimit
  $progress $total_messages $init_results $start_time
  $synconly $learnprob @targets $bayes_override_path
);

my $PREFIX          = '@@PREFIX@@';             # substituted at 'make' time
my $DEF_RULES_DIR   = '@@DEF_RULES_DIR@@';      # substituted at 'make' time
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@';    # substituted at 'make' time

use lib '@@INSTALLSITELIB@@';                   # substituted at 'make' time

BEGIN {                          # see comments in "spamassassin.raw" for doco
  my @bin = File::Spec->splitpath($0);
  my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1]) : $bin[1])
            || File::Spec->curdir;

  if (-e $bin.'/lib/Mail/SpamAssassin.pm'
        || !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm' )
  {
    my $searchrelative;
    $searchrelative = 1;    # disabled during "make install": REMOVEFORINST
    if ($searchrelative && $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm')
    {
      unshift ( @INC, '../blib/lib' );
    } else {
      foreach ( qw(lib ../lib/site_perl
                ../lib/spamassassin ../share/spamassassin/lib))
      {
        my $dir = File::Spec->catdir( $bin, split ( '/', $_ ) );
        if ( -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" ) )
        { unshift ( @INC, $dir ); last; }
      }
    }
  }
}

use Mail::SpamAssassin;
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin::Message;
use Mail::SpamAssassin::PerMsgLearner;
use Mail::SpamAssassin::Util::Progress;

###########################################################################

# used to be CmdLearn::cmd_run() ...

%opt = (
  'force-expire' => 0,
  'use-ignores'  => 0,
  'nosync'       => 0,
  'cf'           => []
);

Getopt::Long::Configure(
  qw(bundling no_getopt_compat
  permute no_auto_abbrev no_ignore_case)
);

GetOptions(
  'forget'      => \$forget,
  'ham|nonspam' => sub { $isspam = 0; },
  'spam'        => sub { $isspam = 1; },
  'sync'        => \$synconly,
  'rebuild'     => sub { $synconly = 1; warn "The --rebuild option has been deprecated. Please use --sync instead.\n" },

  'username|u=s'    => \$opt{'username'},
  'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
  'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
  'siteconfigpath=s'                        => \$opt{'siteconfigpath'},
  'cf=s'                                    => \@{$opt{'cf'}},

  'folders|f=s'          => \$opt{'folders'},
  'force-expire|expire'  => \$opt{'force-expire'},
  'local|L'              => \$opt{'local'},
  'no-sync|nosync'       => \$opt{'nosync'},
  'showdots'             => \$opt{'showdots'},
  'progress'             => \$opt{'progress'},
  'use-ignores'          => \$opt{'use-ignores'},
  'no-rebuild|norebuild' => sub { $opt{'nosync'} = 1; warn "The --no-rebuild option has been deprecated. Please use --no-sync instead.\n" },

  'learnprob=f' => \$opt{'learnprob'},
  'randseed=i'  => \$opt{'randseed'},
  'stopafter=i' => \$opt{'stopafter'},

  'debug|debug-level|D:s' => \$opt{'debug'},
  'help|h|?'        => \$opt{'help'},
  'version|V'       => \$opt{'version'},

  'dump:s' => \$opt{'dump'},
  'import' => \$opt{'import'},

  'backup'    => \$opt{'backup'},
  'clear'     => \$opt{'clear'},
  'restore=s' => \$opt{'restore'},

  'dir'    => sub { $opt{'old_format'} = 'dir'; },
  'file'   => sub { $opt{'old_format'} = 'file'; },
  'mbox'   => sub { $opt{'format'}     = 'mbox'; },
  'mbx'    => sub { $opt{'format'}     = 'mbx'; },
  'single' => sub { $opt{'old_format'} = 'single'; },

  'db|dbpath=s' => \$bayes_override_path,
  're|regexp=s' => \$opt{'regexp'},

  # every non-option argument is turned into a target as it is parsed
  '<>' => \&target,
  )
  or usage( 0, "Unknown option!" );

if ( defined $opt{'help'} ) {
  usage( 0, "For more information read the manual page" );
}
if ( defined $opt{'version'} ) {
  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  exit 0;
}

# set debug areas, if any specified (only useful for command-line tools)
if (defined $opt{'debug'}) {
  $opt{'debug'} ||= 'all';
}

if ( $opt{'force-expire'} ) {
  $synconly = 1;
}

if ($opt{'showdots'} && $opt{'progress'}) {
  print "--showdots and --progress may not be used together, please select just one\n";
  exit 0;
}

if ( !defined $isspam
  && !defined $synconly
  && !defined $forget
  && !defined $opt{'dump'}
  && !defined $opt{'import'}
  && !defined $opt{'clear'}
  && !defined $opt{'backup'}
  && !defined $opt{'restore'}
  && !defined $opt{'folders'} )
{
  usage( 0,
"Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
  );
}

# We need to make sure the journal syncs pre-forget...
if ( defined $forget && $opt{'nosync'} ) {
  $opt{'nosync'} = 0;
  warn "sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
}

if ( defined $opt{'old_format'} ) {

  #Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or --single.
  #Convert it to the new behavior:
  if ( $opt{'old_format'} eq 'single' ) {
    push ( @ARGV, '-' );
  }
}

my $post_config = '';

# kluge to support old check_bayes_db operation
# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
# to override.  Just access the dbpath version via post_config_text.
if ( defined $bayes_override_path ) {
  # Add a default prefix if the path is a directory
  if ( -d $bayes_override_path ) {
    $bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' );
  }

  $post_config .= "bayes_path $bayes_override_path\n";
}

# These options require bayes_scanner, which requires "use_bayes 1", but
# that's not necessary for these commands.
if (defined $opt{'dump'} || defined $opt{'import'} || defined $opt{'clear'} ||
    defined $opt{'backup'} || defined $opt{'restore'}) {
  $post_config .= "use_bayes 1\n";
}

$post_config .= join("\n", @{$opt{'cf'}})."\n";

# create the tester factory
$spamtest = Mail::SpamAssassin->new(
  {
    rules_filename      => $opt{'configpath'},
    site_rules_filename => $opt{'siteconfigpath'},
    userprefs_filename  => $opt{'prefspath'},
    username            => $opt{'username'},
    debug               => $opt{'debug'},
    local_tests_only    => $opt{'local'},
    dont_copy_prefs     => 1,
    PREFIX              => $PREFIX,
    DEF_RULES_DIR       => $DEF_RULES_DIR,
    LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
    post_config_text    => $post_config,
  }
);

$spamtest->init(1);

if (Mail::SpamAssassin::Util::am_running_on_windows()) {
  binmode(STDIN);       # bug 4363
  binmode(STDOUT);
}

# --dump: print database contents and exit
if ( defined $opt{'dump'} ) {
  my ( $magic, $toks );

  if ( $opt{'dump'} eq 'all' || $opt{'dump'} eq '' ) {    # show us all tokens!
    ( $magic, $toks ) = ( 1, 1 );
  }
  elsif ( $opt{'dump'} eq 'magic' ) {    # show us magic tokens only
    ( $magic, $toks ) = ( 1, 0 );
  }
  elsif ( $opt{'dump'} eq 'data' ) {     # show us data tokens only
    ( $magic, $toks ) = ( 0, 1 );
  }
  else {                                 # unknown option
    warn "Unknown dump option '" . $opt{'dump'} . "'\n";
    $spamtest->finish_learner();
    exit 1;
  }

  if (!$spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'}) ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes dump returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  exit 0;
}

# --import: upgrade the database format and exit
if ( defined $opt{'import'} ) {
  my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
  $spamtest->finish_learner();
  exit( !$ret );
}

# --clear: wipe the database and exit
if (defined $opt{'clear'}) {
  unless ($spamtest->{bayes_scanner}->{store}->clear_database()) {
    $spamtest->finish_learner();
    die "ERROR: Bayes clear returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  exit 0;
}

# --backup: write a backup of the database and exit
if (defined $opt{'backup'}) {
  unless ($spamtest->{bayes_scanner}->{store}->backup_database()) {
    $spamtest->finish_learner();
    die "ERROR: Bayes backup returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  exit 0;
}

# --restore: load the database from a backup file and exit
if (defined $opt{'restore'}) {

  my $filename = $opt{'restore'};

  unless ($filename) {
    $spamtest->finish_learner();
    die "ERROR: You must specify a filename to restore.\n";
  }

  unless ($spamtest->{bayes_scanner}->{store}->restore_database($filename, $opt{'showdots'})) {
    $spamtest->finish_learner();
    die "ERROR: Bayes restore returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  exit 0;
}

if ( !$spamtest->{conf}->{use_bayes} ) {
  warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
  exit 1;
}

$spamtest->init_learner(
  {
    force_expire      => $opt{'force-expire'},
    learn_to_journal  => $opt{'nosync'},
    wait_for_lock     => 1,
    caller_will_untie => 1
  }
);

$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};

if ($synconly) {
  $spamtest->rebuild_learner_caches(
    {
      verbose  => 1,
      showdots => $opt{'showdots'}
    }
  );
  $spamtest->finish_learner();
  exit 0;
}

$messagelimit = $opt{'stopafter'};
$learnprob    = $opt{'learnprob'};

if ( defined $opt{'randseed'} ) {
  srand( $opt{'randseed'} );
}

# sync the journal first if we're going to go r/w so we make sure to
# learn everything before doing anything else.
#
if ( !$opt{nosync} ) {
  $spamtest->rebuild_learner_caches();
}

# what is the result of the run?  will end up being the exit code.
my $exit_status = 0;

# run this lot in an eval block, so we can catch die's and clear
# up the dbs.
eval {
  $SIG{INT}  = \&killed;
  $SIG{TERM} = \&killed;

  if ( $opt{folders} ) {
    # Three-argument open with a lexical handle: the file name comes from
    # the command line and must never be interpreted as an open mode.
    open( my $folders_fh, '<', $opt{folders} )
      or die "sa-learn: cannot open folders file '$opt{folders}': $!\n";
    while ( my $line = <$folders_fh> ) {
      chomp $line;
      next unless ($line);
      if ( $line =~ /^(?:ham|spam):\w*:/ ) {
        # already in "class:format:location" form, take it as it is
        push ( @targets, $line );
      }
      else {
        target($line);
      }
    }
    close($folders_fh);
  }

  ###########################################################################
  # Deal with the target listing, and STDIN -> tempfile

  my $tempfile; # will be defined if stdin -> tempfile
  push(@targets, @ARGV);
  @targets = ('-') unless @targets;

  for(my $elem = 0; $elem <= $#targets; $elem++) {
    # ArchiveIterator doesn't really like STDIN, so if "-" is specified
    # as a target, make it a temp file instead.
    if ( $targets[$elem] =~ /(?:^|:)-$/ ) {
      if (defined $tempfile) {
        # uh-oh, stdin specified multiple times?
        warn "skipping extra stdin target (".$targets[$elem].")\n";
        splice @targets, $elem, 1;
        $elem--; # go back to this element again
        next;
      }
      else {
        my $handle;

        local $/ = undef;    # go into slurp mode
        ( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
        print {$handle} <STDIN>;
        # buffered write errors only show up at close time
        close $handle
          or die "sa-learn: error closing temporary file $tempfile: $!\n";

        # re-aim the targets at the tempfile instead of STDIN
        $targets[$elem] =~ s/-$/$tempfile/;
      }
    }

    # make sure the target list is in the normal AI format
    if ($targets[$elem] !~ /^[^:]*:[a-z]+:/) {
      my $item = splice @targets, $elem, 1;
      target($item); # add back to the list
      $elem--; # go back to this element again
      next;
    }
  }

  ###########################################################################

  my $iter = Mail::SpamAssassin::ArchiveIterator->new(
    {
      'opt_all'       => 0,       # skip messages over 250k
      'opt_want_date' => 0,
    }
  );

  $iter->set_functions(\&wanted, \&result);
  $messagecount = 0;
  $learnedcount = 0;

  $init_results = 0;
  $start_time = time;

  # if exit_status isn't already set to non-zero, set it to the reverse of the
  # run result (0 is bad, 1+ is good -- the opposite of exit status codes)
  eval { $exit_status ||= ! $iter->run(@targets); };

  # Remember the failure right away: the reporting and cleanup calls below
  # may themselves use eval and reset $@ before it is examined.
  my $run_error = $@;

  print STDERR "\n" if ($opt{showdots});
  $progress->final() if ($opt{progress} && $progress);

  my $phrase = defined $forget ? "Forgot" : "Learned";
  print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n";

  # If we needed to make a tempfile, go delete it.
  if ( defined $tempfile ) {
    unlink $tempfile;
  }

  # HITLIMIT is how wanted() stops the run once --stopafter is reached;
  # anything else is a real error and is passed on.
  if ($run_error) { die $run_error unless ( $run_error =~ /HITLIMIT/ ); }
};

if ($@) {
  my $failure = $@;
  $spamtest->finish_learner();
  die $failure;
}

$spamtest->finish_learner();
exit $exit_status;

###########################################################################

# Signal handler installed for INT and TERM: releases the learner and
# aborts the run with "interrupted".
sub killed {
  $spamtest->finish_learner();
  die "interrupted";
}

# Appends one location to @targets in "class:format:location" form.
# The class follows --spam/--ham (ham when neither was given) and the
# format follows --mbox/--mbx ("detect" when neither was given).
sub target {
  my ($target) = @_;

  my $class = ( $isspam ? "spam" : "ham" );
  my $format = ( defined( $opt{'format'} ) ? $opt{'format'} : "detect" );

  push ( @targets, "$class:$format:$target" );
}

###########################################################################

# One-time setup performed on the first result() call: marks the results
# as initialised and, with --progress, creates the progress bar sized from
# the iterator's $MESSAGES package variable.
sub init_results {
  $init_results = 1;

  return unless $opt{'progress'};

  $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;

  $progress = Mail::SpamAssassin::Util::Progress->new({total => $total_messages,});
}

###########################################################################

# Result callback registered with the ArchiveIterator via set_functions().
# Its arguments are not used; it only keeps the progress bar current.
sub result {
  my ($class, $result, $time) = @_;

  # don't open results files until we get here to avoid overwriting files
  init_results() if !$init_results;

  $progress->update($messagecount) if ($opt{progress} && $progress);
}

###########################################################################

# Per-message callback registered with the ArchiveIterator via
# set_functions().  $class is "s" for messages to be treated as spam and
# $dataref is handed to $spamtest->parse().  Learns (or, with --forget,
# forgets) the message, updates $messagecount/$learnedcount and returns 1.
# Dies with 'HITLIMIT' once more than --stopafter messages were learned,
# and with an ERROR message when learning is unavailable.
sub wanted {
  my ( $class, $id, $time, $dataref ) = @_;

  my $spam = $class eq "s" ? 1 : 0;

  # --learnprob: randomly skip messages so only a fraction is processed
  if ( defined($learnprob) ) {
    if ( int( rand( 1 / $learnprob ) ) != 0 ) {
      print STDERR '_' if ( $opt{showdots} );
      return 1;
    }
  }

  if ( defined($messagelimit) && $learnedcount > $messagelimit ) {
    $progress->final() if ($opt{progress} && $progress);
    die 'HITLIMIT';
  }

  $messagecount++;
  my $ma = $spamtest->parse($dataref);

  # a message that already carries SpamAssassin's header is re-parsed
  # with the SpamAssassin markup removed
  if ( $ma->get_header("X-Spam-Checker-Version") ) {
    my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
    $ma->finish();
    $ma = $new_ma;
  }

  my $status = $spamtest->learn( $ma, undef, $spam, $forget );
  my $learned = $status->did_learn();

  if ( !defined $learned ) {    # undef=learning unavailable
    die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
  }
  elsif ( $learned == 1 ) {     # 1=message was learned.  0=message wasn't learned
    $learnedcount++;
  }

  # Do cleanup ...
  $status->finish();
  undef $status;

  $ma->finish();
  undef $ma;

  print STDERR '.' if ( $opt{showdots} );
  return 1;
}

###########################################################################

# Prints the version banner and the POD usage text with the given
# verbosity and message; pod2usage is asked to exit with status 64.
sub usage {
  my ( $verbose, $message ) = @_;
  my $ver = Mail::SpamAssassin::Version();
  print "SpamAssassin version $ver\n";
  pod2usage( -verbose => $verbose, -message => $message, -exitval => 64 );
}

# ---------------------------------------------------------------------------

=head1 NAME

sa-learn - train SpamAssassin's Bayesian classifier

=head1 SYNOPSIS

B<sa-learn> [options] [file]...

B<sa-learn> [options] --dump [ all | data | magic ]

Options:

 --ham                 Learn messages as ham (non-spam)
 --spam                Learn messages as spam
 --forget              Forget a message
 --use-ignores         Use bayes_ignore_from and bayes_ignore_to
 --sync                Synchronize the database and the journal if needed
 --force-expire        Force a database sync and expiry run
 --dbpath <path>       Allows commandline override (in bayes_path form)
                       for where to read the Bayes DB from
 --dump [all|data|magic]  Display the contents of the Bayes database
                       Takes optional argument for what to display
  --regexp <re>        For dump only, specifies which tokens to
                       dump based on a regular expression.
 -f file, --folders=file  Read list of files/directories from file
 --dir                 Ignored; historical compatibility
 --file                Ignored; historical compatibility
 --mbox                Input sources are in mbox format
 --mbx                 Input sources are in mbx format
 --showdots            Show progress using dots
 --progress            Show progress using progress bar
 --no-sync             Skip synchronizing the database and journal
                       after learning
 -L, --local           Operate locally, no network accesses
 --import              Migrate data from older version/non DB_File
                       based databases
 --clear               Wipe out existing database
 --backup              Backup, to STDOUT, existing database
 --restore <filename>  Restore a database from filename
 -u username, --username=username
                       Override username taken from the runtime
                       environment, used with SQL
 -C path, --configpath=path, --config-file=path
                       Path to standard configuration dir
 -p prefs, --prefspath=file, --prefs-file=file
                       Set user preferences file
 --siteconfigpath=path Path for site configs
                       (default: /etc/mail/spamassassin)
 --cf='config line'    Additional line of configuration
 -D, --debug [area=n,...]  Print debugging messages
 -V, --version         Print version
 -h, --help            Print usage message

=head1 DESCRIPTION

Given a typical selection of your incoming mail classified as spam or ham
(non-spam), this tool will feed each mail to SpamAssassin, allowing it
to 'learn' what signs are likely to mean spam, and which are likely to
mean ham.

Simply run this command once for each of your mail folders, and it will
''learn'' from the mail therein.

Note that csh-style I<globbing> in the mail folder names is supported;
in other words, listing a folder name as C<*> will scan every folder
that matches.  See C<Mail::SpamAssassin::ArchiveIterator> for more details.

SpamAssassin remembers which mail messages it has learnt already, and will not
re-learn those messages again, unless you use the B<--forget> option. Messages
learnt as spam will have SpamAssassin markup removed, on the fly.

If you make a mistake and scan a mail as ham when it is spam, or vice
versa, simply rerun this command with the correct classification, and the
mistake will be corrected.  SpamAssassin will automatically 'forget' the
previous indications.

Users of C<spamd> who wish to perform training remotely, over a network,
should investigate the C<spamc -L> switch.

=head1 OPTIONS

=over 4

=item B<--ham>

Learn the input message(s) as ham.  If you have previously learnt any of the
messages as spam, SpamAssassin will forget them first, then re-learn them as
ham.  Alternatively, if you have previously learnt them as ham, it'll skip them
this time around.  If the messages have already been filtered through
SpamAssassin, the learner will ignore any modifications SpamAssassin may have
made.

=item B<--spam>

Learn the input message(s) as spam.  If you have previously learnt any of the
messages as ham, SpamAssassin will forget them first, then re-learn them as
spam.  Alternatively, if you have previously learnt them as spam, it'll skip
them this time around.  If the messages have already been filtered through
SpamAssassin, the learner will ignore any modifications SpamAssassin may have
made.

=item B<--folders>=I<filename>, B<-f> I<filename>

sa-learn will read in the list of folders from the specified file, one folder
per line in the file.  If the folder is prefixed with C<ham:type:> or
C<spam:type:>, sa-learn will learn that folder appropriately, otherwise the
folders will be assumed to be of the type specified by B<--ham> or B<--spam>.

C<type> above is optional, but is the same as the standard for
ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not
specified).

=item B<--mbox>

sa-learn will read in the file(s) containing the emails to be learned,
and will process them in mbox format (one or more emails per file).

=item B<--mbx>

sa-learn will read in the file(s) containing the emails to be learned,
and will process them in mbx format (one or more emails per file).

=item B<--use-ignores>

Don't learn the message if a from address matches configuration file
item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>.
The option might be used when learning from a large file of messages
from which the hammy spam messages or spammy ham messages have not
been removed.

=item B<--sync>

Synchronize the journal and databases.  Upon successfully syncing the
database with the entries in the journal, the journal file is removed.

=item B<--force-expire>

Forces an expiry attempt, regardless of whether it may be necessary
or not.  Note: This doesn't mean any tokens will actually expire.
Please see the EXPIRATION section below.

Note: C<--force-expire> also causes the journal data to be synchronized
into the Bayes databases.

=item B<--forget>

Forget a given message previously learnt.

=item B<--dbpath>

Allows a commandline override of the I<bayes_path> configuration option.

=item B<--dump> I<option>