#!/usr/bin/perl -w # options # # -c=file set configuration/rules directory # -f=file read list of targets from # -j=jobs specify the number of check jobs to run simultaneously # --mh find_missed('ham.log') # --ms find_missed('spam.log') # --net turn on network checks! # --mid report Message-ID from each message # --debug report debugging information # --progress show progress updates during check # --rewrite=OUT save rewritten message to OUT (default is /tmp/out) # --showdots print a dot for each scanned message # --rules=RE Only test rules matching the given regexp RE # --restart=N restart all of the children after processing N messages # # log options # -o write all logs to stdout # --loghits log the text hit for patterns (useful for debugging) # --hamlog=log use as ham log ('ham.log' is default) # --spamlog=log use as spam log ('spam.log' is default) # # message selection options # -n no date sorting or spam/ham interleaving # --all don't skip big messages # --head=N only check first N ham and N spam (N messages if -n used) # --tail=N only check last N ham and N spam (N messages if -n used) # # simple target options (implies -o and no ham/spam classification) # --dir subsequent targets are directories # --file subsequent targets are files in RFC 822 format # --mbox subsequent targets are mbox files # # # Just left over functions we should remove at some point: # --bayes report score from Bayesian classifier # # non-option arguments are used as target names (mail files and folders), # the target format is: :: # is "spam" or "ham" # is "dir", "file", or "mbox" # is a file or directory name. globbing of ~ and * is supported ########################################################################### use vars qw($opt_c $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart $opt_rewrite); use FindBin; use lib "$FindBin::Bin/../lib"; eval "use bytes"; use Mail::SpamAssassin::ArchiveIterator; use Mail::SpamAssassin; use Mail::SpamAssassin::NoMailAudit; use Getopt::Long; use POSIX qw(strftime); # default settings $opt_c = "$FindBin::Bin/../rules"; $opt_j = 1; $opt_net = 0; $opt_hamlog = "ham.log"; $opt_spamlog = "spam.log"; GetOptions("c=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug", "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net", "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i", "rules=s", "restart=i", "dir" => sub { $opt_format = "dir"; }, "file" => sub { $opt_format = "file"; }, "mbox" => sub { $opt_format = "mbox"; }, '<>' => \&target); if ($opt_f) { open (F, $opt_f) || die $!; push (@targets, map { chomp; $_ } ); close (F); } if ($opt_ms) { find_missed($opt_spamlog); } elsif ($opt_mh) { find_missed($opt_hamlog); } $spamtest = new Mail::SpamAssassin ({ 'debug' => $opt_debug, 'rules_filename' => $opt_c, 'userprefs_filename' => "$FindBin::Bin/spamassassin/user_prefs", 'userstate_dir' => "$FindBin::Bin/spamassassin", 'save_pattern_hits' => $opt_loghits, 'dont_copy_prefs' => 1, 'local_tests_only' => $opt_net ? 0 : 1, 'only_these_rules' => $opt_rules, 'ignore_safety_expire_timeout' => 1, PREFIX => '', DEF_RULES_DIR => $opt_c, LOCAL_RULES_DIR => '', }); $spamtest->compile_now(1); $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); my $who = `id -un 2>/dev/null`; chomp $who; my $where = `uname -n 2>/dev/null`; chomp $where; my $when = `date -u`; chomp $when; my $revision = "unknown"; if (open(TESTING, "$opt_c/70_cvs_rules_under_test.cf")) { chomp($revision = ); $revision =~ s/.*Revision:\s*(\S+).*/$1/; close(TESTING); } my $log_header = "# mass-check results from $who\@$where, on $when\n" . "# M:SA version ".$spamtest->Version()."\n" . '# CVS tag: $Name: $' . "\n" . "# CVS revision: $revision\n"; my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost'; chomp $host; my $updates = 10; my $total_count = 0; my $spam_count = 0; my $ham_count = 0; if ($opt_o) { autoflush STDOUT 1; print STDOUT $log_header; } else { open(HAM, "> $opt_hamlog"); open(SPAM, "> $opt_spamlog"); autoflush HAM 1; autoflush SPAM 1; print HAM $log_header; print SPAM $log_header; } my $iter = new Mail::SpamAssassin::ArchiveIterator({ 'opt_j' => $opt_j, 'opt_n' => $opt_n, 'opt_all' => $opt_all, 'opt_head' => $opt_head, 'opt_tail' => $opt_tail, 'opt_restart' => $opt_restart, }); if ($opt_progress) { my $now = strftime("%Y-%m-%d %X", localtime(time)); printf STDERR "status: pre-scanning and sorting. now: %s\n", $now; } $iter->set_functions(\&wanted, \&result); $iter->run(@targets); print STDERR "\n" if ($opt_showdots); exit; ########################################################################### sub target { my ($target) = @_; if (!defined($opt_format)) { push (@targets, $target); } else { $opt_o = 1; push (@targets, "spam:$opt_format:$target"); } } ########################################################################### sub result { my ($class, $result, $time) = @_; if ($class eq "s") { if ($opt_o) { print STDOUT $result; } else { print SPAM $result; } $spam_count++; } elsif ($class eq "h") { if ($opt_o) { print STDOUT $result; } else { print HAM $result; } $ham_count++; } $total_count++; #warn ">> result: $total_count $class $time\n"; if ($opt_progress) { progress($time); } } sub wanted { my ($id, $time, $dataref) = @_; my $out; my $ma = Mail::SpamAssassin::NoMailAudit->new('data' => $dataref); $ma->{noexit} = 1; # remove SpamAssassin markup, if present and the mail was spam $_ = $ma->get ("X-Spam-Status"); if (defined($_) && /^Yes, hits=/) { my $newtext = $spamtest->remove_spamassassin_markup($ma); my @newtext = split (/^/m, $newtext); $dataref = \@newtext; $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref); } my $status = $spamtest->check($ma); my @extra; if (defined($time)) { push(@extra, "time=$time"); } if (defined $status->{bayes_score}) { push(@extra, "bayes=" . $status->{bayes_score}); } if ($opt_mid) { my $mid = $ma->get_header("Message-Id"); if ($mid) { # message contains a Message-Id: while($mid =~ s/\([^\(\)]*\)//s) {}; # remove comments and $mid =~ s/^\s+|\s+$//sg; # leading and trailing spaces $mid =~ s/\s.*$//s; # keep only the first token } else { # it doesn't have a Message-Id: $mid = $id; # so build one from the id $mid =~ s,^.*/,,; # remove the path $mid = "<$mid\@$host.masses.spamassassin.org>"; # and put it together } $mid =~ tr/-A-Za-z0-9_!#%&=~<@>/./c; # replace dangerous chars with . (so regexp search just works) push(@extra, "mid=$mid"); } my $yorn = $status->is_spam() ? 'Y' : '.'; my $hits = $status->get_hits(); my $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit()))); my $extra = join(",", @extra); if (defined $opt_rewrite) { $status->rewrite_mail(); open(REWRITE, "> " . ($opt_rewrite ? $opt_rewrite : "/tmp/out")); print REWRITE $status->get_full_message_as_text(); close(REWRITE); } $id =~ s/\s/_/g; $out .= sprintf("%s %2d %s %s %s\n", $yorn, $hits, $id, $tests, $extra); if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) { $out .= logkilled($ma, $id, "possible virus"); } if ($opt_loghits) { my $log = ''; foreach my $t (sort keys %{$status->{pattern_hits}}) { $_ = $status->{pattern_hits}->{$t}; $_ ||= ''; s/\r/\\r/gs; # fix unprintables s/\n/\\n/gs; $log .= "$t=\"$_\" "; } if ($log) { chomp $log; $out .= "# $log\n"; } } $status->finish(); undef $ma; # clean 'em up undef $status; if ($opt_showdots) { print STDERR '.'; } return $out; } sub logkilled { my ($ma, $id, $reason) = @_; my $from = $ma->get_header("From") || 'undef'; my $to = $ma->get_header("To") || 'undef'; my $subj = $ma->get_header("Subject") || 'undef'; my $mid = $ma->get_header("Message-Id") || 'undef'; chomp ($from); chomp ($to); chomp ($subj); chomp ($mid); return "# skipped killfiled message ($reason): from=$from to=$to subj=$subj mid=$mid id=$id\n"; } sub progress { my ($time) = @_; $time ||= 0; my $messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES; my $statusevery = int($messages / $updates); $statusevery ||= 1; # if $messages < $updates, just give a status line per msg. # Are we at the end or otherwise at a point we should print status? Then do it. if ($messages == $total_count || $total_count % $statusevery == 0) { my $time = strftime("%Y-%m-%d", localtime($time)); my $now = strftime("%Y-%m-%d %X", localtime(time)); printf STDERR "status: %3.0f%% ham: %-6d spam: %-6d date: %s now: %s\n", ($total_count / $messages) * 100, $ham_count, $spam_count, $time, $now; } } ########################################################################### sub find_missed { my $file = shift; my $threshold = 5; my $shouldbespam = 1; if ($file =~ /ham/) { $shouldbespam = 0; } my $scores = readscores(); open (IN, "<$file"); while () { next if /^#/; /^.\s+\d+\s+(\S+)\s*/ or next; my $id = $1; my $score = 0.0; $_ = $'; s/bayes=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//; foreach my $tst (split (/,/, $_)) { next if ($tst eq ''); if (!defined $scores->{$tst}) { warn "unknown test in $file, ignored: $tst\n"; next; } $score += $scores->{$tst}; } if ($shouldbespam && $score < $threshold) { found_missed($score, $id, $_); } elsif (!$shouldbespam && $score > $threshold) { found_missed($score, $id, $_); } } close IN; } sub readscores { warn "Reading scores from \"$opt_c\"...\n"; system("./parse-rules-for-masses -d \"$opt_c\"") and die; use vars qw(%scores); require "./tmp/rules.pl"; return \%scores; } sub found_missed { my $score = shift; my $id = shift; my $tests = shift; print "$score $id $tests\n"; }