#!/usr/bin/perl use lib "../lib"; use Mail::SpamAssassin; use Mail::SpamAssassin::NoMailAudit; #use Devel::Peek; $cffile = '../rules'; if (defined $ARGV[0] && $ARGV[0] eq '-c') { shift; $cffile = $ARGV[0]; shift; } if (defined $ARGV[0] && $ARGV[0] eq '-ms') { shift; find_missed('spam.log'); } if (defined $ARGV[0] && $ARGV[0] eq '-mn') { shift; find_missed('nonspam.log'); } my $FORK = 0; if (defined $ARGV[0] && $ARGV[0] eq '-f') { $FORK = 1; } $spamtest = new Mail::SpamAssassin ({ 'rules_filename' => $cffile, 'userprefs_filename' => "../spamassassin.prefs", 'local_tests_only' => 1 }); $spamtest->compile_now(); my $count = 0; mass_check_all_folders (\&wanted, @ARGV); exit; sub wanted { my ($id, $dataref) = @_; # my $ma = Mail::Audit->new('data' => \@msg); my $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref); $ma->{noexit} = 1; my $status = $spamtest->check ($ma); $status->rewrite_mail (); $_ = $ma->get ("X-Spam-Status"); /^(\S+), hits=(\S+) required=\S+ tests=(.+)\s*$/s; my $yorn = $status->is_spam(); my $hits = $status->get_hits(); my $tests = $status->get_names_of_tests_hit(); printf "%s %2d %s %s\n", ($yorn ? 'Y' : '.'), $hits, $id, $tests; $status->finish(); undef $ma; # clean 'em up undef $status; # system ("ps alxww | grep mass-check | grep perl | grep -v grep");#JMD if ($FORK) { exit; } } ########################################################################### sub mass_check_all_folders { my $sub = shift; foreach my $folder (@_) { if ($folder =~ /.tar$/) { # it's an MH or Cyrus folder in a tar file use Archive::Tar; mass_check_mh_tar_file($sub, $folder); } elsif (-d $folder && (-f "$folder/1" || -f "$folder/1.gz" || -f "$folder/cyrus.index")) { # it's an MH folder or a Cyrus mailbox mass_check_mh_folder($sub, $folder); } elsif (-f $folder) { mass_check_mailbox($sub, $folder); } } } sub mass_check_mh_tar_file { my $sub = shift; my $filename = shift; my $tar = Archive::Tar->new(); $tar->read($filename); my @files = $tar->list_files(['name']); foreach my $mail (@files) { next if $mail =~ m#/$# or $mail =~ /cyrus\.(index|header|cache)/; my $msg_data = $tar->get_content($mail); my @msg = split("\r\n",$tar->get_content($mail)); $mail =~ s/\s/_/g; &$sub ($mail, \@msg); } } sub mass_check_mh_folder { my $sub = shift; my $folder = shift; my @files = <$folder/[0-9]*>; foreach my $mail (@files) { # jm: commented size checks here; I can't see how they could be working, # as "250_000" is not an int. New size check implemented below if ($mail =~ /\.gz$/) { # next if `gunzip -c $mail|wc -c` > 250_000; #skip messages bigger than 250k open (STDIN, "gunzip -cd $mail |") or warn "gunzip $mail failed: $@"; } elsif ($mail =~ /\.bz2$/) { # next if `bzip2 -dc $mail|wc -c` > 250_000; #skip messages bigger than 250k open (STDIN, "bzip2 -cd $mail |") or warn "bunzip2 $mail failed: $@"; } else { # next if `wc -c $mail` > 250_000; #skip messages bigger than 250k open (STDIN, "<$mail") or warn "open $mail failed: $@"; } # skip too-big mails if (-s STDIN > 250*1024) { close STDIN; next; } my @msg = (); close STDIN; &$sub ($mail, \@msg); } } sub mass_check_mailbox { my $sub = shift; my $folder = shift; if ($folder =~ /\.gz$/) { open (MBOX, "gunzip -cd $folder |") or warn "gunzip $folder failed: $@"; } elsif ($folder =~ /\.bz2$/) { open (MBOX, "bzip2 -cd $folder |") or warn "bunzip2 $folder failed: $@"; } else { open (MBOX, "<$folder") or warn "open $folder failed: $@"; } while () { /^From \S+ +... ... / and last; } while (!eof MBOX) { my @msg = (); my $msgid = undef; my $hits = ''; $count++; while () { /^Message-[Ii][Dd]: (.*)\s*$/ and $msgid = $1; /^X-Spam-Status: .* tests=(.*)$/ and $hits = $1; if (/^$/) { if (!defined ($msgid)) { $msgid = "<$count\@no_msgid_in_msg.taint.org>"; push (@msg, "Message-Id: $msgid\n"); } } /^From \S+ +... ... / and last; push (@msg, $_); } if (scalar @msg > 1000) { next; } # too big # switch to a fork-based model to save RAM if ($FORK && fork()) { wait; next; } $msgid = "$folder:$msgid"; # so we can find it again $msgid =~ s/\s/_/gs; # make safe &$sub ($msgid, \@msg); } close MBOX; } sub sortbynum { $a =~ m,\/(\d+).*$,; my $anum = $1; $b =~ m,\/(\d+).*$,; my $bnum = $1; ($anum <=> $bnum); } ############################################################################ sub find_missed { my $file = shift; my $threshold = 5; my $shouldbespam = 1; if ($file =~ /nonspam/) { $shouldbespam = 0; } my $scores = readscores(); open (IN, "<$file"); while () { /^.\s+\d+\s+(\S+)\s*/ or next; my $id = $1; my $score = 0.0; $_ = $'; s/,,+/,/g; s/^\s+//; s/\s+$//; foreach my $tst (split (/,/, $_)) { next if ($tst eq ''); if (!defined $scores->{$tst}) { warn "unknown test in $file, ignored: $tst\n"; next; } $score += $scores->{$tst}; } if ($shouldbespam && $score < $threshold) { found_missed ($score, $id, $_); } elsif (!$shouldbespam && $score > $threshold) { found_missed ($score, $id, $_); } } close IN; } sub readscores { my $scores = { }; print "Reading scores from \"$cffile\"...\n"; if (-d $cffile) { open (IN, "<$cffile/*scores*.cf") or warn "cannot read $cffile\n"; } else { open (IN, "<$cffile") or warn "cannot read $cffile\n"; } while () { s/#.*$//g; s/^\s+//; s/\s+$//; if (/^(header|body|full)\s+(\S+)\s+/) { $scores->{$2} ||= 1; } elsif (/^score\s+(\S+)\s+(.+)$/) { $scores->{$1} = $2; } } close IN; $scores; } sub found_missed { my $score = shift; my $id = shift; my $tests = shift; print "$score $id $tests\n"; }