#!/usr/bin/perl -w # options # # -c=file set $cffile # -f use fork-based model to save RAM (used for mailbox files) # --all don't skip big messages # --mh use mh-style folder (directory of messages named with numbers) # --mn find_missed('nonspam.log') # --ms find_missed('spam.log') # --single folder is a single mail message # # options for maildir and mh-style folders # --head=N only check first N messages # --sort sort contents of folders # --tail=N only check last N messages # # all following arguments are used as the name of mail folders use vars qw($opt_c $opt_f $opt_all $opt_head $opt_mh $opt_ms $opt_mn $opt_single $opt_sort $opt_tail); use FindBin; use lib "$FindBin::Bin/../lib"; use Mail::SpamAssassin; use Mail::SpamAssassin::NoMailAudit; use Getopt::Long; # flush buffer immediately so if mass-check fails or stops we don't get a corrupt line $|++; GetOptions("c=s", "f", "all", "head=i", "mh", "ms", "mn", "single", "sort", "tail=i"); #use Devel::Peek; if ($opt_c) { $cffile = $opt_c; } else { $cffile = "$FindBin::Bin/../rules"; } if ($opt_ms) { find_missed('spam.log'); } elsif ($opt_mn) { find_missed('nonspam.log'); } my $FORK = 0; if ($opt_f) { $FORK = 1; } $spamtest = new Mail::SpamAssassin ({ 'rules_filename' => $cffile, 'userprefs_filename' => "$FindBin::Bin/spamassassin.prefs", 'local_tests_only' => 1 }); $spamtest->compile_now(1); my $count = 0; mass_check_all_folders (\&wanted, @ARGV); exit; sub wanted { my ($id, $dataref) = @_; # my $ma = Mail::Audit->new('data' => \@msg); my $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref); $ma->{noexit} = 1; $spamtest->remove_spamassassin_markup($ma); my $status = $spamtest->check ($ma); $status->rewrite_mail (); $_ = $ma->get ("X-Spam-Status"); /^(\S+), hits=(\S+) required=\S+ tests=(.+)\s*$/s; my $yorn = $status->is_spam(); my $hits = $status->get_hits(); my $tests = $status->get_names_of_tests_hit(); printf "%s %2d %s %s\n", ($yorn ? 'Y' : '.'), $hits, $id, $tests; $status->finish(); undef $ma; # clean 'em up undef $status; # system ("ps alxww | grep mass-check | grep perl | grep -v grep");#JMD if ($FORK) { exit; } } ########################################################################### sub mass_check_all_folders { my $sub = shift; foreach my $folder (@_) { if ($folder =~ /\.tar$/) { # it's an MH or Cyrus folder or Maildir in a tar file use Archive::Tar; mass_check_tar_file($sub, $folder); } elsif (-d $folder && ($opt_mh || -f "$folder/1" || -f "$folder/1.gz" || -f "$folder/cyrus.index")) { # it's an MH folder or a Cyrus mailbox mass_check_mh_folder($sub, $folder); } elsif (-d $folder && -d "$folder/cur" && -d "$folder/new" ) { # Maildir! mass_check_maildir($sub, $folder); } elsif (-f $folder && $opt_single) { # single message (for testing that variables are cleared appropriately) mass_check_single($sub, $folder); } elsif (-f $folder) { mass_check_mailbox($sub, $folder); } } } sub mass_check_tar_file { my $sub = shift; my $filename = shift; my $tar = Archive::Tar->new(); $tar->read($filename); my @files = $tar->list_files(['name']); foreach my $mail (@files) { next if $mail =~ m#/$# or $mail =~ /cyrus\.(index|header|cache)/; my $msg_data = $tar->get_content($mail); my @msg = split("\n",$tar->get_content($mail)); $mail =~ s/\s/_/g; &$sub ($mail, \@msg); } } sub mass_check_mh_folder { my $sub = shift; my $folder = shift; opendir(DIR, $folder) || die "Can't open $folder dir: $!"; my @files = grep { -f } map { "$folder/$_" } grep { /^[0-9]/ } readdir(DIR); closedir(DIR); @files = sortbynum(@files) if $opt_sort; splice(@files, $opt_head) if $opt_head; splice(@files, 0, -$opt_tail) if $opt_tail; foreach my $mail (@files) { if ($mail =~ /\.gz$/) { open (STDIN, "gunzip -cd $mail |") or warn "gunzip $mail failed: $@"; } elsif ($mail =~ /\.bz2$/) { open (STDIN, "bzip2 -cd $mail |") or warn "bunzip2 $mail failed: $@"; } else { open (STDIN, "<$mail") or warn "open $mail failed: $@"; } # skip too-big mails if (! $opt_all && -s STDIN > 250*1024) { close STDIN; next; } my @msg = (); close STDIN; &$sub ($mail, \@msg); } } sub mass_check_maildir { my $sub = shift; my $folder = shift; opendir(CURDIR, "$folder/cur") || die "Can't open $folder/cur dir: $!"; opendir(NEWDIR, "$folder/new") || die "Can't open $folder/new dir: $!"; my @files; push @files, grep { -f } map { "$folder/cur/$_" } readdir(CURDIR); push @files, grep { -f } map { "$folder/new/$_" } readdir(NEWDIR); closedir(CURDIR); closedir(NEWDIR); @files = sortbynum(@files) if $opt_sort; splice(@files, $opt_head) if $opt_head; splice(@files, 0, -$opt_tail) if $opt_tail; foreach my $mail (@files) { if ($mail =~ /\.gz$/) { open (STDIN, "gunzip -cd $mail |") or warn "gunzip $mail failed: $@"; } elsif ($mail =~ /\.bz2$/) { open (STDIN, "bzip2 -cd $mail |") or warn "bunzip2 $mail failed: $@"; } else { open (STDIN, "<$mail") or warn "open $mail failed: $@"; } # skip too-big mails if (! $opt_all && -s STDIN > 250*1024) { close STDIN; next; } my @msg = (); close STDIN; &$sub ($mail, \@msg); } } sub mass_check_single { my $sub = shift; my $folder = shift; if ($folder =~ /\.gz$/) { open (STDIN, "gunzip -cd $folder |") or warn "gunzip $folder failed: $@"; } elsif ($folder =~ /\.bz2$/) { open (STDIN, "bzip2 -cd $folder |") or warn "bunzip2 $folder failed: $@"; } else { open (STDIN, "<$folder") or warn "open $folder failed: $@"; } # skip too-big mails if (! $opt_all && -s STDIN > 250*1024) { close STDIN; next; } my @msg = (); close STDIN; &$sub ($folder, \@msg); } sub mass_check_mailbox { my $sub = shift; my $folder = shift; if ($folder =~ /\.gz$/) { open (MBOX, "gunzip -cd $folder |") or warn "gunzip $folder failed: $@"; } elsif ($folder =~ /\.bz2$/) { open (MBOX, "bzip2 -cd $folder |") or warn "bunzip2 $folder failed: $@"; } else { open (MBOX, "<$folder") or warn "open $folder failed: $@"; } while () { /^From \S+ +... ... / and last; } while (!eof MBOX) { my @msg = (); my $msgid = undef; my $hits = ''; $count++; while () { /^Message-[Ii][Dd]: (.*)\s*$/ and $msgid = $1; /^X-Spam-Status: .* tests=(.*)$/ and $hits = $1; if (/^$/) { if (!defined ($msgid)) { $msgid = "<$count\@no_msgid_in_msg.taint.org>"; push (@msg, "Message-Id: $msgid\n"); } } /^From \S+ +... ... / and last; push (@msg, $_); } if (! $opt_all && scalar @msg > 1000) { next; } # too big # switch to a fork-based model to save RAM if ($FORK && fork()) { wait; next; } $msgid = "$folder:$msgid"; # so we can find it again $msgid =~ s/\s/_/gs; # make safe &$sub ($msgid, \@msg); } close MBOX; } sub sortbynum { return map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, /\/(\d+).*$/] } @_; } ############################################################################ sub find_missed { my $file = shift; my $threshold = 5; my $shouldbespam = 1; if ($file =~ /nonspam/) { $shouldbespam = 0; } my $scores = readscores(); open (IN, "<$file"); while () { /^.\s+\d+\s+(\S+)\s*/ or next; my $id = $1; my $score = 0.0; $_ = $'; s/,,+/,/g; s/^\s+//; s/\s+$//; foreach my $tst (split (/,/, $_)) { next if ($tst eq ''); if (!defined $scores->{$tst}) { warn "unknown test in $file, ignored: $tst\n"; next; } $score += $scores->{$tst}; } if ($shouldbespam && $score < $threshold) { found_missed ($score, $id, $_); } elsif (!$shouldbespam && $score > $threshold) { found_missed ($score, $id, $_); } } close IN; } sub readscores { my $scores = { }; print "Reading scores from \"$cffile\"...\n"; if (-d $cffile) { open (IN, "<$cffile/*scores*.cf") or warn "cannot read $cffile\n"; } else { open (IN, "<$cffile") or warn "cannot read $cffile\n"; } while () { s/#.*$//g; s/^\s+//; s/\s+$//; if (/^(header|body|full)\s+(\S+)\s+/) { $scores->{$2} ||= 1; } elsif (/^score\s+(\S+)\s+(.+)$/) { $scores->{$1} = $2; } } close IN; $scores; } sub found_missed { my $score = shift; my $id = shift; my $tests = shift; print "$score $id $tests\n"; }