#!/usr/bin/perl my $THRESHOLD = 50; my $TEST = 0; if (defined $ARGV[0] && $ARGV[0] eq '-t') { $TEST = 1; shift @ARGV; } my $stoplist = { }; while () { chomp; s/#.*$//; next unless (/\S/); /(.*) (.*)/; my $word1 = $1; my $word2 = $2; if(!defined($stoplist->{$word1})) { $stoplist->{$word1} = { }; } $stoplist->{$1}->{$2} = 1; } my $freqs = { }; my $highest_score; if ($TEST) { read_spamword_freqs(); } my $score; my $hitwords; mass_check_all_folders (\&wanted, @ARGV); if (!$TEST) { summarise(); } exit; sub wanted { my ($id, $dataref) = @_; local ($_); $score = 0; $hitwords = { }; my @lines = @{$dataref}; # copy it $text = ''; while ($_ = shift @lines) { /^\s*$/ and last; } while ($_ = shift @lines) { /^SPAM: / and next; # SpamAssassin markup /Content-.*: / and next; /^--/ and next; s/=$//gis; $text .= $_; } # sort out markup $text =~ s/=20/ /gis; $text =~ s/=3E/>/gis; # spam trick, disguise HTML $text =~ s/=[0-9a-f][0-9a-f]//gis; $text =~ s/\&[-_a-zA-Z0-9]+;/ /gs; $text =~ s/<[a-z0-9]+\b[^>]*>//gis; $text =~ s/<\/[a-z0-9]+>//gis; $text =~ s/[^A-Za-z]/ /gs; $text =~ s/\s+/ /gs; # kill ignored stopwords -- too small for us to match $text =~ s/ (?:to|of|in|a|an|and|the|on|if|or) / /gs; # 2 words of 3 to 20 letters if ($TEST) { my $lastword; while ($text =~ /([a-z]{3,20})\b/ig) { if (defined $lastword) { test_word_pair ($lastword, $1); } $lastword = $1; } while ($text =~ /!/g) { $score += 1; # add for each excl mark } my $hit = ''; foreach my $k (sort keys %{$hitwords}) { next unless ($hitwords->{$k} > ($highest_score / 10)); $hit .= ' '.$k; } $hit =~ s/^ //; if ($score > $THRESHOLD) { print "Y $score $id ($hit)\n"; } else { print ". $score $id ($hit)\n"; } } else { my $lastword; while ($text =~ /\b([a-z]{3,20})\b/ig) { if (defined $lastword) { add_word_pair ($lastword, $1); } $lastword = $1; } } } sub add_word_pair { my ($word1, $word2) = @_; $word1 = lc $word1; $word2 = lc $word2; next if (defined ($stoplist->{$word1}) and defined($stoplist->{$word1}->{$word2})); if(!defined($freqs->{$word1})) { $freqs->{$word1} = { }; } if(!defined($freqs->{$word1}->{$word2})) { $freqs->{$word1}->{$word2} = 1; } else { $freqs->{$word1}->{$word2}++; } } sub test_word_pair { my ($word1, $word2) = @_; my $w = lc $word1." ".$word2; my $freq = $freqs->{$w}; return if (!defined $freq); $score += (($freq*10) / $highest_score); $hitwords->{$w} = $freq; } sub summarise { foreach my $w1 (keys %{$freqs}) { foreach my $w2 (keys %{$freqs->{$w1}}) { next if ($freqs->{$w1}->{$w2} < 2); printf ("%d $w1 $w2\n", $freqs->{$w1}->{$w2}); } } } sub read_spamword_freqs { open (IN, ") { s/\s+/ /g; s/#.*$//; s/^ +//; if (/spamphrase (\d+) (\S+ \S+)/) { my $w = $2; my $c = $1; $freqs->{$w} = $c; if ($c > $highest) { $highest = $c; } } } close IN; $highest_score = $highest; } ########################################################################### sub mass_check_all_folders { my $sub = shift; foreach my $folder (@_) { if ($folder =~ /.tar$/) { # it's an MH or Cyrus folder in a tar file use Archive::Tar; mass_check_mh_tar_file($sub, $folder); } elsif (-d $folder && (-f "$folder/1" || -f "$folder/1.gz" || -f "$folder/cyrus.index")) { # it's an MH folder or a Cyrus mailbox mass_check_mh_folder($sub, $folder); } elsif (-f $folder) { mass_check_mailbox($sub, $folder); } } } sub sortbynum { $a =~ m,\/(\d+).*$,; my $anum = $1; $b =~ m,\/(\d+).*$,; my $bnum = $1; ($anum <=> $bnum); } sub mass_check_mh_tar_file { my $sub = shift; my $filename = shift; my $tar = Archive::Tar->new(); $tar->read($filename); my @files = $tar->list_files(['name']); foreach my $mail (@files) { next if $mail =~ m#/$# or $mail =~ /cyrus\.(index|header|cache)/; my $msg_data = $tar->get_content($mail); my @msg = split("\r\n",$tar->get_content($mail)); $mail =~ s/\s/_/g; &$sub ($mail, \@msg); } } sub mass_check_mh_folder { my $sub = shift; my $folder = shift; my @files = <$folder/[0-9]*>; foreach my $mail (@files) { # jm: commented size checks here; I can't see how they could be working, # as "250_000" is not an int. New size check implemented below if ($mail =~ /\.gz$/) { # next if `gunzip -c $mail|wc -c` > 250_000; #skip messages bigger than 250k open (STDIN, "gunzip -cd $mail |") or warn "gunzip $mail failed: $@"; } elsif ($mail =~ /\.bz2$/) { # next if `bzip2 -dc $mail|wc -c` > 250_000; #skip messages bigger than 250k open (STDIN, "bzip2 -cd $mail |") or warn "bunzip2 $mail failed: $@"; } else { # next if `wc -c $mail` > 250_000; #skip messages bigger than 250k open (STDIN, "<$mail") or warn "open $mail failed: $@"; } # skip too-big mails if (-s STDIN > 250*1024) { close STDIN; next; } my @msg = (); close STDIN; &$sub ($mail, \@msg); } } sub mass_check_mailbox { my $sub = shift; my $folder = shift; if ($folder =~ /\.gz$/) { open (MBOX, "gunzip -cd $folder |") or warn "gunzip $folder failed: $@"; } elsif ($folder =~ /\.bz2$/) { open (MBOX, "bzip2 -cd $folder |") or warn "bunzip2 $folder failed: $@"; } else { open (MBOX, "<$folder") or warn "open $folder failed: $@"; } while () { /^From \S+ +... ... / and last; } while (!eof MBOX) { my @msg = (); my $msgid = undef; $count++; while () { /^Message-[Ii][Dd]: (.*)\s*$/ and $msgid = $1; if (/^$/) { if (!defined ($msgid)) { $msgid = "<$count\@no_msgid_in_msg.taint.org>"; push (@msg, "Message-Id: $msgid\n"); } } /^From \S+ +... ... / and last; push (@msg, $_); } if (scalar @msg > 1000) { next; } # too big # switch to a fork-based model to save RAM if ($FORK && fork()) { wait; next; } $msgid = "$folder:$msgid"; # so we can find it again $msgid =~ s/\s/_/gs; # make safe &$sub ($msgid, \@msg); } close MBOX; } ############################################################################ __DATA__ # List of ignored words based on the traditional stop-list for IR. # This one pasted from http://dvl.dtic.mil/stop_list.html , with additional # HTML tags added by jm. about above according across after afterwards against albeit all almost alone along already also although always among amongst an and another any anybody anyhow anyone anything anyway anywhere apart are around as at author av available be became because become becomes becoming been before beforehand behind being below beside besides between beyond both but by can cannot canst certain cf cfrd choose conducted considered contrariwise cos could crd cu day described describes designed determine determined different discussed do does doesn't doing dost doth double down dual due during each either else elsewhere enough et etc even ever every everybody everyone everything everywhere except excepted excepting exception exclude excluding exclusive far farther farthest few ff first for formerly forth forward found from front further furthermore furthest general given get go had halves hardly has hast hath have he hence henceforth her here hereabouts hereafter hereby herein hereto hereupon hers herself him himself hindmost his hither hitherto how however howsoever I ie if in inasmuch inc include included including indeed indoors inside insomuch instead into investigated inward inwards is it its itself just kind kg km last latter latterly less lest let like little ltd made many may maybe me meantime meanwhile might more moreover most mostly more mr mrs ms much must my myself namely need neither never nevertheless next no nobody none nonetheless noone nope nor not nothing notwithstanding now nowadays nowhere obtained of off often ok on once one only onto or other others otherwise ought our ours ourselves out outside over own per performance performed perhaps plenty possible present presented presents provide provided provides quite rather really related report required results round said sake same sang save saw see seeing seem seemed seeming seems seen seldom selected selves sent several sfrd shalt she should shown sideways significant since slept slew slung slunk smote so some somebody somehow someone something sometime sometimes somewhat somewhere spake spat spoke spoken sprang sprung srd stave staves still studies such supposing tested than that the thee their them themselves then thence thenceforth there thereabout thereabouts thereafter thereby therefore therein thereof thereon thereto thereupon these they this those thou though thrice through throughout thru thus thy thyself till to together too toward types towards unable under underneath unless unlike until up upon upward upwards us use used using various very via vs want was we week well were what whatever whatsoever when whence whenever whensoever where whereabouts whereafter whereas whereat whereby wherefore wherefrom wherein whereinto whereof whereon wheresoever whereto whereunto whereupon wherever wherewith whether whew which whichever whichsoever while whilst whither who whoa whoever whole whom whomever whomsoever whose whosoever why will wilt with within without worse worst would wow ye yet year yippee you your yours yourself yourselves # HTML tags ( Dec 26 2001 jm: ) # font spam color size points face width mail align http body arial center height table relay strong name href html body head nbsp # some more common ones we should ignore address alternative best bidi boundary card check click close code contact doctype dtd endif find form full future help home hours information internet life line list mails make message month months mso multipart new number order people phone please public receive received reply send service subject take text time today transitional work xmlns shipping ship business life # some double-word-phrase stoplist entries does not for you has been per week you are you can you have you may you will you would your computer for the and the all the http www you know red hat with the from the have the but will have been then the and then out the there are this one not sure could you looks like for example they are are not you think mailing list email address subject line