#!/usr/bin/perl -w
#
# hit-frequencies - report how often each rule hit in a spam log and a
# nonspam log.
#
# The two logs are read line by line (see readlogs), per-rule hit counts
# are accumulated, rule flags and scores are loaded from ../rules/[0-9]*.cf
# (see readscores), and one report line per rule is printed, ordered by a
# ranking that favours rules with a high S/O ratio and a high hit rate.
#
# Options are described in usage() below.

use strict;
use Getopt::Std;

# Option variables are filled in by Getopt::Std in package main.
our ($opt_f, $opt_m, $opt_p, $opt_x);

# Unknown options used to be silently tolerated; show the usage instead.
getopts("fm:px") or usage();

# -p (percentages) implies -x (extended output).
if ($opt_p) {
  $opt_x = 1;
}

# State shared with readlogs() and readscores().
my %freq_spam = ();       # rule name => number of counted spam messages hit
my %freq_nonspam = ();    # rule name => number of counted nonspam messages hit
my $num_spam = 0;         # number of counted spam messages
my $num_nonspam = 0;      # number of counted nonspam messages
my %ranking = ();         # rule name => sort key for the report
my %soratio = ();         # rule name => S/O ratio computed from hit rates
my %tflags = ();          # rule name => tflags string from the rules files
my %score = ();           # rule name => score string from the rules files

readlogs();
readscores();

my $hdr_all = $num_spam + $num_nonspam;
my $hdr_spam = $num_spam;
my $hdr_nonspam = $num_nonspam;

if ($opt_p) {
  if ($opt_f) {
    printf "%7s %7s %7s %6s %6s %s\n",
        "OVERALL%", "FNEG%", "FPOS%", "S/O", "SCORE", "NAME";
  } else {
    printf "%7s %7s %7s %6s %6s %s\n",
        "OVERALL%", "SPAM%", "NONSPAM%", "S/O", "SCORE", "NAME";
  }

  printf "%7d %7d %7d %6.2f %6.2f (all messages)\n",
      $hdr_all, $hdr_spam, $hdr_nonspam,
      soratio ($num_spam, $num_nonspam), 0;

  # pct() guards against an empty corpus, which used to die with a
  # division by zero here.
  $hdr_spam = pct ($num_spam, $hdr_all);
  $hdr_nonspam = pct ($num_nonspam, $hdr_all);
  $hdr_all = ($hdr_all == 0) ? 0 : 100.0;     # this is obvious

  printf "%7.3f %7.3f %7.3f %6.2f %6.2f (all messages as %%)\n",
      $hdr_all, $hdr_spam, $hdr_nonspam,
      soratio ($num_spam, $num_nonspam), 0;

} elsif ($opt_x) {
  printf "%7s %7s %7s %6s %6s %s\n",
      "OVERALL", "SPAM", "NONSPAM", "S/O", "SCORE", "NAME";
  printf "%7d %7d %7d %6.2f %6.2f (all messages)\n",
      $hdr_all, $hdr_spam, $hdr_nonspam,
      soratio ($num_spam, $num_nonspam), 0;

} else {
  printf "%10s %10s %10s %s\n",
      "OVERALL", "SPAM", "NONSPAM", "NAME";
  printf "%10d %10d %10d (all messages)\n",
      $hdr_all, $hdr_spam, $hdr_nonspam;
}

# Pass 1: collect each rule once (skipping "__" sub-rules) and compute the
# S/O ratio and the ranking used to order the report.
my %done = ();
my @tests = ();
foreach my $test (keys %freq_spam, keys %freq_nonspam) {
  next if ($test =~ /^__/);
  next if $done{$test};
  $done{$test} = 1;
  push (@tests, $test);

  if (!defined $tflags{$test}) {
    $tflags{$test} = '';
  }

  my $fs = $freq_spam{$test} || 0;
  my $fn = $freq_nonspam{$test} || 0;
  my $fsadj = pct ($fs, $num_spam);
  my $fnadj = pct ($fn, $num_nonspam);

  my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);
  my $matched = $fsadj;

  # "nice" rules are meant to hit nonspam, so rank them by the inverse
  # ratio and by their nonspam hit rate.
  if ($tflags{$test} =~ /nice/) {
    $soratio = 1.0 - $soratio;
    $matched = $fnadj;
  }

  $ranking{$test} = (($soratio * 100) ** 8) * $matched;
}

# Pass 2: print one line per rule, best-ranked first.  Ties are broken by
# rule name so the output order is stable from run to run.
foreach my $test (sort { $ranking{$b} <=> $ranking{$a} || $a cmp $b } @tests) {
  my $fs = $freq_spam{$test} || 0;
  my $fn = $freq_nonspam{$test} || 0;
  my $fa = $fs + $fn;

  next if ($opt_m && $test !~ m/$opt_m/);   # match certain tests
  next if ($tflags{$test} =~ /net/);        # ignore net tests

  # adjust based on corpora sizes (and cvt to % while we're at it)
  my $fsadj = pct ($fs, $num_spam);
  my $fnadj = pct ($fn, $num_nonspam);

  if ($opt_f && $fsadj == 0 && $fnadj == 0) {
    next;
  }

  if ($opt_p) {
    $fa = pct ($fa, $num_spam + $num_nonspam);
    $fs = $fsadj;
    $fn = $fnadj;
  }

  # Normally set in pass 1; if it is ever missing, compute it and make
  # sure the value actually printed below is defined as well.
  my $soratio = $soratio{$test};
  if (!defined $soratio) {
    $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);
  }

  $score{$test} ||= 1.0;

  if ($opt_p) {
    printf "%7.3f %7.3f %7.3f %6.2f %6.2f %s\n",
        $fa, $fs, $fn, $soratio, $score{$test}, $test;
  } elsif ($opt_x) {
    printf "%7d %7d %7d %6.2f %6.2f %s\n",
        $fa, $fs, $fn, $soratio, $score{$test}, $test;
  } else {
    printf "%10d %10d %10d %s\n",
        $fa, $fs, $fn, $test;
  }
}
exit;

# Print the command-line help and terminate the program.
sub usage {
  die "hit-frequencies [-f] [-m RE] [-p] [-x] [spam log] [nonspam log]

    -f    falses. count only false-negative or false-positive matches
    -m RE print rules matching regular expression
    -p    percentages. implies -x
    -x    extended output, with S/O ratio and scores

    if either the spam or and nonspam logs are unspecified, the defaults
    are \"spam.log\" and \"nonspam.log\" in the cwd.

";
}

# Return $part as a percentage of $whole, or 0 when $whole is 0.
sub pct {
  my ($part, $whole) = @_;
  return 0 if ($whole == 0);
  return ($part / $whole) * 100.0;
}

# Read the spam log ($ARGV[0], default "spam.log") and the nonspam log
# ($ARGV[1], default "nonspam.log"), updating $num_spam, $num_nonspam,
# %freq_spam and %freq_nonspam.
#
# Lines starting with "#" and lines that do not match the expected layout
# are skipped.  A matching line has a one-character first field ("Y" means
# the message was caught), an integer field, another field, and then an
# optional comma-separated list of the rules that hit.
#
# With -f only the "false" messages are counted: spam that was not caught
# and nonspam that was caught.
#
# A log that cannot be opened is reported on stderr and skipped.
sub readlogs {
  my $spam = $ARGV[0] || "spam.log";
  my $nonspam = $ARGV[1] || "nonspam.log";

  # Track the class by position rather than by comparing file names, so
  # passing the same file for both logs still classifies the second pass
  # as nonspam.
  foreach my $log ([ $spam, 1 ], [ $nonspam, 0 ]) {
    my ($file, $isspam) = @$log;

    my $fh;
    unless (open ($fh, '<', $file)) {
      warn "hit-frequencies: cannot open $file: $!\n";
      next;
    }

    while (my $line = <$fh>) {
      next if ($line =~ /^#/);
      $line =~ /^(.)\s+(-?\d+)\s+(\S+)\s*(\S*)/ or next;
      my $caught = ($1 eq 'Y');
      my $rules = $4;
      $rules =~ s/,,+/,/g;

      # Under -f, skip correctly classified messages entirely.
      next if ($opt_f && ($isspam ? $caught : !$caught));

      if ($isspam) {
        $num_spam++;
      } else {
        $num_nonspam++;
      }

      foreach my $t (split (/,/, $rules)) {
        next if ($t eq '');
        if ($isspam) {
          $freq_spam{$t}++;
        } else {
          $freq_nonspam{$t}++;
        }
      }
    }
    close $fh;
  }
}

# Read every ../rules/[0-9]*.cf file and record, per rule name:
#   - a zero frequency entry for each defined header/rawbody/body/full/uri
#     rule, so rules that never hit still appear in the report;
#   - the rule's tflags (from "tflags NAME ..." lines);
#   - the rule's score (from "score NAME ..." lines, default 1.0).
# "#" comments and surrounding whitespace are stripped before matching.
#
# A rules file that cannot be opened is reported on stderr and skipped.
sub readscores {
  foreach my $file (glob ("../rules/[0-9]*.cf")) {
    my $fh;
    unless (open ($fh, '<', $file)) {
      warn "hit-frequencies: cannot open $file: $!\n";
      next;
    }

    while (my $line = <$fh>) {
      $line =~ s/#.*$//g;
      $line =~ s/^\s+//;
      $line =~ s/\s+$//;

      if ($line =~ /^(header|rawbody|body|full|uri)\s+(\S+)\s+/) {
        my $name = $2;
        $freq_spam{$name} ||= 0;
        $freq_nonspam{$name} ||= 0;
        $tflags{$name} ||= '';
        $score{$name} ||= 1.0;
      } elsif ($line =~ /^tflags\s+(\S+)\s+(.+)$/) {
        $tflags{$1} = $2;
      } elsif ($line =~ /^score\s+(\S+)\s+(.+)$/) {
        $score{$1} = $2;
      }
    }
    close $fh;
  }
}

# Return the S/O ratio s / (s + n) for a spam figure $s and a nonspam
# figure $n, or 0 when their sum is not positive.  Undefined arguments
# are treated as 0.
sub soratio {
  my ($s, $n) = @_;

  $s ||= 0;
  $n ||= 0;

  if ($s + $n > 0) {
    return $s / ($s + $n);
  } else {
    return 0;
  }
}