#!/usr/bin/perl -w # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # use strict; use FindBin; use Getopt::Std; getopts("fm:M:X:l:L:pxhc:at:s:ioTSdP"); use vars qw { $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c $opt_a $opt_t $opt_s $opt_i $sorting $opt_o $opt_T $opt_S $opt_X $opt_d $opt_P }; # as per http://wiki.apache.org/spamassassin/RulesProjPromotion, for -P my $promote_so_min = 0.95; my $promote_hitrate_min = 0.02; my $promote_fprate_max = 1.00; sub usage { die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC] [-s SC] [-a] [-p] [-x] [-i] [-T] [-S] [-o] [-d] [spam log] [ham log] -c p use p as the rules directory -f falses. count only false-negative or false-positive matches -m RE print rules matching regular expression -t RE print rules with tflags matching regular expression -M RE only consider log entries matching regular expression -X RE don't consider log entries matching regular expression -l LC also print language specific rules for lang code LC (or 'all') -L LC only print language specific rules for lang code LC (or 'all') -a display all tests -p percentages. implies -x -x extended output, with S/O ratio and scores -s SC which scoreset to use -i use IG (information gain) for ranking -T display rule times. implies -x, -p -o display hit overlaps against all other rules -S display score-map of hits -P flag which rules pass the promotion criteria -d XML output. conflicts with -x, -p options -l and -L are mutually exclusive. options -M and -X are *not* mutually exclusive. if either the spam or and ham logs are unspecified, the defaults are \"spam.log\" and \"ham.log\" in the cwd. "; } usage() if($opt_h || ($opt_l && $opt_L)); usage() if($opt_d && ($opt_x || $opt_p)); if ($opt_p) { $opt_x = 1; } if ($opt_d) { $opt_x = $opt_p = 1; } $opt_s = 0 if ( !defined $opt_s ); my $cffile = $opt_c || "$FindBin::Bin/../rules"; # "our" so that the require'd file can overwrite them my $rules_pl_unparseable; our %rules = (); our %scores = (); my %soratio = (); my %freq_spam = (); my %freq_ham = (); my %hmap_spam = (); my %hmap_ham = (); my %scoremap_spam = (); my %scoremap_ham = (); my %freq = (); my $num_spam = 0; my $num_ham = 0; my %ranking = (); my $ok_lang = ''; my %rule_times = (); readscores($cffile); $ok_lang = lc ($opt_l || $opt_L || ''); if ($ok_lang eq 'all') { $ok_lang = '.'; } if ($opt_t && $rules_pl_unparseable) { die "-t requires rules.pl to be parseable"; } foreach my $key (keys %rules) { if ( ($opt_L && !$rules{$key}->{lang}) || ($rules{$key}->{lang} && (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/io) ) ) { delete $rules{$key} ; next; } $freq_spam{$key} = 0; $freq_ham{$key} = 0; if ($opt_o) { $hmap_spam{$key} = ''; $hmap_ham{$key} = ''; } } readlogs(); my $hdr_all = $num_spam + $num_ham; my $hdr_spam = $num_spam; my $hdr_ham = $num_ham; my $sorting = $opt_i ? "IG" : "RANK"; if ($opt_d) { $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements $hdr_spam = ($num_spam / $hdr_all) * 100.0; $hdr_ham = ($num_ham / $hdr_all) * 100.0; $opt_P = 1; print qq{ $num_spam $num_spam $hdr_spam $hdr_spam }; } elsif ($opt_p) { printf "%7s %7s %7s %6s %6s %6s %s\n", "MSECS", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%", "S/O", $sorting, "SCORE", "NAME"; printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", 0, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements $hdr_spam = ($num_spam / $hdr_all) * 100.0; $hdr_ham = ($num_ham / $hdr_all) * 100.0; $hdr_all = 100.0; # this is obvious printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", 0, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; } elsif ($opt_p) { printf "%8s %7s %7s %6s %6s %6s %s\n", "OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%", "S/O", $sorting, "SCORE", "NAME"; printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", $hdr_all, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements $hdr_spam = ($num_spam / $hdr_all) * 100.0; $hdr_ham = ($num_ham / $hdr_all) * 100.0; $hdr_all = 100.0; # this is obvious printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", $hdr_all, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; } elsif ($opt_x) { printf "%7s %7s %7s %6s %6s %6s %s\n", "OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME"; printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", $hdr_all, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; } else { printf "%10s %10s %10s %s\n", "OVERALL", "SPAM", "HAM", "NAME"; printf "%10d %10d %10d (all messages)\n", $hdr_all, $hdr_spam, $hdr_ham; } my %done = (); my @tests = (); my $rank_hi = 0; my $rank_lo = 9999999; # variables for wanted/unwanted RANK my %wanted; my %unwanted; my %isnice; my %wranks; my %uranks; # rules that we want to look at $freq{$_}++ for keys %freq_ham; $freq{$_}++ for keys %freq_spam; my $test; foreach $test (keys %freq) { my $parsed_rules_entry = $rules{$test}; # do not require 'tmp/rules.pl' to have been built from the # exact same ruleset version; this assumption screws up nightly # mass-check reports if they are generated with a different SVN rev # next unless (exists $rules{$test}); next if (!$opt_a && $test =~ /^__/); next if $done{$test}; $done{$test} = 1; push (@tests, $test); my $isnice = 0; if ($parsed_rules_entry) { if ($parsed_rules_entry->{tflags} && $parsed_rules_entry->{tflags} =~ /\bnice\b/) { $isnice = 1; } } $isnice{$test} = $isnice; my $fs = $freq_spam{$test}; $fs ||= 0; my $fn = $freq_ham{$test}; $fn ||= 0; my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj); if ($isnice) { $soratio = 1.0 - $soratio; my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp; } if ($opt_i) { # come up with a ranking my $rank; # IG system: from "Learning to Filter Unsolicited Commercial E-Mail", # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the # Boolean attributes (ie. the rules). Measures "the average reduction in # the entropy of C (classification) given the value of X (the rule)". Makes # a good ranking measure with a proper statistical basis. ;) # # Still would like to get an entropy measure in, too. # # sum P(X = x ^ C = c) # IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- ) # c in [Ch, Cs] P(X = x) . P(C = c) # my $safe_nspam = $num_spam || 0.0000001; my $safe_nham = $num_ham || 0.0000001; my $num_all = ($num_spam + $num_ham); my $safe_all = $num_all || 0.0000001; my $f_all = $fs+$fn; my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0) my $px1 = ($f_all / $safe_all); # P(X = 1) my $pccs = ($num_spam / $safe_all); # P(C = Cs) my $pcch = ($num_ham / $safe_all); # P(C = Ch) my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs) my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch) my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs) my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch) my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001; my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001; my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001; my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001; sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931... my $safe_px0ccs = ($px0ccs || 0.0000001); my $safe_px0cch = ($px0cch || 0.0000001); my $safe_px1ccs = ($px1ccs || 0.0000001); my $safe_px1cch = ($px1cch || 0.0000001); $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) + ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) + ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) + ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) ); $ranking{$test} = $rank; $rank_hi = $rank if ($rank > $rank_hi); $rank_lo = $rank if ($rank < $rank_lo); } else { # RANK: basic wanted/unwanted ranking # # The rank of each test based on two ranks: (1) the number of wanted # hits and (2) the number of unwanted hits. Each test is ranked # positionally for both its wanted and unwanted hits (ties are # allowed) and the two ranks are normalized to have the same range. # Those two ranks are added together, producing a single RANK number # that is then normalized to [0, 1]. The result is equivalent to: # # RANK(rule) = (percentile(wanted) + percentile(unwanted))/2 # $wanted{$test} = $isnice ? $fn : $fs; $unwanted{$test} = $isnice ? $fs : $fn; # count number of ranks of each type $wranks{$wanted{$test}} = 1; $uranks{$unwanted{$test}} = 1; } } # finish basic wanted/unwanted ranking if (! $opt_i) { my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted; my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted; # first half of ranking is the wanted rank my $position = 0; my $last = undef; for my $test (@wanted) { $position++ if defined $last && $last != $wanted{$test}; $ranking{$test} += $position; $last = $wanted{$test} } # second half of ranking is the unwanted rank my $normalize = (scalar keys %wranks) / (scalar keys %uranks || 0.001); $position = 0; $last = undef; for my $test (@unwanted) { $position++ if defined $last && $last != $unwanted{$test}; $ranking{$test} += ($position * $normalize); $last = $unwanted{$test}; $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi); $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo); } } { # now normalise the rankings to [0, 1] $rank_hi -= $rank_lo; foreach my $test (@tests) { $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi); } } if ($opt_T) { read_timings(); } foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) { my $parsed_rules_entry = $rules{$test}; # do not require 'tmp/rules.pl' to have been built from the # exact same ruleset version; this assumption screws up nightly # mass-check reports if they are generated with a different SVN rev # next unless (exists $rules{$test}); next if (!$opt_a && $test =~ /^__/); my $fs = $freq_spam{$test}; $fs ||= 0; my $fn = $freq_ham{$test}; $fn ||= 0; my $fa = $fs+$fn; my $num_fs = $fs; my $num_fn = $fn; my $num_fa = $fa; my $tflags = ''; if ($parsed_rules_entry) { $tflags = $parsed_rules_entry->{tflags}; } # match certain tests next if ($opt_m && $test !~ m/$opt_m/); # match tflags next if ($opt_t && (!$tflags || $tflags !~ /$opt_t/)); if (!$opt_a && !$opt_t && $tflags) { # not net tests next if ($tflags =~ /\bnet\b/ && ($opt_s % 2 == 0)); # not userconf # Jul 13 2005 jm: removed. this blocks SPF_PASS from showing up! # why should userconf rules not be visible in freqs output? # next if ($tflags =~ /\buserconf\b/); } # adjust based on corpora sizes (and cvt to % while we're at it) my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; } if ($opt_p) { my $denom = ($num_spam + $num_ham) || 0.000001; # avoid / by 0 $fa = ($fa / $denom) * 100.0; $fs = $fsadj; $fn = $fnadj; } my $soratio = $soratio{$test}; if (!defined $soratio) { $soratio{$test} = soratio ($fsadj, $fnadj); } my $promotable; if ($opt_P) { $promotable = 1; if ($isnice{$test}) { if (($soratio{$test} > (1.0 - $promote_so_min)) || ($fn < $promote_hitrate_min) || ($fs >= $promote_fprate_max)) { $promotable = 0; } } else { if (($soratio{$test} < $promote_so_min) || ($fs < $promote_hitrate_min) || ($fn >= $promote_fprate_max)) { $promotable = 0; } } } my $promotable_str = $opt_P ? ($promotable ? '+ ' : '- ') : ''; if ($opt_d) { print qq{ }.($rule_times{$test}||0).qq{ $num_fa $num_fs $num_fn }.sprintf("%.5f", $fa).qq{ }.sprintf("%.5f", $fs).qq{ }.sprintf("%.5f", $fn).qq{ }.sprintf("%.8f", $soratio).qq{ }.sprintf("%.8f", $ranking{$test}).qq{ }.($scores{$test}||0).qq{ $promotable $test }; } elsif ($opt_T) { printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f %s%s\n", $rule_times{$test}||0, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, $promotable_str, $test; } elsif ($opt_p) { printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s%s\n", $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, $promotable_str, $test; } elsif ($opt_x) { printf "%7d %7d %7d %7.3f %6.2f %6.2f %s%s\n", $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, $promotable_str, $test; } else { printf "%10d %10d %10d %s\n", $fa, $fs, $fn, $test; } if ($opt_S) { _print_scoremap("ham", $scoremap_ham{$test}); _print_scoremap("spam", $scoremap_spam{$test}); } if ($opt_o) { compute_overlaps_for_rule($test); } if ($opt_d) { print qq{ }; } } if ($opt_d) { print qq{ }; } exit; sub _print_scoremap { my ($name, $smap) = @_; if ($opt_d) { print qq{ }; } $smap ||= { }; my @scores = (sort { $a <=> $b } keys %{$smap}); my $total = 0; foreach my $score (@scores) { $total += $smap->{$score}; } foreach my $score (@scores) { my $num = $smap->{$score}; my $pc = sprintf("%.4f", ($num / ($total||0.0001)) * 100); if ($opt_d) { print qq{ }; } else { printf " scoremap %4s: %2d %6.2f%% %4d %s\n", $name, $score, $pc, $num, _scoremap_graph($pc); } } if ($opt_d) { print qq{ }; } else { print "\n"; } } sub _scoremap_graph { my ($pc) = @_; return '*' x ($pc * (40/100)); } sub readlogs { my $spam = $ARGV[0] || "spam.log"; my $ham = $ARGV[1] || "ham.log"; foreach my $file ($spam, $ham) { open (IN, "<$file") || die "Could not open file '$file': $!"; my $isspam = ($file eq $spam); my $caught; my $restofline; my $rules; my $score; # this is very speed-sensitive code. remove all possible # conditionals using an eval('..'). my $evalstr = ' while () { '; if ($opt_M) { $evalstr .= ' next unless /$opt_M/o; '; } if ($opt_X) { $evalstr .= ' next if /$opt_X/o; '; } # note: doing the match with a regexp shaves off no less than # 7 opcodes. nice! # the additional split() is for this case: # ". -20 /path time=1112116980,scantime=0,format=f,reuse=no" # in other words, no hits. split(' ') cannot deal with this # correctly, seeing (".", "-20", "/path", "time=...etc"). Work # around this by using a literal / / regexp split to discard # the csv stuff we don't want out of the rest of the line. $evalstr .= ' ($caught, $score, $restofline) = split(\' \', $_, 3); next unless ($caught =~ /^[Y\.]$/ && $restofline); (undef, $rules) = split(/ /, $restofline, 3); '; if ($opt_f) { $evalstr .= ' next if (!(($caught eq "Y") xor $isspam)); '; } if ($opt_S) { $evalstr .= ' $score = int $score; '; } my $hmapstr = ''; my $smapstr = ''; if ($isspam) { if ($opt_o) { $hmapstr = ' if (!exists $hmap_spam{$r}) { $hmap_spam{$r} = ""; } vec ($hmap_spam{$r}, $num_spam, 1) = 1; '; } if ($opt_S) { $smapstr = ' $scoremap_spam{$r}{$score}++; '; } $evalstr .= ' $num_spam++; foreach my $r (split(/,/, $rules)) { $freq_spam{$r}++; '.$hmapstr.$smapstr.' } '; } else { if ($opt_o) { $hmapstr = ' if (!exists $hmap_ham{$r}) { $hmap_ham{$r} = ""; } vec ($hmap_ham{$r}, $num_ham, 1) = 1; '; } if ($opt_S) { $smapstr = ' $scoremap_ham{$r}{$score}++; '; } $evalstr .= ' $num_ham++; foreach my $r (split(/,/, $rules)) { $freq_ham{$r}++; '.$hmapstr.$smapstr.' } '; } $evalstr .= ' } '; eval $evalstr; if ($@) { die $@; } close IN; } # paranoia: remove zero length rules delete $freq_spam{''}; delete $hmap_spam{''}; delete $freq_ham{''}; delete $hmap_ham{''}; } sub compute_overlaps_for_rule { my ($r1) = @_; my %overlaps_spam = (); foreach my $r2 (keys %hmap_spam) { next if $r1 eq $r2; my $ratio = _hmap_to_overlap_ratio ($r1, $r2, $hmap_spam{$r1}, $hmap_spam{$r2}, $freq_spam{$r1}, $freq_spam{$r2}); if (exists $overlaps_spam{$ratio}) { $overlaps_spam{$ratio} .= " ".$r2; } else { $overlaps_spam{$ratio} = $r2; } } my %overlaps_ham = (); foreach my $r2 (keys %hmap_ham) { next if $r1 eq $r2; my $ratio = _hmap_to_overlap_ratio ($r1, $r2, $hmap_ham{$r1}, $hmap_ham{$r2}, $freq_ham{$r1}, $freq_ham{$r2}); if (exists $overlaps_ham{$ratio}) { $overlaps_ham{$ratio} .= " ".$r2; } else { $overlaps_ham{$ratio} = $r2; } } _print_overlap_ratios($r1, \%overlaps_spam, "spam"); _print_overlap_ratios($r1, \%overlaps_ham, " ham"); } sub _print_overlap_ratios { my ($r1, $hash, $type) = @_; if ($opt_d) { print qq{ }; } foreach my $full (sort { $b <=> $a } keys %$hash) { my $ratio = $full * 100; last if ($ratio < 20); # 20% cutoff my $rules = _prettify_overlap_rules($r1, $hash->{$full}); next if ($rules eq ''); if ($opt_d) { print qq{ $rules }; } else { printf " overlap %s: %3d%% %s\n", $type, $ratio, $rules; } } if ($opt_d) { print qq{ {type} eq 'meta') { # ignore meta-subrules that match the rule they make up. # TODO: this is simplistic; it doesn't look to see if those subrules # are in turn meta rules with further subrules that should be ignored. # but it works well enough... my $code = $rules{$rule}->{code}; @rules = grep { $code !~ /\b\Q$_\E\b/; } @rules; } return join (' ', @rules); } sub _hmap_to_overlap_ratio { my ($r1, $r2, $hmap1, $hmap2, $freq1, $freq2) = @_; $hmap1 ||= ''; $hmap2 ||= ''; if ($hmap1 !~ /[^\000]/ && $hmap2 !~ /[^\000]/) { # no hits on either! this would normally give a 100% hitrate match, # but that's misleading -- so hide it by giving it a 0% overlap return 0; } #my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n"; # ah, nifty. this could have been very slow, but with perl's support # for bitstring ops, we get C speed. yay! my $both_bits = $hmap1 | $hmap2; my $diff_bits = $hmap1 ^ $hmap2; # now, count the set bits in both bitstrings. thank you unpack()! my $both_count = unpack("%32b*", $both_bits); my $diff_count = unpack("%32b*", $diff_bits); if ($both_count == 0) { warn "oops! no hits on either but didn't get caught in RE"; return 0; } return ($both_count - $diff_count) / $both_count; } sub readscores { my($cffile) = @_; if (system ("$FindBin::Bin/../build/parse-rules-for-masses -d \"$cffile\" -s $opt_s")) { warn "parse-rules-for-masses failed!"; } eval { require "./tmp/rules.pl"; }; if ($@) { warn "tmp/rules.pl is unparseable: $@"; $rules_pl_unparseable = 1; # but carry on anyway (for most uses) } else { $rules_pl_unparseable = 0; } } sub soratio { my ($s, $n) = @_; $s ||= 0; $n ||= 0; if ($s + $n > 0) { return $s / ($s + $n); } else { return 0.5; # no results -> not effective } } sub read_timings { if (!open (IN, "; if ($ver !~ /^v1/) { warn "hit-frequencies: unknown version in 'timing.log', timings will be 0"; close IN; return; } while () { if (/^T\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) { my ($name, $duration, $max, $runs) = ($1,$2,$3,$4); $rule_times{$name} = ($duration / ($runs||0.00001)) * 1000; } } close IN; }