#!/usr/bin/perl -w # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # use strict; use warnings; use FindBin; use Getopt::Long qw(:config bundling auto_help); use Pod::Usage; use vars qw { $opt_c $opt_s $opt_f $opt_a $opt_p $opt_x $opt_m $opt_t $opt_M $opt_X $opt_L $opt_l $opt_i $opt_T $opt_o $opt_S $opt_P $opt_d $opt_g $sorting }; GetOptions("c|cffile=s" => \$opt_c, "s|scoreset=i" => \$opt_s, # ,, pacify stupid emacs cperl mode "f|falses" => \$opt_f, "a|all" => \$opt_a, "p|percentages" => \$opt_p, "x|extended" => \$opt_x, "m|matchrules=s" => \$opt_m, "t|tflags=s" => \$opt_t, "M|matchlogs=s" => \$opt_M, "X|excludelogs=s" => \$opt_X, "L|onlylanguage=s" => \$opt_L, "l|alsolanguage=s" => \$opt_l, "i|ig" => \$opt_i, "T|times" => \$opt_T, "o|overlaps" => \$opt_o, "S|scoremap" => \$opt_S, "P|promotion" => \$opt_P, "d|xml" => \$opt_d, "g|generated" => \$opt_g ); =head1 NAME hit-frequencies - Display statistics about tests hit by a mass-check run =head1 SYNOPSIS hit-frequencies [options] Options: -c,--cffile=path Use path as the rules directory -s,--scoreset=n Use scoreset n -f,--falses Count only false-positives/false-negatives -a,--all Report all tests (including subrules) -p,--percentages Report percentages instead of raw hits (implies -x) -x,--extended "Extended" output, include RANK, S/O and SCORE -m,--matchrules=re Print rules matching the regular expression -t,--tflags=re Print only rules with tflags matching the regular expression -M,--matchlogs=re Consider only logs matching the regular expression -X,--excludelogs=re Exclude logs matching this regular expression -L,--onlylanguage=lc Only print language specific tests for specified lang code (try 'all') -l,--alsolanguage=lc Also print language specific tests for specified lang code (try 'all') -i,--ig Use IG (information gain) for ranking -T,--times Display rule times (implies -x, -p) -o,--overlaps Display hit overlaps against all other rules -S,--scoremap Display score-map of hits -P,--promotion Flag rules that meet the promotion criteria -d,--XML XML output (conflicts with -x, -p) -g,--generated Include generated nightly scores for sandbox rules =head1 DESCRIPTION B will read the mass-check logs F and F or the logs given on the command line. The output will contain a summary of the number of ham and spam messages and detailed statistics for each rule. The output will include the following columns: =over 4 =item OVERALL Number of times (or percentage with B<-p>) the rule hit on all messages (spam or ham). =item SPAM Number of times (or percentage with B<-p>) the rule hit on spam messages. =item HAM Number of times (or percentage with B<-p>) the rule hit on ham messages. =item FPOS =item FNEG Shown only with B<-f>, these refer to the number of times (or percentage) the rule hit on messages that were found to be false positives or false negatives. =item S/O Shown only with B<-x> or B<-p>, this is the number of spam hits divided by total number of hits (C refers to spam divided by overall). =item RANK Shown only with B<-x> or B<-p>, and when B<-i> is not used, this is a measure that attempts to indicate how I or I a test is. The higher it is, the better the test. =item IG Shown only with B<-i>, this is another measure that attempts to indicate how I a test is. =item SCORE Shown only with B<-x> or B<-p>, this is the current score assigned to the rule. If B<-g> is used, the scores generated for sandbox rules from nightly masscheck results (72_scores.cf) will be included as well. =item NAME This is the rule's name. =back =head1 BUGS Please report bugs to http://bugzilla.spamassassin.org/ =head1 SEE ALSO L, L =cut if ($opt_l && $opt_L) { pod2usage("-L/--alsolanguage and -l/--onlylanguage are mutually exclusive"); } if ($opt_d && ($opt_x || $opt_p)) { pod2usage("-d/--xml conflicts with -x/--extended and -p/--percentages"); } $opt_s = 0 if ( !defined $opt_s ); if ($opt_p) { $opt_x = 1; } if ($opt_d || $opt_T) { $opt_x = $opt_p = 1; } # as per http://wiki.apache.org/spamassassin/RulesProjPromotion, for -P my $promote_so_min = 0.95; my $promote_hitrate_min = 0.02; my $promote_fprate_max = 1.00; my $cffile = $opt_c || "$FindBin::Bin/../rules"; # "our" so that the require'd file can overwrite them my $rules_pl_unparseable; our %rules = (); our %scores = (); my %soratio = (); my %freq_spam = (); my %freq_ham = (); my %hmap_spam = (); my %hmap_ham = (); my %scoremap_spam = (); my %scoremap_ham = (); my %freq = (); my $num_spam = 0; my $num_ham = 0; my %ranking = (); my $ok_lang = ''; my %meta_subrule_pairs = (); my %rule_times = (); readscores($cffile); $ok_lang = lc ($opt_l || $opt_L || ''); if ($ok_lang eq 'all') { $ok_lang = '.'; } if (($opt_t || $opt_o) && $rules_pl_unparseable) { die "-t/-o require rules.pl to be parseable"; } foreach my $key (keys %rules) { if ($key eq '_scoreset') { delete $rules{$key}; # bug 5683 next; } if ( ($opt_L && !$rules{$key}->{lang}) || ($rules{$key}->{lang} && (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/io) ) ) { delete $rules{$key} ; next; } $freq_spam{$key} = 0; $freq_ham{$key} = 0; if ($opt_o) { $hmap_spam{$key} = ''; $hmap_ham{$key} = ''; } } readlogs(); my $hdr_all = $num_spam + $num_ham; my $hdr_spam = $num_spam; my $hdr_ham = $num_ham; my $sorting = $opt_i ? "IG" : "RANK"; if ($opt_d) { $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements $hdr_spam = ($num_spam / $hdr_all) * 100.0; $hdr_ham = ($num_ham / $hdr_all) * 100.0; $opt_P = 1; print qq{ $num_spam $num_spam $hdr_spam $hdr_spam }; } elsif ($opt_p) { printf "%7s %7s %7s %6s %6s %6s %s\n", $opt_T?"MSECS":"OVERALL", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%", "S/O", $sorting, "SCORE", "NAME"; printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", 0, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements $hdr_spam = ($num_spam / $hdr_all) * 100.0; $hdr_ham = ($num_ham / $hdr_all) * 100.0; $hdr_all = 100.0; # this is obvious printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", 0, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; } elsif ($opt_p) { printf "%8s %7s %7s %6s %6s %6s %s\n", "OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%", "S/O", $sorting, "SCORE", "NAME"; printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", $hdr_all, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements $hdr_spam = ($num_spam / $hdr_all) * 100.0; $hdr_ham = ($num_ham / $hdr_all) * 100.0; $hdr_all = 100.0; # this is obvious printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", $hdr_all, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; } elsif ($opt_x) { printf "%7s %7s %7s %6s %6s %6s %s\n", "OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPOS%":"HAM%", "S/O", $sorting, "SCORE", "NAME"; printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", $hdr_all, $hdr_spam, $hdr_ham, soratio ($num_spam,$num_ham), 0, 0; } else { printf "%10s %10s %10s %s\n", "OVERALL", $opt_f?"FNEG":"SPAM", $opt_f?"FPO":"HAM", "NAME"; printf "%10d %10d %10d (all messages)\n", $hdr_all, $hdr_spam, $hdr_ham; } my %done = (); my @tests = (); my $rank_hi = 0; my $rank_lo = 9999999; # variables for wanted/unwanted RANK my %wanted; my %unwanted; my %isnice; my %wranks; my %uranks; # rules that we want to look at $freq{$_}++ for keys %freq_ham; $freq{$_}++ for keys %freq_spam; # if a rule exists in the config, we want it in the output too $freq{$_}++ for keys %rules; my $test; foreach $test (keys %freq) { my $parsed_rules_entry = $rules{$test}; # do not require 'tmp/rules.pl' to have been built from the # exact same ruleset version; this assumption screws up nightly # mass-check reports if they are generated with a different SVN rev # next unless (exists $rules{$test}); next if (!$opt_a && $test =~ /^__/); next if $done{$test}; $done{$test} = 1; push (@tests, $test); my $isnice = 0; if ($parsed_rules_entry) { if ($parsed_rules_entry->{tflags} && $parsed_rules_entry->{tflags} =~ /\bnice\b/) { $isnice = 1; } } $isnice{$test} = $isnice; my $fs = $freq_spam{$test}; $fs ||= 0; my $fn = $freq_ham{$test}; $fn ||= 0; my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj); if ($isnice) { $soratio = 1.0 - $soratio; my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp; } if ($opt_i) { # come up with a ranking my $rank; # IG system: from "Learning to Filter Unsolicited Commercial E-Mail", # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the # Boolean attributes (ie. the rules). Measures "the average reduction in # the entropy of C (classification) given the value of X (the rule)". Makes # a good ranking measure with a proper statistical basis. ;) # # Still would like to get an entropy measure in, too. # # sum P(X = x ^ C = c) # IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- ) # c in [Ch, Cs] P(X = x) . P(C = c) # my $safe_nspam = $num_spam || 0.0000001; my $safe_nham = $num_ham || 0.0000001; my $num_all = ($num_spam + $num_ham); my $safe_all = $num_all || 0.0000001; my $f_all = $fs+$fn; my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0) my $px1 = ($f_all / $safe_all); # P(X = 1) my $pccs = ($num_spam / $safe_all); # P(C = Cs) my $pcch = ($num_ham / $safe_all); # P(C = Ch) my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs) my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch) my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs) my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch) my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001; my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001; my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001; my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001; sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931... my $safe_px0ccs = ($px0ccs || 0.0000001); my $safe_px0cch = ($px0cch || 0.0000001); my $safe_px1ccs = ($px1ccs || 0.0000001); my $safe_px1cch = ($px1cch || 0.0000001); $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) + ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) + ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) + ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) ); $ranking{$test} = $rank; $rank_hi = $rank if ($rank > $rank_hi); $rank_lo = $rank if ($rank < $rank_lo); } else { # RANK: basic wanted/unwanted ranking # # The rank of each test based on two ranks: (1) the number of wanted # hits and (2) the number of unwanted hits. Each test is ranked # positionally for both its wanted and unwanted hits (ties are # allowed) and the two ranks are normalized to have the same range. # Those two ranks are added together, producing a single RANK number # that is then normalized to [0, 1]. The result is equivalent to: # # RANK(rule) = (percentile(wanted) + percentile(unwanted))/2 # $wanted{$test} = $isnice ? $fn : $fs; $unwanted{$test} = $isnice ? $fs : $fn; # count number of ranks of each type $wranks{$wanted{$test}} = 1; $uranks{$unwanted{$test}} = 1; } } # finish basic wanted/unwanted ranking if (! $opt_i) { my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted; my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted; # first half of ranking is the wanted rank my $position = 0; my $last = undef; for my $test (@wanted) { $position++ if defined $last && $last != $wanted{$test}; $ranking{$test} += $position; $last = $wanted{$test} } # second half of ranking is the unwanted rank my $normalize = (scalar keys %wranks) / (scalar keys %uranks || 0.001); $position = 0; $last = undef; for my $test (@unwanted) { $position++ if defined $last && $last != $unwanted{$test}; $ranking{$test} += ($position * $normalize); $last = $unwanted{$test}; $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi); $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo); } } { # now normalise the rankings to [0, 1] $rank_hi -= $rank_lo; foreach my $test (@tests) { $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi); } } if ($opt_T) { read_timings(); } foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) { my $parsed_rules_entry = $rules{$test}; # do not require 'tmp/rules.pl' to have been built from the # exact same ruleset version; this assumption screws up nightly # mass-check reports if they are generated with a different SVN rev # next unless (exists $rules{$test}); next if (!$opt_a && $test =~ /^__/); my $fs = $freq_spam{$test}; $fs ||= 0; my $fn = $freq_ham{$test}; $fn ||= 0; my $fa = $fs+$fn; my $num_fs = $fs; my $num_fn = $fn; my $num_fa = $fa; my $tflags = ''; if ($parsed_rules_entry) { $tflags = $parsed_rules_entry->{tflags}; } # match certain tests next if ($opt_m && $test !~ m/$opt_m/); # match tflags next if ($opt_t && (!$tflags || $tflags !~ /$opt_t/)); if (!$opt_a && !$opt_t && $tflags) { # not net tests next if ($tflags =~ /\bnet\b/ && ($opt_s % 2 == 0)); # not userconf # Jul 13 2005 jm: removed. this blocks SPF_PASS from showing up! # why should userconf rules not be visible in freqs output? # next if ($tflags =~ /\buserconf\b/); } # adjust based on corpora sizes (and cvt to % while we're at it) my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; } if ($opt_p) { my $denom = ($num_spam + $num_ham) || 0.000001; # avoid / by 0 $fa = ($fa / $denom) * 100.0; $fs = $fsadj; $fn = $fnadj; } my $soratio = $soratio{$test}; if (!defined $soratio) { $soratio{$test} = soratio ($fsadj, $fnadj); } my $promotable; if ($opt_P) { $promotable = 1; if ($isnice{$test}) { if (($soratio{$test} > (1.0 - $promote_so_min)) || ($fn < $promote_hitrate_min) || ($fs >= $promote_fprate_max)) { $promotable = 0; } } else { if (($soratio{$test} < $promote_so_min) || ($fs < $promote_hitrate_min) || ($fn >= $promote_fprate_max)) { $promotable = 0; } } } my $promotable_str = $opt_P ? ($promotable ? '+ ' : '- ') : ''; if ($opt_d) { print qq{ $num_fa $num_fs $num_fn }.sprintf("%.5f", $fa).qq{ }.sprintf("%.5f", $fs).qq{ }.sprintf("%.5f", $fn).qq{ }.sprintf("%.8f", $soratio).qq{ }.sprintf("%.8f", $ranking{$test}).qq{ }.($scores{$test}||0).qq{ $promotable $test }; } elsif ($opt_T) { printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f %s%s\n", $rule_times{$test}||0, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, $promotable_str, $test; } elsif ($opt_p) { printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s%s\n", $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, $promotable_str, $test; } elsif ($opt_x) { printf "%7d %7d %7d %7.3f %6.2f %6.2f %s%s\n", $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, $promotable_str, $test; } else { printf "%10d %10d %10d %s\n", $fa, $fs, $fn, $test; } if ($opt_S) { _print_scoremap("ham", $scoremap_ham{$test}); _print_scoremap("spam", $scoremap_spam{$test}); } if ($opt_o) { compute_overlaps_for_rule($test); } if ($opt_d) { print qq{ }; } } if ($opt_d) { print qq{ }; } exit; sub _print_scoremap { my ($name, $smap) = @_; if ($opt_d) { print qq{ }; } $smap ||= { }; my @scores = (sort { $a <=> $b } keys %{$smap}); my $total = 0; foreach my $score (@scores) { $total += $smap->{$score}; } foreach my $score (@scores) { my $num = $smap->{$score}; my $pc = sprintf("%.4f", ($num / ($total||0.0001)) * 100); if ($opt_d) { print qq{ }; } else { printf " scoremap %4s: %2d %6.2f%% %4d %s\n", $name, $score, $pc, $num, _scoremap_graph($pc); } } if ($opt_d) { print qq{ }; } else { print "\n"; } } sub _scoremap_graph { my ($pc) = @_; return '*' x ($pc * (40/100)); } sub readlogs { my $spam = $ARGV[0] || "spam.log"; my $ham = $ARGV[1] || "ham.log"; foreach my $file ($spam, $ham) { open (IN, "<$file") || die "Could not open file '$file': $!"; my $isspam = ($file eq $spam); my $caught; my $restofline; my $rules; my $score; # this is very speed-sensitive code. remove all possible # conditionals using an eval('..'). my $evalstr = ' while () { '; if ($opt_M) { $evalstr .= ' next unless /$opt_M/o; '; } if ($opt_X) { $evalstr .= ' next if /$opt_X/o; '; } # note: doing the match with a regexp shaves off no less than # 7 opcodes. nice! # the additional split() is for this case: # ". -20 /path time=1112116980,scantime=0,format=f,reuse=no" # in other words, no hits. split(' ') cannot deal with this # correctly, seeing (".", "-20", "/path", "time=...etc"). Work # around this by using a literal / / regexp split to discard # the csv stuff we don't want out of the rest of the line. $evalstr .= ' ($caught, $score, $restofline) = split(\' \', $_, 3); next unless ($caught =~ /^[Y\.]$/ && $restofline); (undef, $rules) = split(/ /, $restofline, 3); my %freq_mesg = (); '; if ($opt_f) { $evalstr .= ' next if (!(($caught eq "Y") xor $isspam)); '; } if ($opt_S) { $evalstr .= ' $score = int $score; '; } my $hmapstr = ''; my $smapstr = ''; if ($isspam) { if ($opt_o) { $hmapstr = ' if (!exists $hmap_spam{$r}) { $hmap_spam{$r} = ""; } vec ($hmap_spam{$r}, $num_spam, 1) = 1; '; } if ($opt_S) { $smapstr = ' $scoremap_spam{$r}{$score}++; '; } $evalstr .= ' foreach my $r (split(/,/, $rules)) { $freq_spam{$r}++ unless $freq_mesg{$r}++; '.$hmapstr.$smapstr.' } $num_spam++; '; } else { if ($opt_o) { $hmapstr = ' if (!exists $hmap_ham{$r}) { $hmap_ham{$r} = ""; } vec ($hmap_ham{$r}, $num_ham, 1) = 1; '; } if ($opt_S) { $smapstr = ' $scoremap_ham{$r}{$score}++; '; } $evalstr .= ' foreach my $r (split(/,/, $rules)) { $freq_ham{$r}++ unless $freq_mesg{$r}++; '.$hmapstr.$smapstr.' } $num_ham++; '; } $evalstr .= ' } '; # warn "JMD $evalstr"; eval $evalstr; if ($@) { die $@; } close IN; } # paranoia: remove zero length rules delete $freq_spam{''}; delete $hmap_spam{''}; delete $freq_ham{''}; delete $hmap_ham{''}; } sub compute_overlaps_for_rule { my ($r1) = @_; my %overlaps_ham1 = (); my %overlaps_spam1 = (); my %overlaps_ham2 = (); my %overlaps_spam2 = (); my %overlaps_ham1r = (); my %overlaps_spam1r = (); foreach my $r2 (keys %hmap_spam) { next if $r1 eq $r2; # require that both rules have at least 1 hit next unless ($freq_spam{$r1} && $freq_spam{$r2}); my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1, $hmap_spam{$r2}, $hmap_spam{$r1}); if ($a1ina2 > 0) { $overlaps_spam1r{$r2} = $a1ina2; if (exists $overlaps_spam1{$a1ina2}) { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; } else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; } if (exists $overlaps_spam2{$a2ina1}) { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; } else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; } } } foreach my $r2 (keys %hmap_ham) { next if $r1 eq $r2; # require that both rules have at least 1 hit next unless ($freq_ham{$r1} && $freq_ham{$r2}); my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2, $hmap_ham{$r2}, $hmap_ham{$r1}); if ($a1ina2 > 0) { $overlaps_ham1r{$r2} = $a1ina2; if (exists $overlaps_ham1{$a1ina2}) { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; } else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; } if (exists $overlaps_ham2{$a2ina1}) { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; } else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; } } } _print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam", \%overlaps_ham1r, "ham"); _print_overlap_ratios($r1, \%overlaps_ham1, \%overlaps_ham2, "ham", \%overlaps_spam1r, "spam"); } sub _print_overlap_ratios { my ($r1, $hash1, $hash2, $type, $hash_other_type, $other_type) = @_; return unless defined $r1; if ($opt_d) { print qq{ }; } my %other_type_rules = %$hash_other_type; foreach my $ratio (sort { $b <=> $a } keys %$hash1) { $ratio ||= 0; last if ($ratio < 20); # 20% cutoff my $rules = _prettify_overlap_rules($r1, $hash1->{$ratio}); next if ($rules eq ''); foreach my $r2 (split(' ', $rules)) { $r2 =~ s/\[(.*?)\]$//; my $reverse_ratio = $1 || 0; next unless defined $r2; my $is_subrule = ($meta_subrule_pairs{"$r1.$r2"} || $meta_subrule_pairs{"$r2.$r1"}); if ($opt_d) { print qq{ $r2$r1 $r1$r2 }; } else { printf " overlap %4s: %3d%% of %s hits also hit %s; %3d%% of %s hits also hit %s%s%s\n", $type, $ratio, $r1, $r2, $reverse_ratio, $r2, $r1, ($is_subrule ? ' (meta rule and subrule)' : ''), (exists $other_type_rules{$r2} ? " ($other_type $other_type_rules{$r2}%)" : " (no $other_type)") ; } } } if ($opt_d) { print qq{ {type} eq 'meta') { # ignore meta-subrules that match the rule they make up. # TODO: this is simplistic; it doesn't look to see if those subrules # are in turn meta rules with further subrules that should be ignored. # but it works well enough... my $code = $rules{$rule}->{code}; @rules = grep { my $tmp = $_; $tmp =~ s/\[.*\]$//; if ($code =~ /\b\Q${tmp}\E\b/) { $meta_subrule_pairs{"$rule.$tmp"} = 1; 0; } else { 1; } } @rules; } return join (' ', @rules); } sub _hmap_to_overlap_ratio { my ($r1, $r2, $hmap1, $hmap2) = @_; $hmap1 ||= ''; $hmap2 ||= ''; if ($hmap1 !~ /[^\000]/ || $hmap2 !~ /[^\000]/) { # no hits on either! this would normally give a 100% hitrate match, # but that's misleading -- so hide it by giving it a 0% overlap. # # also, ignore cases where there are no hits on *one* of the rules, # while there are hits on the other -- after all, if one rule doesn't # have a single hit, it cannot overlap. # return (0,0); } # my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n"; # count bits in each, so we can show when one is fully subsumed by another # with perl's support for bitstring ops, we get C speed here, nice! my $a1 = unpack("%32b*", $hmap1); my $a2 = unpack("%32b*", $hmap2); my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2)); my $a1_in_a2 = int (($a1_and_a2 / ($a2 || 0.0001))*100); my $a2_in_a1 = int (($a1_and_a2 / ($a1 || 0.0001))*100); return ($a1_in_a2, $a2_in_a1); } sub readscores { my($cffile) = @_; my $tmpf = "tmp/rules$$.pl"; my $genscores = ""; if ($opt_g) { $genscores = "-d \"$FindBin::Bin/../rulesrc/scores\""; } if (system ( "$FindBin::Bin/../build/parse-rules-for-masses -d \"$cffile\" $genscores -s $opt_s -o $tmpf" )) { warn "parse-rules-for-masses failed!"; } eval { require "./$tmpf"; }; if ($@) { warn "$tmpf is unparseable: $@"; $rules_pl_unparseable = 1; # carry on anyway (for most uses); leave the tmp file behind for # possible debugging } else { $rules_pl_unparseable = 0; unlink $tmpf; } } sub soratio { my ($s, $n) = @_; $s ||= 0; $n ||= 0; if ($s + $n > 0) { return $s / ($s + $n); } else { return 0.5; # no results -> not effective } } sub read_timings { if (!open (IN, "; if ($ver !~ /^v1/) { warn "hit-frequencies: unknown version in 'timing.log', timings will be 0"; close IN; return; } while () { if (/^T\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) { my ($name, $duration, $max, $runs) = ($1,$2,$3,$4); $rule_times{$name} = ($duration / ($runs||0.00001)) * 1000; } } close IN; }