#!/usr/bin/perl -w
#
# Reads a per-test frequency table from the files named on the command
# line (or STDIN) and writes a score range for every test to
# tmp/ranges.data.
#
# Input lines that are used look like:
#
#     <overall> <spam> <nonspam> <test name>
#
# i.e. leading whitespace, three unsigned integers and then the test name.
# Lines that do not match this shape are ignored.  The row whose name is
# '(all messages)' carries the totals and is not treated as a test.
#
# Output lines look like:
#
#     <lo> <hi> <mutatable flag> <test name>
#
# ordered by descending spam/nonspam ratio (ties broken by test name so the
# output is reproducible from run to run).

use strict;
use warnings;

my $OUTPUT_DIR  = 'tmp';
my $OUTPUT_FILE = "$OUTPUT_DIR/ranges.data";

my %freq         = ();    # test name => "overall" column
my %freq_spam    = ();    # test name => "spam" column
my %freq_nonspam = ();    # test name => "nonspam" column
my %ratio        = ();    # test name => spam/nonspam ratio (0 if not rated)
my %mutatable    = ();    # test name => 1 if a range is computed, else 0

# Totals taken from the '(all messages)' row.  They are recorded but not
# used anywhere else in this script.
my $num_spam;
my $num_nonspam;

while (<>) {
    # The name capture is non-greedy so that trailing blanks or a stray
    # carriage return do not become part of the test name.
    /^\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+?)\s*$/ or next;

    my $overall = $1 + 0;
    my $spam    = $2 + 0;
    my $nonspam = $3 + 0;
    my $test    = $4;

    if ($test eq '(all messages)') {
        $num_spam    = $spam;
        $num_nonspam = $nonspam;
        next;
    }

    $freq{$test}         = $overall;
    $freq_spam{$test}    = $spam;
    $freq_nonspam{$test} = $nonspam;
    $mutatable{$test}    = 1;

    if ($spam + $nonspam < 5) {
        # Fewer than 5 combined in the spam and nonspam columns: emit a
        # fixed 0..0 range and flag the test as not mutatable.
        $mutatable{$test} = 0;
        $ratio{$test}     = 0;
    }
    else {
        # Only the local copies are adjusted here; the hashes above keep
        # the numbers exactly as they were read.
        if ($nonspam == 0) {
            $nonspam = 1;     # avoid / by 0
            $spam *= 20;      # give the spam score a bonus to make up
        }
        $ratio{$test} = $spam / $nonspam;
    }
}

# Create the output directory if needed.  A failing mkdir is only fatal
# when the directory really is missing afterwards.
mkdir $OUTPUT_DIR
    or -d $OUTPUT_DIR
    or die "cannot create directory $OUTPUT_DIR: $!\n";

open(my $out, '>', $OUTPUT_FILE)
    or die "cannot open $OUTPUT_FILE for writing: $!\n";

foreach my $test (sort { $ratio{$b} <=> $ratio{$a} || $a cmp $b } keys %freq) {
    my $ratio = $ratio{$test};

    # The test name is passed as a %s argument rather than interpolated
    # into the format string, so a '%' in a name cannot be mistaken for a
    # conversion specifier.
    if (!$mutatable{$test}) {
        printf {$out} "%3.1f %3.1f 0 %s\n", 0, 0, $test;
        next;
    }

    if ($ratio > 200.0) { $ratio = 200.0; }    # set a ceiling
    if ($ratio < 1.0)   { $ratio = 1.0; }      # and floor

    # now we have a number between 1.0 and 200.0 indicating how
    # effective the test is.  Come up with a reasonable range
    # for scores based on this.
    my ($lo, $hi) = ratio_in_200_to_range($ratio);
    printf {$out} "%3.1f %3.1f 1 %s\n", $lo, $hi, $test;
}

close($out)
    or die "error while writing $OUTPUT_FILE: $!\n";
exit;

# ratio_in_200_to_range($ratio)
#
# Maps an effectiveness ratio, which the caller clamps to 1.0 .. 200.0,
# onto a (lo, hi) score range.
#
# A ratio of 200.0 yields exactly (1.0, 5.0).  A ratio of 1.0 yields roughly
# (0.09, 0.84), i.e. close to the nominal 0.1 .. 1.0 target.  In between,
# each bound is $ratio divided by a divisor that grows linearly with $ratio.
#
# Returns the two-element list ($lo, $hi).
sub ratio_in_200_to_range {
    my $ratio = shift;

    # Divisors at the two ends of the scale, written as ratio / target score.
    my $hi_lo = 200 / 1.0;    # ratio 200 -> lo bound 1.0
    my $hi_hi = 200 / 5.0;    # ratio 200 -> hi bound 5.0
    my $lo_lo = 1 / 0.1;      # ratio 1   -> lo bound 0.1 (nominal)
    my $lo_hi = 1 / 1.0;      # ratio 1   -> hi bound 1.0 (nominal)

    my $lo_diff = abs($lo_lo - $hi_lo);
    my $hi_diff = abs($lo_hi - $hi_hi);

    my $lo = ($ratio / (($ratio / 200) * $lo_diff + $lo_lo));
    my $hi = ($ratio / (($ratio / 200) * $hi_diff + $lo_hi));

    return ($lo, $hi);
}