#!/usr/bin/perl -w # # Given a spam.log and nonspam.log from a "mass-check --bayes" run, # work out efficiency. This variant uses static thresholds that # model more closely what SpamAssassin uses for scoring. # # usage: bayes-threshold spam.log nonspam.log my $spam = $ARGV[0] || "spam.log"; my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log"); my $hamcutoff = 0.30; my $spamcutoff = 0.70; my $nbuckets = 50; my $range_lo = 0.00; my $range_hi = 1.00; my $step = 0.02; #my $step = ($range_hi - $range_lo) / $nbuckets; # unusable - round errors! $hamcutoff += 0.0; $spamcutoff += 0.0; # shamelessly nicked from spambayes' testing infrastructure; a system to # compute the "cost" of a pair of thresholds and classifier. I'm supporting # it here so our stats are (at least a little) comparable against theirs. my $best_cutoff_fp_weight = 10.0; my $best_cutoff_fn_weight = 1.0; my $best_cutoff_unsure_weight = 0.1; %bux_sp = (); %bux_ns = (); my $i; for ($i = 0; $i <= ($range_hi - $range_lo) / $step; $i++) { my $lvl = $range_lo + ($i * $step); push (@buckets, $lvl); $bux_ns{$lvl} = $bux_sp{$lvl} = 0; } foreach my $file ($spam, $nonspam) { open (IN, "<$file") || die "Could not open file '$file': $!"; my $isspam = 0; ($file eq $spam) and $isspam = 1; while () { /^(\.|Y)\s.+bayes=(\S+)$/ or next; my $score = $2+0; if ($score == 1) { $score = 0.9999999999999; } my $bucket_id; foreach my $bucket (@buckets) { if ($score >= $bucket && $score < $bucket+$step) { $bucket_id = $bucket; last; } } if (!defined $bucket_id) { warn "no bucket for $score!"; } if (!defined $bux_sp{$bucket_id}) { warn "undef bucket at $bucket_id! (score=$bucket)"; } if (!defined $bux_ns{$bucket_id}) { warn "undef bucket at $bucket_id! (score=$bucket)"; } if ($isspam) { $bux_sp{$bucket_id}++; } else { $bux_ns{$bucket_id}++; } } } my $max_sp = 0; my $max_ns = 0; my $tot_sp = 0; my $tot_ns = 0; foreach my $bucket (@buckets) { $tot_sp += $bux_sp{$bucket}; if ($bux_sp{$bucket} > $max_sp) { $max_sp = $bux_sp{$bucket}; } $tot_ns += $bux_ns{$bucket}; if ($bux_ns{$bucket} > $max_ns) { $max_ns = $bux_ns{$bucket}; } } foreach my $cutoff (0.3, 0.2, 0.1, 0.04, 0.02) { $hamcutoff = $cutoff; $spamcutoff = 1.0 - $cutoff; my %results = results_for_cutoff ($hamcutoff, $spamcutoff); write_results (%results); } sub results_for_cutoff { my %results = (); my $fn = 0; my $fp = 0; my $unsure_sp = 0; my $unsure_ns = 0; for ($i = $range_lo; $i < $hamcutoff; $i += $step) { foreach my $bucket (@buckets) { if ($i >= $bucket && $i < $bucket+$step) { $fn += $bux_sp{$bucket}; } } } # total up the unsures (between hamcutoff and spamcutoff) for ($i = $hamcutoff; $i <= $spamcutoff; $i += $step) { foreach my $bucket (@buckets) { if ($i >= $bucket && $i < $bucket+$step) { $unsure_ns += $bux_ns{$bucket}; $unsure_sp += $bux_sp{$bucket}; } } } for ($i = $spamcutoff; $i <= $range_hi; $i += $step) { foreach my $bucket (@buckets) { if ($i >= $bucket && $i < $bucket+$step) { $fp += $bux_ns{$bucket}; } } } my $cost = ($fp * $best_cutoff_fp_weight) + ($fn * $best_cutoff_fn_weight) + ($unsure_ns * $best_cutoff_unsure_weight) + ($unsure_sp * $best_cutoff_unsure_weight); $results{"$hamcutoff $spamcutoff"} = { 'hamcutoff' => $hamcutoff, 'spamcutoff'=> $spamcutoff, 'cost' => $cost, 'unsure_ns' => $unsure_ns, 'unsure_sp' => $unsure_sp, 'fp' => $fp, 'fn' => $fn }; return %results; } sub write_results { my (%results) = @_; foreach my $r (values %results) { printf "Threshold optimization for hamcutoff=%3.2f, spamcutoff=%3.2f: cost=\$%5.2f\n", $r->{hamcutoff}, $r->{spamcutoff}, $r->{cost}; printf "Total ham:spam: %d:%d\n", $tot_ns, $tot_sp; printf "FP: %5d %5.3f%% ", $r->{fp}, ($r->{fp}*100) / $tot_ns; printf "FN: %5d %5.3f%%\n", $r->{fn}, ($r->{fn}*100) / $tot_sp; my $unsure = $r->{unsure_ns} + $r->{unsure_sp}; printf "Unsure: %5d %5.3f%% ", $unsure, ($unsure*100) / ($tot_sp+$tot_ns); printf "(ham: %5d %5.3f%% ", $r->{unsure_ns}, ($r->{unsure_ns}*100) / ($tot_ns); printf "spam: %5d %5.3f%%)\n", $r->{unsure_sp}, ($r->{unsure_sp}*100) / ($tot_sp); # for TCR calc, treat "unsures" as ham # TODO: unsure_sp should probably be treated as spam, assuming # it'll fall in the 5.0-6.0 score range my $fn = $r->{unsure_sp} + $r->{fn}; my $fp = $r->{fp}; printf "TCRs: l=1 %5.3f l=5 %5.3f l=9 %5.3f\n", tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 1), tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 5), tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 9); printf "SUMMARY: %3.2f/%3.2f fp %5d fn %5d uh %5d us %5d c %5.2f\n", $r->{hamcutoff}, $r->{spamcutoff}, $r->{fp}, $r->{fn}, $r->{unsure_ns}, $r->{unsure_sp}, $r->{cost}; print "\n"; } } sub tcr { my ($nspamspam, $nspamlegit, $nlegitspam, $nlegitlegit, $lambda) = @_; my $nlegit = $nlegitspam+$nlegitlegit; my $nspam = $nspamspam+$nspamlegit; my $werr = ($lambda * $nlegitspam + $nspamlegit) / ($lambda * $nlegit + $nspam); my $werr_base = $nspam / ($lambda * $nlegit + $nspam); $werr ||= 0.000001; # avoid / by 0 my $tcr = $werr_base / $werr; #my $sr = ($nspamspam / $nspam) * 100.0; #my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; $tcr; }