#!/usr/bin/perl -w
#
# Given a spam.log and nonspam.log from a "mass-check --bayes" run,
# draw a histogram of the score ranges.
#
# This now draws a detailed "zoom" view as well as the overall histogram,
# so the low-frequency FPs and FNs around the middle ground can be viewed.
# In addition, it does not show ham lines or spam lines, if those buckets
# got no hits.
#
# usage: draw-bayes-histogram [--spam=spam.log] [--nonspam=nonspam.log]
#               [--nocollapse] [--nozoom] [--buckets=20]
#
# or: draw-bayes-histogram spam.log nonspam.log     (backwards compatible)
#
# options:
#   --spam=FILE      log of spam messages to read (default "spam.log")
#   --nonspam=FILE   log of ham messages to read (default "nonspam.log")
#   --nocollapse     also print the rows of buckets that got no hits
#   --nozoom         omit the 10-character "zoom" column; the overall
#                    histogram gets 10 more characters of width instead
#   --buckets=N      number of score ranges to divide 0.0 .. 1.0 into
#                    (default 25)

use strict;
use Getopt::Long;
use vars qw($opt_spam $opt_nonspam $opt_nocollapse $opt_nozoom $opt_buckets);

GetOptions("spam=s", "nonspam=s", "nocollapse", "nozoom", "buckets=i");

# File names come from the options first, then from the positional
# arguments (backwards compatible form), then from the defaults.
my $spam = $opt_spam;
if (!$spam && defined $ARGV[0] && $ARGV[0] !~ /^\-/) { $spam = $ARGV[0]; }
if (!$spam) { $spam = "spam.log"; }

my $nonspam = $opt_nonspam;
if (!$nonspam && defined $ARGV[1] && $ARGV[1] !~ /^\-/) { $nonspam = $ARGV[1]; }
if (!$nonspam) { $nonspam = "nonspam.log"; }

my $buckets = $opt_buckets || 25;
if ($buckets < 1) {
  # a negative count would yield a negative step and no usable buckets
  die "--buckets must be a positive integer\n";
}

my $zoomfactor = 20;
my $range_lo = 0.0;
my $range_hi = 1.0;

# Lower bounds of the buckets.  They are computed from an integer index
# rather than by repeatedly adding $step, so that rounding errors cannot
# accumulate; the final bucket is pinned to exactly $range_hi so that a
# score of $range_hi always has a bucket of its own.
my $step = ($range_hi - $range_lo) / $buckets;
my @buckets = map { $range_lo + $_ * $step } 0 .. $buckets - 1;
push (@buckets, $range_hi);

# Hit counters, keyed by bucket lower bound.
my %bux_sp = map { $_ => 0 } @buckets;
my %bux_ns = map { $_ => 0 } @buckets;

# Each log is paired with the counter hash it feeds, so the two files are
# told apart by position and not by comparing their names.
foreach my $input ([$spam, \%bux_sp], [$nonspam, \%bux_ns]) {
  my ($file, $counts) = @$input;

  open (my $in, '<', $file) or die "Could not open file '$file': $!";
  while (my $line = <$in>) {
    # only result lines starting with "." or "Y" that carry a bayes= value
    $line =~ /^[.Y]\s.+bayes=([^\s,]+)/ or next;
    my $score = $1 + 0;

    # Pick the bucket with the largest lower bound that is <= $score.
    # Scores outside the range are clamped to the first or last bucket.
    my $bucket_id = $buckets[0];
    foreach my $bucket (@buckets) {
      last if ($bucket > $score);
      $bucket_id = $bucket;
    }
    $counts->{$bucket_id}++;
  }
  close ($in);
}

# Totals are used for the percentages, maxima for scaling the bars.
my $max_sp = 0;
my $max_ns = 0;
my $tot_sp = 0;
my $tot_ns = 0;
foreach my $bucket (@buckets) {
  $tot_sp += $bux_sp{$bucket};
  if ($bux_sp{$bucket} > $max_sp) { $max_sp = $bux_sp{$bucket}; }
  $tot_ns += $bux_ns{$bucket};
  if ($bux_ns{$bucket} > $max_ns) { $max_ns = $bux_ns{$bucket}; }
}

my $chars_in_line = 55;
if ($opt_nozoom) { $chars_in_line += 10; }

# Hits per output character.  The tiny fallback values avoid a division
# by zero when a log contained no usable lines.
my $scale_sp = ($max_sp / $chars_in_line) || 0.000001;
my $scale_ns = ($max_ns / $chars_in_line) || 0.000001;
$tot_sp ||= 0.000001;
$tot_ns ||= 0.000001;

print STDOUT "SCORE NUMHIT DETAIL OVERALL HISTOGRAM (. = ham, # = spam)\n";
# 0.000 (19.217%) ..........|....................

foreach my $bucket (@buckets) {
  # the ham row is printed first, then the spam row
  foreach my $row ([$bux_ns{$bucket}, $scale_ns, $tot_ns, '.'],
                   [$bux_sp{$bucket}, $scale_sp, $tot_sp, '#'])
  {
    my ($count, $scale, $total, $char) = @$row;

    # rows of empty buckets are hidden unless --nocollapse was given
    next if ($count == 0 && !$opt_nocollapse);

    my $line = $char x int (($count / $scale) + .5);

    if (!$opt_nozoom) {
      # the zoom view magnifies the bar by $zoomfactor and shows its
      # first 10 characters, padded to a fixed width
      my $zoomline = $char x int ((($count * $zoomfactor) / $scale) + .5);
      $zoomline = sprintf ("%-10s", substr ($zoomline, 0, 10));
      $line = $zoomline.'|'.$line;
    }

    printf STDOUT "%3.3f (%6.3f%%) %s\n",
        $bucket, (($count / $total) * 100.0), $line;
  }
}