#!/usr/bin/perl -w # # Given a 'results' dir from a bayes-10pcv-driver run, # graph a histogram of the score ranges using GNUPlot. # # usage: graph-bayes-histogram [--buckets=100] ...dir/results .../dir2/results ... # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
#

use strict;
use warnings;
use Getopt::Long;

# ---------------------------------------------------------------------------
# Overview
#
# For every results directory named on the command line, read
# "$dir/spam_all.log" and "$dir/nonspam_all.log", pull the "bayes=<score>"
# value out of each matching log line, and count the scores into equal-width
# buckets spanning [$range_lo, $range_hi].  One data set per directory is
# written to 'plot.data' (data sets are separated by two blank lines so that
# gnuplot's "index N" can select them), then gnuplot is driven over a pipe
# to render 'graph.png'.
# ---------------------------------------------------------------------------

my $usage = "usage: graph-bayes-histogram [--buckets=100] "
          . "...dir/results .../dir2/results ...\n";

my $opt_buckets;
GetOptions("buckets=i" => \$opt_buckets) or die $usage;

my $buckets = $opt_buckets || 100;
# A negative bucket count would yield a negative step and meaningless bounds.
die "--buckets must be a positive integer\n" if $buckets < 1;
@ARGV or die $usage;

my $range_lo = 0.0;
my $range_hi = 1.0;
my $step = ($range_hi - $range_lo) / $buckets;

# Lower bound of each bucket.  The bounds are computed by multiplication
# rather than by repeatedly adding $step, so floating-point error does not
# accumulate and the final bucket (starting at $range_hi) always exists.
my @buckets = map { $range_lo + $_ * $step } (0 .. $buckets);

# Per-bucket counters for the directory currently being processed, indexed
# by position in @buckets.  They are reset before each call to dofile().
my @bux_sp = ();
my @bux_ns = ();

open(my $data_fh, '>', 'plot.data')
  or die "cannot write 'plot.data': $!";

my @dirs = ();
foreach my $dir (@ARGV) {
  @bux_ns = (0) x scalar(@buckets);
  @bux_sp = (0) x scalar(@buckets);

  # The data-set number is the directory's position on the command line.
  dofile(scalar(@dirs), "$dir/spam_all.log", "$dir/nonspam_all.log");
  push (@dirs, $dir);
}

# Buffered write errors only surface at close time, so check it.
close($data_fh) or die "error writing 'plot.data': $!";

open(my $gnuplot, '|-', 'gnuplot', '-') or die "cannot run gnuplot: $!";

print $gnuplot "
set xlabel 'P(spam)'
set ylabel 'Frequency'
set logscale y 2
set xrange [0.0:1.01]
set yrange []
set xtics 0,0.1,0.99
set terminal png crop
set out 'graph.png'
plot ";

# Two curves per directory (ham, then spam), each with its own line/point
# type number so they can be told apart in the rendered graph.
my @text = ();
my $t = 0;
foreach my $s (0 .. $#dirs) {
  my $dir = $dirs[$s];
  $t++;
  push (@text, " 'plot.data' using 1:2 index $s with linesp lt $t pt $t t 'ham, $dir'");
  $t++;
  push (@text, " 'plot.data' using 1:3 index $s with linesp lt $t pt $t t 'spam, $dir'");
}
print $gnuplot join(", \\\n", @text);
print $gnuplot "\n";

# close() on a pipe waits for the child; a false return with $! == 0 means
# gnuplot itself exited with a non-zero status.
close($gnuplot)
  or die($! ? "error closing pipe to gnuplot: $!\n"
            : "gnuplot exited with status " . ($? >> 8) . "\n");
exit;

# dofile($setcount, $spam, $nonspam)
#
# Tally the bayes scores found in the two log files and append one data set
# to 'plot.data'.
#
#   $setcount  zero-based number of this data set; also used to shift the
#              x positions by 0.001 per set so that curves from different
#              directories do not sit exactly on top of each other
#   $spam      path of the spam log; its scores are counted in @bux_sp
#   $nonspam   path of the nonspam log; its scores are counted in @bux_ns
#
# Only lines starting with "." or "Y" followed by whitespace and containing
# "bayes=<value>" are considered.  Scores outside [$range_lo, $range_hi]
# cannot be assigned to any bucket and are skipped.
#
# Dies if either log file cannot be opened.  Relies on the file-scoped
# @buckets, @bux_sp, @bux_ns, $step, $range_lo, $range_hi and $data_fh.
sub dofile {
  my ($setcount, $spam, $nonspam) = @_;

  foreach my $file ($spam, $nonspam) {
    my $isspam = ($file eq $spam) ? 1 : 0;

    open(my $in, '<', $file) or die "Could not open file '$file': $!";

    while (my $line = <$in>) {
      $line =~ /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
      my $score = $2+0;

      # Written as a positive test so that NaN is rejected as well.
      next unless ($score >= $range_lo && $score <= $range_hi);

      # Direct index computation instead of scanning every bucket.  The tiny
      # epsilon keeps a score that lies exactly on a bucket boundary (e.g.
      # 0.29 with a step of 0.01) from being truncated into the bucket below
      # by floating-point division error.
      my $bucket_idx = int(($score - $range_lo) / $step + 1e-9);
      $bucket_idx = $#buckets if $bucket_idx > $#buckets;

      if ($isspam) {
        $bux_sp[$bucket_idx]++;
      } else {
        $bux_ns[$bucket_idx]++;
      }
    }

    close($in);
  }

  my $sideoffset = 0.001*$setcount;
  foreach my $bucket_idx (0 .. $#buckets) {
    my $ns = $bux_ns[$bucket_idx];
    my $sp = $bux_sp[$bucket_idx];
    my $xpos = $buckets[$bucket_idx] + $sideoffset;
    print $data_fh "$xpos $ns $sp\n";
  }
  # Two blank lines end the data set for gnuplot's "index" selector.
  print $data_fh "\n\n";
}