#!/usr/bin/perl -w
#
# Given a 'results' dir from a bayes-10pcv-driver run,
# graph a ROC curve of accuracy.
#
# usage: graph-accuracy-curve [--buckets=100] ...dir/results .../dir2/results ...
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

use strict;
use warnings;

use Getopt::Long;

my $usage =
  "usage: graph-accuracy-curve [--buckets=100] dir/results [dir2/results ...]\n";

my $opt_buckets;
GetOptions("buckets=i" => \$opt_buckets) or die $usage;

my $buckets = $opt_buckets || 100;
die "--buckets must be a positive integer\n" if $buckets < 1;
@ARGV or die $usage;

my $range_lo = 0.0;
my $range_hi = 1.0;
my $step     = ($range_hi - $range_lo) / $buckets;

# Lower bound of every bucket; these values double as the cutoffs that get
# plotted.  Each one is computed directly from its index rather than by
# repeatedly adding $step, so rounding error cannot accumulate and the last
# entry is exactly $range_hi.  There are $buckets + 1 entries: the final
# bucket only ever holds scores equal to $range_hi.
my @bucket_floors =
  map { $range_lo + ($range_hi - $range_lo) * $_ / $buckets } 0 .. $buckets;

# Per-bucket message counts for the data set currently being processed,
# indexed like @bucket_floors.  dofile() resets them for every data set.
my @bux_sp = ();    # spam
my @bux_ns = ();    # nonspam

my $data_file = "plot.data";
open(my $data_fh, '>', $data_file)
  or die "Could not open file '$data_file' for writing: $!";

# One gnuplot data set ("index") is written per directory, in argument order.
my @dirs     = ();
my $setcount = 0;
foreach my $dir (@ARGV) {
  dofile($setcount, "$dir/spam_all.log", "$dir/nonspam_all.log");
  push(@dirs, $dir);
  $setcount++;
}

close($data_fh) or die "Could not write file '$data_file': $!";

open(my $gnuplot, '|-', "gnuplot -") or die "cannot run gnuplot: $!";

# set xtics 0,0.1,0.99
# gnuplot expects one command per line, so every "set" goes on its own line.
print {$gnuplot} join("\n",
  "",
  "set xlabel 'FPs'",
  "set ylabel 'FNs'",
  "set logscale xy 2",
  "set xrange []",
  "set yrange []",
  "set terminal png size 1024,768 crop",
  "set out 'graph.png'",
  "plot ");

my @text = ();
foreach my $s (0 .. $#dirs) {
  my $t = $s + 1;    # line type / point type, numbered from 1

  # Inside a single-quoted gnuplot string a quote is written as two quotes.
  (my $title = "ham, $dirs[$s]") =~ s/'/''/g;

  # Using the position as the data-set index (rather than a per-directory
  # lookup) keeps the curves distinct even if a directory is listed twice.
  push(@text,
    " 'plot.data' using 1:2 index $s with linesp lt $t pt $t t '$title'");
}

# A trailing backslash continues the plot command onto the next line.
print {$gnuplot} join(", \\\n", @text);
print {$gnuplot} "\n";

close($gnuplot)
  or die($! ? "error closing pipe to gnuplot: $!\n"
            : "gnuplot exited with status " . ($? >> 8) . "\n");
exit;

# dofile($setcount, $spam, $nonspam)
#
# Reads the spam log and the nonspam log of one results directory, counts the
# "bayes=" score of every matching line into @bux_sp / @bux_ns, and appends
# one data set to plot.data: a "FP FN" line for each cutoff in
# @bucket_floors, followed by two blank lines (gnuplot's data-set separator).
#
# $setcount is accepted for compatibility with existing callers; it is not
# used here.  Dies if either log file cannot be opened.
sub dofile {
  my ($setcount, $spam, $nonspam) = @_;

  @bux_sp = (0) x scalar(@bucket_floors);
  @bux_ns = (0) x scalar(@bucket_floors);

  foreach my $file ($spam, $nonspam) {
    my $isspam = ($file eq $spam) ? 1 : 0;

    open(my $in, '<', $file) or die "Could not open file '$file': $!";
    while (my $line = <$in>) {
      $line =~ /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
      my $score = $2 + 0;

      # Scores outside [$range_lo, $range_hi] have no bucket; skip them.
      my $bucket_id = bucket_index($score);
      next unless defined $bucket_id;

      if ($isspam) {
        $bux_sp[$bucket_id]++;
      } else {
        $bux_ns[$bucket_id]++;
      }
    }
    close($in);
  }

  foreach my $cutoff (@bucket_floors) {
    my ($fp, $fn) = results_for_cutoff($cutoff);
    print {$data_fh} "$fp $fn\n";
  }
  print {$data_fh} "\n\n";
}

# bucket_index($score)
#
# Returns the index into @bucket_floors of the bucket holding $score, i.e.
# the last bucket whose lower bound is <= $score.  Returns nothing when the
# score lies outside [$range_lo, $range_hi].
sub bucket_index {
  my ($score) = @_;

  return if $score < $range_lo || $score > $range_hi;

  my $idx = int(($score - $range_lo) / $step);
  $idx = $buckets if $idx > $buckets;

  # The division can land one bucket off at a boundary; nudge the index so
  # the result always agrees with the comparisons in results_for_cutoff().
  $idx-- while $idx > 0        && $score <  $bucket_floors[$idx];
  $idx++ while $idx < $buckets && $score >= $bucket_floors[$idx + 1];

  return $idx;
}

# results_for_cutoff($cutoff)
#
# Returns ($fp, $fn) for the counts currently held in @bux_ns / @bux_sp:
#   $fn - spam messages in buckets whose lower bound is below $cutoff
#   $fp - nonspam messages in buckets whose lower bound is at or above $cutoff
sub results_for_cutoff {
  my $cutoff = shift;

  my $fn = 0;
  my $fp = 0;
  foreach my $idx (0 .. $#bucket_floors) {
    if ($bucket_floors[$idx] < $cutoff) {
      $fn += $bux_sp[$idx];
    } else {
      $fp += $bux_ns[$idx];
    }
  }

  return ($fp, $fn);
}