#!/usr/bin/perl -w # # mk-roc-graphs -- create Receiver Operating Characteristic graphs # # Creates ROC curve graphs from a pair of SpamAssassin logs and the current # score-set. See # for details on ROC curves. # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # use Getopt::Long; use vars qw($opt_cffile $opt_spam $opt_ham $opt_count $opt_lambda); GetOptions("cffile=s", "count", "threshold=f", "spam=s", "ham=s", "scoreset=i", "lambda=f"); $opt_cffile ||= "../rules"; $opt_spam ||= 'spam.log'; $opt_ham ||= 'ham.log'; $opt_count ||= 0; $opt_scoreset = 0 if (!defined $opt_scoreset); my $msgline; use vars qw(%rules %allrules); my %spam_points = (); my %ham_points = (); my ($num_tests, $num_spam, $num_ham); my $nybias = 10; # lambda value for TCR equation, representing the cost of of an FP vs. the # cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back # to sender asking for token, 999 = blocking or deleting a message. # # We roughly aim for a value representing "moved to infrequently-read folder". my $lambda = 50; if ($opt_lambda) { $lambda = $opt_lambda; } readscores(); readlogs(); if ($opt_count) { $nybias = $nybias*($num_spam / $num_ham); evaluate_reports(); } else { evaluate_scores(); } exit 0; # arguments are $isspam, $count, \@tests sub log_line_count { my $score = 0; $score += $scores{$_} for @{$_[2]}; $score = (int($score * 10) / 10); if ($_[0]) { $num_spam++; $spam_points{$score}++; } else { $num_ham++; $ham_points{$score}++; } } sub readlogs { my $count = 0; $num_spam = $num_ham = 0; # set handler for log lines my $log_line = \&log_line_count; foreach my $file ($opt_spam, $opt_ham) { open (IN, "<$file") || die "Could not open file '$file': $!"; my $isspam = ($file eq $opt_spam); my $caught; # 1st parameter of log line my $rules; # 4th parameter of log line while (defined($msgline = )) { ($caught, undef, undef, $rules) = split(' ', $msgline); # only take lines starting with Y or . next unless ($caught eq 'Y' || $caught eq '.') && $rules; # get tests, but ignore unknown tests and subrules my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} } split(/,/, $rules); # run handler $log_line->($isspam, $count, \@tests); # increment line $count++; } close IN; } $num_tests = $count; } sub readscores { warn "Reading scores from \"$opt_cffile\"...\n"; system ("../build/parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die; require "./tmp/rules.pl"; %allrules = %rules; # ensure it stays global } sub _evaluate_at_all_thresholds { my $callback = shift; my %u=(); my @scores = grep {defined} map { if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; } } sort { $a <=> $b } (keys %ham_points, keys %spam_points); my $tot_h = $num_ham || 0.00001; my $tot_s = $num_spam || 0.00001; foreach my $threshold (@scores) { my $fp = 0; # false positives; ham marked as spam my $fn = 0; # false negatives; spam marked as ham my $tp = 0; # true positives; spam marked as spam my $tn = 0; # true negatives; ham marked as ham foreach my $scr (@scores) { my $nh = $ham_points{$scr} || 0; my $ns = $spam_points{$scr} || 0; if ($threshold > $scr) { $tn += $nh; $fn += $ns; } else { $fp += $nh; $tp += $ns; } } $callback->($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp); } } sub evaluate_reports { _evaluate_at_all_thresholds (sub { my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_; my $nh = $ham_points{$threshold} || 0; my $ns = $spam_points{$threshold} || 0; # $nh /= $tot_h; # $ns /= $tot_s; # $tn /= $tot_h; # $tp /= $tot_s; # $fp /= $tot_h; # $fn /= $tot_s; printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold); printf "# Correctly non-spam: %6d %4.2f%%\n", $tn, ($tn / $tot_h) * 100.0; printf "# Correctly spam: %6d %4.2f%%\n", $tp, ($tp / $tot_s) * 100.0; printf "# False positives: %6d %4.2f%%\n", $fp, ($fp / $tot_h) * 100.0; printf "# False negatives: %6d %4.2f%%\n", $fn, ($fn / $tot_s) * 100.0; # convert to the TCR metrics used in the published lit my $nspamspam = $tp; my $nspamlegit = $fn; my $nlegitspam = $fp; my $nlegitlegit = $tn; my $nlegit = $tot_h; my $nspam = $tot_s; my $werr = ($lambda * $nlegitspam + $nspamlegit) / ($lambda * $nlegit + $nspam); my $werr_base = $nspam / ($lambda * $nlegit + $nspam); $werr ||= 0.000001; # avoid / by 0 my $tcr = $werr_base / $werr; my $sr = ($nspamspam / $nspam) * 100.0; my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n", $lambda, $tcr, $sr, $sp; }); } sub evaluate_scores { _evaluate_at_all_thresholds (sub { my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_; my $nh = $ham_points{$threshold} || 0; my $ns = $spam_points{$threshold} || 0; $nh /= $tot_h; $ns /= $tot_s; $tn /= $tot_h; $tp /= $tot_s; $fp /= $tot_h; $fn /= $tot_s; printf "%f %.9f %.9f %.9f %.9f %.9f %.9f\n", $threshold, $nh, $ns, $tn, $tp, $fp, $fn; }); } __DATA__ # Suggested commands to plot data from this script: # ham score distribution: ./mk-roc-graphs --scoreset=3 > dat gnuplot set grid plot [-5:40] [] "dat" using 1:2 with boxes # spam score distribution: ./mk-roc-graphs --scoreset=3 > dat gnuplot set grid plot [-5:40] [] "dat" using 1:3 with boxes # plot a ROC curve: ./mk-roc-graphs --scoreset=3 > dat gnuplot set grid plot [0:0.1] [0:0.1] "dat" using 6:7 with linesp # For a ROC curve, as shown in the literature ./mk-roc-graphs --scoreset=3 > dat set grid set xlabel "False Positives (1.0 = 100%)" set ylabel "True Positives (1.0 = 100%)" plot "dat" using 6:5 with linesp # same, but tighter: ./mk-roc-graphs --scoreset=3 > dat gnuplot set grid plot [0:0.02] [0:0.02] "dat" using 6:7 with linesp # and the piece de resistance: ./mk-roc-graphs --scoreset=0 > set0 ./mk-roc-graphs --scoreset=1 > set1 ./mk-roc-graphs --scoreset=2 > set2 ./mk-roc-graphs --scoreset=3 > set3 gnuplot set terminal png set output "out.png" set grid set xlabel "False Positives (1.0 = 100% of ham marked as spam)" set ylabel "False Negatives (1.0 = 100% of spam marked as ham)" plot [0:0.1] [0:0.1] "set0" using 6:7 with linesp, \ "set1" using 6:7 with linesp, \ "set2" using 6:7 with linesp, \ "set3" using 6:7 with linesp