#!/usr/bin/perl -w # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # =head1 NAME fp-fn-statistics - Display statistics about the quality of scores =head1 SYNOPSIS fp-fn-statistics [options] Options: -c,--cffile=path Use path as the rules directory -s,--scoreset=n Use scoreset n -t,--threshold=n Use a spam/ham threshold of n (default: 5) --lambda=n Use a lambda value of n --spam=file Location of mass-check spam log (spam.log) --ham=file Location of mass-check ham log (ham.log) --fplog=file File to which false positive logs should be saved --fnlog=file File to which false negative logs should be saved =head1 DESCRIPTION B first calculates the score each message from a masses.log would have under a new set of scores. It then aggregates the number of messages correctly and incorrectly found as spam and ham, and their average scores. In addition, B determines the "Total Cost Ratio" as a result of the false positives and negatives mentioned above. This calculation takes into the value of lambda, which represents the cost of recovering a false positive, where 1 indicates a message is tagged only, 9 means the message is mailed back to sender asking for a token (TMDA style) and 999 means a message is delted. The default, 5, represents the message being moved to an infrequently read folder. B can also save false positive and false negatives logs to a file for future analysis. If this is all you're doing, it could be accomplished a lot quicker with B, but why not reinvent the wheel? =cut use Getopt::Long; use strict; use vars qw($opt_cffile $opt_lambda $opt_threshold $opt_scoreset $opt_spam $opt_ham $opt_fplog $opt_fnlog); $opt_cffile = "../rules"; $opt_threshold = 5; $opt_spam = 'spam.log'; $opt_ham = 'ham.log'; $opt_scoreset = 0; GetOptions("c|cffile=s" => \$opt_cffile, "lambda=f" => \$opt_lambda, "t|threshold=f" => \$opt_threshold, "spam=s" => \$opt_spam, "ham=s" => \$opt_ham, "s|scoreset=i" => \$opt_scoreset, "fplog=s" => \$opt_fplog, "fnlog=s" => \$opt_fnlog ); # If desired, report false positives and false negatives for analysis if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); } if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); } # lambda value for TCR equation, representing the cost of of an FP vs. the # cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back # to sender asking for token, 999 = blocking or deleting a message. # # We roughly aim for a value representing "moved to infrequently-read folder". my $lambda = 50; if ($opt_lambda) { $lambda = $opt_lambda; } use vars qw(%scores %allrules %rules); readscores(); die "wrong scoreset in tmp/rules.pl" unless $allrules{_scoreset} == $opt_scoreset; print "Reading per-message hit stat logs and scores...\n"; my ($num_spam, $num_ham); my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore); readlogs(); evaluate(); # show memory usage before we exit # print "Running \"ps aux\"...\n"; # open(PS, "ps aux|"); # while() { # print if $. == 1 || /\b$$\b/; # } # close(PS); exit 0; # arguments are $isspam, $count, \@tests, $msgline; sub log_line_count { my $score = 0; $score += $scores{$_} for @{$_[2]}; if ($_[0]) { $num_spam++; if ($score >= $opt_threshold) { $ga_yy++; $yyscore += $score; } else { $ga_yn++; $ynscore += $score; if (defined $opt_fnlog) { print FNLOG $_[3]; } } } else { $num_ham++; if ($score >= $opt_threshold) { #print STDERR "FP: $id\n"; $ga_ny++; $nyscore += $score; if (defined $opt_fplog) { print FPLOG $_[3]; } } else { $ga_nn++; $nnscore += $score; } } } sub readlogs { my $msgline; my $count = 0; $num_spam = $num_ham = 0; $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0; $yyscore = $ynscore = $nyscore = $nnscore = 0.0; foreach my $file ($opt_spam, $opt_ham) { open (IN, "<$file") || die "Could not open file '$file': $!"; my $isspam = ($file eq $opt_spam); my $caught; # 1st parameter of log line my $rules; # 4th parameter of log line while (defined($msgline = )) { ($caught, undef, undef, $rules) = split(' ', $msgline); # only take lines starting with Y or . next unless ($caught eq 'Y' || $caught eq '.') && $rules; # get tests, but ignore unknown tests and subrules my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} } split(/,/, $rules); # run handler log_line_count($isspam, $count, \@tests, $msgline); # increment line $count++; } close IN; } } sub readscores { print "Reading scores from \"$opt_cffile\"...\n"; my $cmd = "../build/parse-rules-for-masses -o tmp/rules_$$.pl -d \"$opt_cffile\" -s $opt_scoreset"; warn "[$cmd]\n"; system ($cmd) and die; require "tmp/rules_$$.pl"; unlink "tmp/rules_$$.pl"; %allrules = %rules; # ensure it stays global } sub evaluate { printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_threshold); printf "# Correctly non-spam: %6d %4.2f%%\n", $ga_nn, ($ga_nn / $num_ham) * 100.0; printf "# Correctly spam: %6d %4.2f%%\n", $ga_yy, ($ga_yy / $num_spam) * 100.0; printf "# False positives: %6d %4.2f%%\n", $ga_ny, ($ga_ny / $num_ham) * 100.0; printf "# False negatives: %6d %4.2f%%\n", $ga_yn, ($ga_yn / $num_spam) * 100.0; # convert to the TCR metrics used in the published lit my $nspamspam = $ga_yy; my $nspamlegit = $ga_yn; my $nlegitspam = $ga_ny; my $nlegitlegit = $ga_yn; my $nlegit = $num_ham; my $nspam = $num_spam; my $werr = ($lambda * $nlegitspam + $nspamlegit) / ($lambda * $nlegit + $nspam); my $werr_base = $nspam / ($lambda * $nlegit + $nspam); $werr ||= 0.000001; # avoid / by 0 my $tcr = $werr_base / $werr; my $sr = ($nspamspam / $nspam) * 100.0; my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n", $lambda, $tcr, $sr, $sp; }