#!/usr/bin/perl -w # hacked version of hit-frequencies - Allen # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # use Getopt::Std; getopts("l:L:h"); our ($opt_h, $opt_l, $opt_L); sub usage { die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log] -l LC also print language specific rules for lang code LC (or 'all') -L LC only print language specific rules for lang code LC (or 'all') options -l and -L are mutually exclusive. if either the spam or and nonspam logs are unspecified, the defaults are \"spam.log\" and \"nonspam.log\" in the cwd. "; } usage() if($opt_h || ($opt_l && $opt_L)); $lower = 1; #$threshold = 5; $higher = 9; $min_expected = 2; # Should not be set to more than 5 or less than 2 my %freq_spam = (); # how often non-nice found in spam my %freq_over_higher_falsepos = (); # how often non-nice found in ones over # higher threshold that are false positives my %freq_nonspam = (); # how often nice found in nonspam my %freq_under_lower_falseneg = (); # how often nice found in ones under # lower threshold that are false negatives my %over_expected_falsepos = (); # how much over/under non-nice are than # their expected freq_over_higher_falsepos my %over_expected_falseneg = (); # how much over/under nice are than # their expected freq_under_lower_falseneg my %ratio_expected_falsepos = (); # ratio version of above my %ratio_expected_falseneg = (); # ditto my $num_spam = 0; my $num_nonspam = 0; my $num_over_higher_falsepos = 0; my $num_under_lower_falseneg = 0; my $ok_lang = ''; readscores(); $ok_lang = lc ($opt_l || $opt_L || ''); if ($ok_lang eq 'all') { $ok_lang = '.'; } foreach my $key (keys %rules) { next if ($key eq '_scoreset'); if ( ($opt_L && !$rules{$key}->{lang}) || ($rules{$key}->{lang} && (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i) ) ) { delete $rules{$key} ; next; } if ($rules{$key}->{tflags} =~ m/net/) { delete $rules{$key}; next; } if ($rules{$key}->{tflags} !~ m/userconf/) { if ($rules{$key}->{tflags} =~ m/nice/) { $freq_nonspam{$key} = 0; $freq_under_lower_falseneg{$key} = 0; } else { $freq_spam{$key} = 0; $freq_over_higher_falsepos{$key} = 0; } } } readlogs(); unless (($num_over_higher_falsepos >= $min_expected) && ($num_under_lower_falseneg >= $min_expected)) { die "Insufficient extremes in dataset (" . $num_over_higher_falsepos . " " . $num_under_lower_falseneg . "); stopped"; } # http://bmj.com/collections/statsbk/8.shtml sub chisquare { my($a,$b,$c,$d) = @_; my $chisquare = ((($a*$d - $b*$c)**2)*($a + $b + $c + $d))/ (($a + $d)*($c + $d)*($b + $d)*($a + $c)); if ($chisquare >= 6.64) { return $chisquare,0.01; } elsif ($chisquare >= 3.841) { return $chisquare,0.05; } else { return $chisquare,.5; } } my $ratio_falsepos = $num_over_higher_falsepos/$num_spam; my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam; my $skipped_non_nice = 0; foreach $rule (keys %freq_spam) { my $expected = $freq_spam{$rule}*$ratio_falsepos; if ($expected <= $min_expected) { $skipped_non_nice++; next; } $over_expected_falsepos{$rule} = $freq_over_higher_falsepos{$rule} - $expected; $ratio_expected_falsepos{$rule} = $freq_over_higher_falsepos{$rule}/$expected; ($chisquare{$rule},$prob{$rule}) = chisquare($num_spam,$num_over_higher_falsepos, $freq_spam{$rule},$freq_over_higher_falsepos{$rule}); if ($freq_over_higher_falsepos{$rule} < $expected) { $chisquare{$rule} *= -1; } } warn "Skipped non-nice: $skipped_non_nice\n"; my $skipped_nice = 0; foreach $rule (keys %freq_nonspam) { my $expected = $freq_nonspam{$rule}*$ratio_falseneg; if ($expected <= $min_expected) { $skipped_nice++; next; } $over_expected_falseneg{$rule} = $freq_under_lower_falseneg{$rule} - $expected; $ratio_expected_falseneg{$rule} = $freq_under_lower_falseneg{$rule}/$expected; ($chisquare{$rule},$prob{$rule}) = chisquare($num_nonspam,$num_under_lower_falseneg, $freq_nonspam{$rule},$freq_under_lower_falseneg{$rule}); if ($freq_under_lower_falseneg{$rule} < $expected) { $chisquare{$rule} *= -1; } } warn "Skipped nice: $skipped_nice\n"; @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos); if (scalar(@rules_falsepos)) { print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n"; my(@rules_falsepos_bad) = grep {$chisquare{$_} > 0} (@rules_falsepos); if (scalar(@rules_falsepos_bad)) { @rules_falsepos_bad = sort { ($chisquare{$b} <=> $chisquare{$a}) || ($ratio_expected_falsepos{$b} <=> $ratio_expected_falsepos{$a}) || ($over_expected_falsepos{$b} <=> $over_expected_falsepos{$a}) || ($freq_over_higher_falsepos{$b} <=> $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad); foreach $rule (@rules_falsepos_bad) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falsepos{$rule} . "\t" . $over_expected_falsepos{$rule} . "\t" . $freq_over_higher_falsepos{$rule} . "\n"; } } my(@rules_falsepos_good) = grep {$chisquare{$_} < 0} (@rules_falsepos); if (scalar(@rules_falsepos_good)) { print "###\n"; @rules_falsepos_good = sort { ($chisquare{$a} <=> $chisquare{$b}) || ($ratio_expected_falsepos{$a} <=> $ratio_expected_falsepos{$b}) || ($freq_spam{$b} <=> $freq_spam{$a})} (@rules_falsepos_good); foreach $rule (@rules_falsepos_good) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falsepos{$rule} . "\t" . $over_expected_falsepos{$rule} . "\t" . $freq_over_higher_falsepos{$rule} . "\n"; } } } else { warn "No over-falsepos to print\n"; } @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg); if (scalar(@rules_falseneg)) { print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n"; my(@rules_falseneg_bad) = grep {$chisquare{$_} > 0} (@rules_falseneg); if (scalar(@rules_falseneg_bad)) { @rules_falseneg_bad = sort { ($chisquare{$b} <=> $chisquare{$a}) || ($ratio_expected_falseneg{$b} <=> $ratio_expected_falseneg{$a}) || ($over_expected_falseneg{$b} <=> $over_expected_falseneg{$a}) || ($freq_under_lower_falseneg{$b} <=> $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad); foreach $rule (@rules_falseneg_bad) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falseneg{$rule} . "\t" . $over_expected_falseneg{$rule} . "\t" . $freq_under_lower_falseneg{$rule} . "\n"; } } my(@rules_falseneg_good) = grep {$chisquare{$_} < 0} (@rules_falseneg); if (scalar(@rules_falseneg_good)) { print "###\n"; @rules_falseneg_good = sort { ($chisquare{$a} <=> $chisquare{$b}) || ($ratio_expected_falseneg{$a} <=> $ratio_expected_falseneg{$b}) || ($freq_spam{$b} <=> $freq_spam{$a})} (@rules_falseneg_good); foreach $rule (@rules_falseneg_good) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falseneg{$rule} . "\t" . $over_expected_falseneg{$rule} . "\t" . $freq_under_lower_falseneg{$rule} . "\n"; } } } else { warn "No over-falseneg to print\n"; } exit; sub readlogs { my $spam = $ARGV[0] || "spam.log"; my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log"); (open(NONSPAM,$nonspam)) || (die "Couldn't open file '$nonspam': $!; stopped"); while (defined($line = )) { if ($line =~ m/^\s*\#/) { next; } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) { my $tests = $1; my $hits = 0; my(@tests) = (); foreach $test (grep {length($_)} (split(/,+/,$tests))) { if (exists($rules{$test})) { push @tests, $test; $hits += $rules{$test}->{score}; } } if (scalar(@tests)) { $num_nonspam++; foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) { $freq_nonspam{$test}++; } if ($hits >= $higher) { $num_over_higher_falsepos++; foreach $test (grep {exists($freq_over_higher_falsepos{$_})} (@tests)) { $freq_over_higher_falsepos{$test}++; } } } } elsif ($line =~ m/\S/) { chomp($line); warn "Can't interpret line '$line'; skipping"; } } close(NONSPAM); (open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped"); while (defined($line = )) { if ($line =~ m/^\s*\#/) { next; } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) { my $tests = $1; my $hits = 0; my $plus_hits = 0; my(@tests) = (); foreach $test (grep {length($_)} (split(/,+/,$tests))) { if (exists($rules{$test})) { push @tests, $test; $hits += $rules{$test}->{score}; if ($rules{$test}->{score} > 0) { $plus_hits += $rules{$test}->{score}; } } } if (scalar(@tests)) { $num_spam++; foreach $test (grep {exists($freq_spam{$_})} (@tests)) { $freq_spam{$test}++; } if (($hits <= $lower) && $plus_hits && ($plus_hits >= $lower)) { $num_under_lower_falseneg++; foreach $test (grep {exists($freq_under_lower_falseneg{$_})} (@tests)) { $freq_under_lower_falseneg{$test}++; } } } } elsif ($line =~ m/\S/) { chomp($line); warn "Can't interpret line '$line'; skipping"; } } close(SPAM); } sub readscores { system ("../build/parse-rules-for-masses") and die "Couldn't do parse-rules-for-masses: $?; stopped"; require "./tmp/rules.pl"; }