#!/usr/bin/perl # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # any tests that get less than this % of matches on *both* spam or nonspam, are # reported. my $LOW_MATCHES_PERCENT = 0.03; my $scoreset = 0; sub usage { die " lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios from a mass-check logfile pair. The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies -x -p' output). The 'falsefreqs' argument is frequencies of hits in false-positives and false-negatives only ('hit-frequencies -x -p -f' output). "; } my $opt_falsefreqs; while ($#ARGV >= 0) { $_ = shift @ARGV; if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; } elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; } else { usage(); } } print "BAD TESTS REPORT\n"; readrules(); print "\n" .((scalar keys %rulefile) + 1). " rules found.\n"; print "\nRule file syntax issues:\n\n"; lintrules(); if ($opt_falsefreqs) { open (FALSE, "<$opt_falsefreqs"); while () { if (!/^\s*([\d\.]+)/) { my ($overall, $spam, $nons, $so, $score, $name) = split (' '); next unless ($name =~ /\S/); $falsefreqs_spam{$name} = $spam; $falsefreqs_nons{$name} = $nons; $falsefreqs_so{$name} = $so; } } close FALSE; } while (<>) { if (!/^\s*([\d\.]+)/) { $output{'a_header'} = $_; next; } my $badrule; my ($overall, $spam, $nons, $so, $score, $name) = split (' '); next unless ($name =~ /\S/); my $ffspam = $falsefreqs_spam{$name}; my $ffnons = $falsefreqs_nons{$name}; my $ffso = $falsefreqs_so{$name}; my $tf = $tflags{$name}; next if ($tf =~ /net/ && ($scoreset % 2) == 0); next if ($tf =~ /userconf/); if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) { # sanity! $badrule = 'no matches'; } else { if ($score < 0.0) { # negative score with more spams than nonspams? bad rule. if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) { $badrule = 'non-nice but -ve score'; } if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) { if ($ffso < 0.5) { $badrule = 'fn'; } else { # ignore, the FNs are overridden by other tests so it doesn't # affect the overall results. } } # low number of matches overall if ($nons < $LOW_MATCHES_PERCENT) { $badrule ||= ''; $badrule .= ', low matches'; } } elsif ($score > 0.0) { # positive score with more nonspams than spams? bad. if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) { $badrule = 'nice but +ve score'; } if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) { if ($ffso > 0.5) { $badrule = 'fp'; } else { # ignore, the FPs are overridden by other tests so it doesn't # affect the overall results. } } # low number of matches overall if ($spam < $LOW_MATCHES_PERCENT) { $badrule ||= ''; $badrule .= ', low matches'; } } elsif ($score == 0.0) { $badrule = 'score is 0'; } } if (defined $badrule) { $badrule =~ s/^, //; chomp; $output{$badrule} .= $_ . " ($badrule)\n"; } } # do all but 'no/low matches' first print "\nHigh-priority issues:\n\n"; foreach my $badness (sort keys %output) { next if ($badness eq 'no matches'); next if ($badness eq 'low matches'); print $output{$badness}; delete $output{$badness}; } # now go back and do the other 2 (if they're there) print "\nLow-priority issues:\n\n"; foreach my $badness (sort keys %output) { next unless defined ($output{$badness}); print $output{$badness}; delete $output{$badness}; } exit; sub concat_rule_lang { my $rule = shift; my $lang = shift; if (defined $lang && $lang ne '') { return "[$lang]_$rule"; } else { return $rule; } } # note: do not use parse-rules-for-masses here, we need to do linting instead # of your average parse sub readrules { my @files = <../rules/[0-9]*.cf>; my $file; %rulesfound = (); %langs = (); foreach $file (@files) { open (IN, "<$file"); while () { s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/; # make all the foo-bar stuff foo_bar 1 while s/^(\S+)-/\1_/g; 1 while s/^(lang\s+\S+\s+\S+)-/\1_/g; my $lang = ''; if (s/^lang\s+(\S+)\s+//) { $lang = $1; $langs{$1} = 1; } if (/^(header|rawbody|body|full|uri|meta|mimeheader)\s+(\S+)\s+/) { $rulesfound{$2} = 1; $rulefile{$2} ||= $file; $scorefile{$1} = $file; $score{$2} ||= 1.0; $tflags{$2} ||= ''; $descfile{$2} ||= $file; # a rule with no score or desc is OK $description{$2}->{$lang} = undef; if (/^body\s+\S+\s+eval:/) { # ignored } elsif (/^body\s+\S+\s+(.*)$/) { my $re = $1; # If there's a ( in a rule where it should be (?:, flag it. # but ignore [abc(] ... if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { print "warning: non-(?:...) capture in regexp in $file: $_\n"; } if ($re =~ /\.[\*\+]/) { print "warning: .* in regexp in $file: $_\n"; } if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) { if ($1 > 120 || $2 > 120) { print "warning: long .{n} in regexp in $file: $_\n"; } } } } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) { $rulesfound{$1} = 1; $descfile{concat_rule_lang ($1, $lang)} ||= $file; $descfile{$1} ||= $file; $description{$1}->{$lang} = $2; } elsif (/^tflags\s+(\S+)\s+(.+)$/) { $rulesfound{$1} = 1; $tflags{$1} = $2; $tflagsfile{concat_rule_lang ($1, $lang)} = $file; $tflagsfile{$1} = $file; } elsif (/^score\s+(\S+)\s+(.+)$/) { $rulesfound{$1} = 1; $scorefile{concat_rule_lang ($1, $lang)} = $file; $scorefile{$1} = $file; $score{$1} = $2; } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap| clear_terse_report_template|terse_report| required_score|ok_locales|ok_languages|test|lang| spamphrase|whitelist_from|require_version| clear_unsafe_report_template|unsafe_report| (?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam| (?:bayes_)?auto_learn )/x) { next; } else { print "warning: unknown rule in $file: $_\n"; } } close IN; } @langsfound = sort keys %langs; @rulesfound = sort keys %rulesfound; } sub lintrules { my %possible_renames = (); foreach my $rule (@rulesfound) { my $match = $rule; $match =~ s/_\d+[^_]+$//gs; # trim e.g. "_20K" $match =~ s/[^A-Z]+//gs; # trim numbers etc. if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) { $possible_renames{$match} .= " ".$rule; } $possible_rename_matches{$rule} = $match; } foreach my $lang ('', @langsfound) { foreach my $baserule (@rulesfound) { next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ ); my $rule = concat_rule_lang ($baserule, $lang); my $f = $descfile{$rule}; my $warned = ''; if (defined $f && !defined ($rulefile{$rule}) && !defined ($rulefile{$baserule})) { print "warning: $baserule has description, but no rule: $f\n"; $warned .= ' lamedesc'; } # Check our convention for rule length if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) { print "warning: $baserule has a name longer than 22 chars: $f\n"; } # Check our convention for rule length if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) { print "warning: $baserule has a description longer than 50 chars: $f\n"; } # lang rule trumps normal rule $f = $rulefile{$rule} || $rulefile{$baserule}; # if the rule exists, and the language/rule description doesn't exist ... if ( defined $f && !defined $description{$baserule}->{$lang} ) { print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n"; $warned .= ' lamedesc'; } $f = $scorefile{$rule}; if (defined $f && !defined ($rulefile{$rule}) && !defined ($rulefile{$baserule})) { print "warning: $baserule has score, but no rule: $f\n"; $warned .= ' lamescore'; } my $r = $possible_rename_matches{$rule}; if ($warned ne '' && defined $r) { my @matches = split (' ', $possible_renames{$r}); if (scalar @matches != 0) { my $text = ''; # now try and figure out "nearby" rules with no description/score foreach my $baser (@matches) { my $blang; if ($descfile{$rule} =~ /text_(\S\S)\./) { $blang = $1; } my $r = concat_rule_lang ($baser, $blang); #warn "$r $descfile{$r} $descfile{$baser}"; next if ($warned =~ /lamedesc/ && (defined $descfile{$r})); next if ($warned =~ /lamescore/ && (defined $scorefile{$r})); $text .= " $baser"; } if ($text ne '') { print "warning: (possible renamed rule? $text)\n"; } } } } } }