#!/usr/bin/perl

use strict;
use warnings;

use File::Spec;
use File::Temp ();
use LWP::Simple;
use URI::Escape;
use Data::Dumper;

my $FROM_CACHE;
if (@ARGV && $ARGV[0] eq '--fromcache') {
  $FROM_CACHE = 1;
}

my $MAKE_CACHE;
$MAKE_CACHE = 1;        # turn this on by default, no harm

# we allow promotion of rules that are "ifplugin" one of these
my @def_plugins = map {
      s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs; $_;
    } <lib/Mail/SpamAssassin/Plugin/*.pm>;

my $PROMOTABLE_PLUGINS_RE = "^" . join("|", @def_plugins) . "\$";

# number of days to look back; if a rule isn't listed as promotable on
# all of these days, it won't be listed.  (we grant an exception for
# new rules that didn't exist on previous days, however, so new rules
# can be published quickly to handle sudden outbreaks without requiring
# manual update work)
my @DAYS_REQUIRED = (1, 2, 3);

# S/O threshold required
my $SO_THRESHOLD = .8;

###########################################################################

print q{

  Bad performing rules, from the past 3 nights' mass-checks.

  (Note: 'net' rules will be listed as 'no hits' unless you set
  'tflags net'.  This also applies for meta rules which use 'net' rules.)

};

my $cgi_url = "http://ruleqa.spamassassin.org/";

my @doc = ();
my $cache = 'ruleqa.cache.';
my $submitters = '';
my $url;            # tracks the URL for the last day used
my $dayoffset = 0;

foreach my $day (@DAYS_REQUIRED) {
  if (!$FROM_CACHE || !-f $cache.$day) {
with_new_offset:
    $url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1";
    warn "HTTP get: $url\n";
    $doc[$day] = get($url);
    if (!$doc[$day]) {
      die "HTTP get failed: $url\n";
    }

    if ($MAKE_CACHE) {
      if (open(O, ">$cache$day")) {
        print O $doc[$day];
        close O;
      }
    }
  }
  else {
    open(I, "<$cache$day") or die "cannot read cache file $cache$day";
    $doc[$day] = join('', <I>);
    close I;
  }

  #########################################################################

  # the HTML looks like:
  #
  #   <!--
  #     <mcviewing> ... </mcviewing>
  #     <mcsubmitters> ....... </mcsubmitters>
  #     ...
  #   -->
  #
  # in other words, the machine-parseable metadata is embedded in the HTML
  # as a microformat.

  if ($doc[$day] =~ m{
        <mcsubmitters>\s*(.*?)\s*</mcsubmitters>
      }sx)
  {
    my $daysubs = $1;

    # ignore days when the mass-check sets contain a --net log, since
    # it's the weekly --net run.  That generally contains a much
    # smaller set of logs (since it takes longer to run mass-check --net)
    # so the results are untrustworthy.
    if ($daysubs =~ /(?:^|\s)net-/) {
      warn "day $day contains a --net mass-check! offsetting by an extra day\n";
      $dayoffset++;
      goto with_new_offset;
    }

    ($submitters ne '') and $submitters .= "; ";
    $submitters .= "day $day: $daysubs";
  }
  else {
    loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day");
  }
}

###########################################################################

# each rule's results are embedded in the XML output like so
# (whitespace added here for clarity):
#
#   <rule>
#     <test>__HIGHBITS</test> <promotable>0</promotable>
#     <spc>8.7654</spc> <hpc>0.2056</hpc> <so>0.977</so>
#     <detailhref esc='1'>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref>
#   </rule>

my $plist;
foreach my $day (@DAYS_REQUIRED) {
  while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) {
    my $xml = $1;
    my $obj = { };
    while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) {
      $obj->{$1} = $2;
    }
    while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) {
      $obj->{$1} = uri_unescape($2);
    }

    my $name = $obj->{test};
    $obj->{detailhref} = $cgi_url.$obj->{detailhref};
    $plist->[$day]->{$name} = $obj;
  }

  if (!scalar keys %{$plist->[$day]}) {
    loghtml_die("no rules found? on day $day");
  }
}
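
# A sketch of the parsed structure at this point: one hash per rule per
# night (values illustrative, taken from the sample XML above; note that
# 'detailhref' has been unescaped and made absolute by the loop):
#
#   $plist->[1]->{'__HIGHBITS'} = {
#     test => '__HIGHBITS',
#     spc  => '8.7654',    # % of spam mails hit
#     hpc  => '0.2056',    # % of ham mails hit
#     so   => '0.977',     # S/O: spam hits as a fraction of all hits
#     detailhref => 'http://ruleqa.spamassassin.org/ruleqa?daterev=last-night&rule=__HIGHBITS&s_detail=1',
#   };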
###########################################################################

## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump;

# use SpamAssassin classes directly, so we can lint rules
# as we go
use lib 'lib';
use Mail::SpamAssassin;

my $mailsa = Mail::SpamAssassin->new({
    rules_filename => "rules",
    site_rules_filename => join("\000", qw(
            rulesrc/core
            rulesrc/sandbox
        )),
    local_tests_only => 1,
    dont_copy_prefs => 1,
    config_tree_recurse => 1,
    keep_config_parsing_metadata => 1,
    # debug => 1,
});

# hack hack hack!!  we don't want to load plugin files twice,
# and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm
# to rules/*.pm, they would otherwise appear twice.
foreach my $fname (<rules/*.pm>) {
  my $path = File::Spec->rel2abs($fname);
  $INC{$path} = 1;
  # warn "JMD $path";
}

my %rules_with_errors = ();
my %killed_rules = ();
my %count_rules = ();
my %report_bad_subrules = ();

$mailsa->{lint_callback} = sub {
  my %opts = @_;

  # ignore lint failures that are not rule issues
  return if ($opts{msg} =~ /(?:
        score\sset\sfor\snon-existent|
        description\sexists
      )/x);

  warn "demoting $opts{rule}: $opts{msg}";
  if ($opts{iserror}) {
    $rules_with_errors{$opts{rule}}++;
  }
};

$mailsa->lint_rules();

# print "# active ruleset list, automatically generated from $cgi_url\n";
# print "# with results from: $submitters\n";

# crude sanity check: count the whitespace runs in the submitters string;
# too few means we don't have results from enough nights to judge rules
my @spcs = ($submitters =~ /\s+/g);
if (scalar @spcs < 2) {
  die "not generating results; less than 3 submitter results available!\n";
}

# base most of our decisions off day 1 (last night's mass-checks).
# note: meta rules must come before their __SUBRULES in this sort;
# default lexical sort will do this.
foreach my $plistkey (sort keys %{$plist->[1]}) {
  my $name = $plistkey;
  my $plistobj = $plist->[1]->{$plistkey};
  my $notes = '';

  # rules in sandboxes without a T_ prefix will be renamed during the
  # ruleqa process... in other words, the output freqs line will talk
  # about rule "T_FOO".  if there's a rule "FOO" defined, assume that's
  # the one being talked about.
  my $no_t = $name;
  if ($no_t =~ s/^T_//) {
    if (defined $mailsa->{conf}->{scores}->{$no_t}) {
      $name = $no_t;
    }
  }

  # ignore rules that don't exist (if they have a desc or score,
  # they exist according to the Conf parser)
  next unless ($mailsa->{conf}->{descriptions}->{$name}
            || $mailsa->{conf}->{scores}->{$name});

  my $tfs = $mailsa->{conf}->{tflags}->{$name} || '';
  my $src = $mailsa->{conf}->{source_file}->{$name};
  $count_rules{$src}++;

  # skip rules with these tflags; we cannot judge them without more data
  if ($tfs =~ /\b(?:userconf|learn|net)\b/) {
    next;
  }

  # rules that fail lint
  next if $rules_with_errors{$name};

  # subrules whose parent rules are ok
  if ($name =~ /^__/ && !$report_bad_subrules{$name}) {
    # print "  # ignoring subrule $name: parent rules seem fine\n";
    next;
  }

  # certain tests need to be reversed for "nice" rules
  my $is_nice = 0;
  if ($tfs =~ /\bnice\b/) { $is_nice = 1; }

  my $valid = 1;        # number of nights the rule appears in
  my $so = $plist->[1]->{$plistkey}->{so};
  if (defined $plist->[2]->{$plistkey}->{so}) {
    $so += $plist->[2]->{$plistkey}->{so}; $valid++;
  }
  if (defined $plist->[3]->{$plistkey}->{so}) {
    $so += $plist->[3]->{$plistkey}->{so}; $valid++;
  }
  $so /= $valid;        # average across the nights the rule appeared in

  my $adj_so;
  if ($is_nice) {
    $adj_so = 1.0 - $so;    # map 0.0 => 1.0
  } else {
    $adj_so = $so;
  }

  next unless ($adj_so < $SO_THRESHOLD);
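
  # Worked example of the threshold test above, using the __HIGHBITS
  # sample figures:
  #
  #   S/O = Spam% / (Spam% + Ham%) = 8.7654 / (8.7654 + 0.2056) = 0.977
  #
  # For a non-nice rule, adj_so = 0.977 >= 0.8, so the rule is skipped as
  # fine; a 'nice' rule with the same figures would score
  # adj_so = 1.0 - 0.977 = 0.023 < 0.8 and fall through to be reported.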
  my $target = ($is_nice ? 'hpc' : 'spc');
  my $spc = $plist->[1]->{$plistkey}->{$target};
  $spc += $plist->[2]->{$plistkey}->{$target} || 0;
  $spc += $plist->[3]->{$plistkey}->{$target} || 0;
  $spc /= $valid;

  $target = ($is_nice ? 'spc' : 'hpc');
  my $hpc = $plist->[1]->{$plistkey}->{$target};
  $hpc += $plist->[2]->{$plistkey}->{$target} || 0;
  $hpc += $plist->[3]->{$plistkey}->{$target} || 0;
  $hpc /= $valid;

  if ($spc <= 0.0001) {
    if ($hpc <= 0.0001) {
      badrule($name, "no hits at all");
    } else {
      badrule($name, "no hits of target type");
    }
    next;
  }

  badrule($name, "bad, avg S/O=".sprintf("%.2f", $so)." ".
                 "avg Spam%=".sprintf("%.2f", $spc)." ".
                 "avg Ham%=".sprintf("%.2f", $hpc));
}

foreach my $srcfile (reverse sort keys %killed_rules) {
  my $set = $killed_rules{$srcfile};
  my $count = $count_rules{$srcfile};
  my $c_bad = scalar keys %{$set};

  print "\n$srcfile ($count rules, $c_bad bad):\n\n";
  foreach my $name (sort keys %{$set}) {
    my $reason = $set->{$name};
    print "  $name: $reason\n";
  }
}
exit;

sub badrule {
  my ($name, $reason) = @_;
  my $src = $mailsa->{conf}->{source_file}->{$name};
  $killed_rules{$src}->{$name} = $reason;

  # if it's a subrule used in a meta rule, note this.
  # TODO: this only works reliably for lexically-previous meta rules;
  # that's ok for __SUBRULES used in META_RULES, since "M" < "_".
  if ($report_bad_subrules{$name}) {
    $killed_rules{$src}->{$name} .= "\n # used in:$report_bad_subrules{$name}";
  }

  # if it's a meta rule, note that we can complain about its subrules too
  foreach my $r (split ' ', $mailsa->{conf}->{meta_dependencies}->{$name} || '') {
    $report_bad_subrules{$r} .= " ".$name;
  }
}

sub loghtml_die {
  die "$_[0]\nURL: $url\n";
}
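
# Usage sketch (the script filename here is hypothetical; run from the top
# of a SpamAssassin source checkout so 'lib', 'rules' and 'rulesrc' resolve):
#
#   perl list-bad-rules                # fetch fresh XML from ruleqa
#   perl list-bad-rules --fromcache    # reuse cached ruleqa.cache.* files
#
# and illustrative output, one block per rule source file:
#
#   rulesrc/sandbox/example/20_foo.cf (12 rules, 2 bad):
#
#     FOO_NO_HITS: no hits at all
#     FOO_WEAK: bad, avg S/O=0.43 avg Spam%=0.03 avg Ham%=0.04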