#!/usr/bin/perl use strict; use warnings; use File::Temp (); use LWP::Simple; use URI::Escape; use Data::Dumper; my $FROM_CACHE; if (@ARGV && $ARGV[0] eq '--fromcache') { $FROM_CACHE = 1; } my $MAKE_CACHE; $MAKE_CACHE = 1; # turn this on by default, no harm # we allow promotion of rules that are "ifplugin" one of these my @def_plugins = map { s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs; $_; } ; my $PROMOTABLE_PLUGINS_RE = "^" . join("|", @def_plugins) . "\$"; ########################################################################### my $cgi_url = "http://ruleqa.spamassassin.org/"; my $doc; my $cache = 'ruleqa.cache'; my $url; if (!$FROM_CACHE || !-f $cache) { $url = $cgi_url."last-night?xml=1"; $doc = get ($url); if (!$doc) { die "HTTP get failed: $doc\n"; } if ($MAKE_CACHE) { open(O, ">$cache"); print O $doc; close O; } } else { open(I, "<$cache") or die; $doc = join('',); close I; } ########################################################################### # the HTML looks like: # # # ... # ....... # ... # # # in other words, the machine-parseable metadata is embedded in the HTML # as a microformat. my $submitters = ''; if ($doc =~ m{ \s*(.*?)\s* }sx) { $submitters = $1; } else { loghtml_die("failed to find 'mcviewing' and 'mcsubmitters' microformats"); } ########################################################################### # __HIGHBITS0 # 8.76540.20560.977 # ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1 my $plist = { }; while ($doc =~ m!(.*?)!xg) { my $xml = $1; my $obj = { }; while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)!!) { $obj->{$1} = $2; } while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)!!) { $obj->{$1} = uri_unescape($2); } my $name = $obj->{test}; $obj->{detailhref} = $cgi_url.$obj->{detailhref}; $plist->{$name} = $obj; } if (!scalar keys %$plist) { loghtml_die("no rules found?"); } sub loghtml_die { die "$_[0]\nURL: $url\ngot:\n$doc\n"; } ########################################################################### ## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump; # use SpamAssassin classes directly, so we can lint rules # as we go use lib 'lib'; use Mail::SpamAssassin; my $mailsa = Mail::SpamAssassin->new({ rules_filename => "rules", site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )), local_tests_only => 1, dont_copy_prefs => 1, config_tree_recurse => 1, keep_config_parsing_metadata => 1, # debug => 1, }); # hack hack hack!! we don't want to load plugin files twice, # and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm # to rules/*.pm, they would otherwise appear twice. foreach my $fname () { my $path = File::Spec->rel2abs($fname); $INC{$path} = 1; # warn "JMD $path"; } my %rules_with_errors = (); $mailsa->{lint_callback} = sub { my %opts = @_; # ignore non-rule-issue lint failures return if ($opts{msg} =~ /(?: score\sset\sfor\snon-existent| description\sexists )/x); warn "demoting $opts{rule}: $opts{msg}"; if ($opts{iserror}) { $rules_with_errors{$opts{rule}}++; } }; $mailsa->lint_rules(); print "# active ruleset list, automatically generated from $cgi_url\n"; print "# with results from: $submitters\n"; my @spcs = ($submitters =~ /\s+/g); if (scalar @spcs < 2) { die "not generating results; less than 3 submitter results available!\n"; } foreach my $plistkey (sort keys %$plist) { my $name = $plistkey; my $obj = $plist->{$plistkey}; my $notes = ''; # rules in sandboxes without a T_ prefix, will be renamed during the # ruleqa process... in other words, the output freqs line will talk # about rule "T_FOO". if there's a rule "FOO" defined, assume that's # the one being talked about. my $no_t = $name; if ($no_t =~ s/^T_//) { if (defined $mailsa->{conf}->{scores}->{$no_t}) { $name = $no_t; } } # now that it's ok to have sandbox rules without a T_ prefix, # "T_" prefix implies "tflags nopublish" next if ($name =~ /^T_/); # ignore rules that don't exist (if they have a desc or score, # they exist according to the Conf parser) next unless ($mailsa->{conf}->{descriptions}->{$name} || $mailsa->{conf}->{scores}->{$name}); # "nopublish" tflags my $tfs = $mailsa->{conf}->{tflags}->{$name}; if ($tfs) { next if ($tfs =~ /\bnopublish\b/); # all of these tflags force publication; # include "net", since otherwise this script has to be aware # what day of the week it is for weekly net/non-net mass-checks! # very messy. TODO? if ($tfs =~ /\b(publish|userconf|learn|net)\b/) { $notes = "tflags ".$1; goto publish; } } # ignore rules that are not marked as promotable next unless ($obj->{promo}); # only rules from "rulesrc" dirs my $src = $mailsa->{conf}->{source_file}->{$name}; next if (!$src || $src !~ /rulesrc/); # rules that fail lint next if $rules_with_errors{$name}; # that require a plugin we won't have my $skip = 0; my $ifs = $mailsa->{conf}->{if_stack}->{$name}; while ($ifs && $ifs =~ /plugin\s*\((.+?)\)/g) { my $pkg = $1; # grep out the ones we *do* have, and do use in "ifplugin" # lines in "rulesrc", here... next; #JMD: next if ($pkg =~ /${PROMOTABLE_PLUGINS_RE}/o); print "\n# not publishing $name: needs $ifs\n"; $skip++; } next if $skip; # don't output the ever-changing bits of data # $notes = "spam=$obj->{spc} ham=$obj->{hpc} so=$obj->{so}"; $notes = "good enough"; publish: print "\n# $notes\n$name\n"; }