#!/usr/bin/perl -w # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # =head1 NAME rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new scores. =head1 SYNOPSIS rewrite-cf-with-new-scores [options] Options --old-scores=file Read file containing the old SpamAssassin scores --new-scores=file Read file containing the new SpamAssassin scores -s,--scoreset n Rewrite scoreset n --output=file Output rewritten score file to file Note: these options can be shortened (i.e. --old, --new, --out) as long as they are unambiguous. =head1 DESCRIPTION B is a tool to update the sitewide scores file with the newly generated scores. Since SpamAssassin has four different scoresets, which each need to be generated separately, this tool is used to only change the correct scoreset. By default, the old scores are read from F<../rules/50_scores.cf> and the new ones from F. The output will be F<50_scores.cf> by default. If no options are given, the script will look for command line options in the following order: scoreset, old-scores, new-scores. In this case, output will go to B. The rules directory needs to be used to make sure scores are given for the right tests. Rules not found in the rules directory will not be given scores in the output. =head1 BUGS Please report bugs to http://bugzilla.spamassassin.org/ =head1 SEE ALSO L, L, L =cut use strict; use warnings; use Getopt::Long qw(:config auto_help); use Pod::Usage; our ($opt_old, $opt_new, $opt_scoreset, $opt_out, $opt_cffile); GetOptions("old-scores=s" => \$opt_old, "new-scores=s" => \$opt_new, "cffile=s" => \$opt_cffile, "s|scoreset=i" => \$opt_scoreset, "output=s" => \$opt_out); # Backwards compatibility mode if (!defined($opt_old) && !defined($opt_new) && !defined($opt_scoreset) && !defined($opt_out)) { ($opt_scoreset, $opt_old, $opt_new) = @ARGV; } if (!defined $opt_out) { $opt_out = "-"; #STDOUT } if (!defined $opt_cffile) { $opt_cffile = '../rules'; } if (!defined $opt_scoreset) { $opt_scoreset = 0; } $opt_new ||= "perceptron.scores"; $opt_old ||= "../rules/50_scores.cf"; $opt_out ||= "50_scores.cf"; my $NUM_SCORESETS = 4; my $ZERO_MINISCULE_SCORES = 0; my $MINISCULE_THRESHOLD = 0.01; # points my $UNZERO_META_PREDICATES = 1; my $IGNORE_ZEROED_TEST_RULES = 1; if ($opt_scoreset < 0 || $opt_scoreset >= $NUM_SCORESETS) { pod2usage("scoreset $opt_scoreset out of range 0 - " . ($NUM_SCORESETS-1)); } # Open output open(OUT, ">$opt_out"); # scores are broken into three regions: # 1. "pre" (stuff before generated mutable scores) # 2. "gen" (first generated mutable scores section) # 3. "end" (stuff after generated mutable scores) # 4. "gen2" (any later generated mutable scores sections) # variables filled-out in read_rules() our %rules; # rules data # variables filled-out in read_gascores() my %gascores = (); # generated scores # variables filled-out in read_ranges_data() my %range_zeroed_rules = (); # zeroed by score-ranges-from-freqs # variables filled-out in read_oldscores() my $pre = ''; # stuff before first "gen" section my $end = ''; # stuff after first "gen" section my %oldscores; # old scores my %comment; # "gen" rule comments my %fixed; # scores that are fixed (non-gen) my %gen2; # scores that are gen in the $end string # compiled output my @gen_order = (); my %gen_lines = (); # read stuff in read_rules(); read_gascores(); read_oldscores(); # TODO: read_ranges.data read_ranges_data(); sub read_ranges_data { # used to pick up rules zeroed by score-ranges-from-freqs, e.g. # "ignoring 'FM_MORTGAGE6PLUS': score and range == 0" # which results in: # tmp/ranges.data:0.000 0.000 0 FM_MORTGAGE6PLUS open (IN, ") { /^(\S+) (\S+) (\S+) (\S+)/ or next; my ($min, $max, $mutable, $name) = ($1,$2,$3,$4); if ($min == 0 && $max == 0 && $mutable == 0) { $range_zeroed_rules{$name} = 1; } } close IN; } build_new_scores(); if ($ZERO_MINISCULE_SCORES) { fixup_miniscule_scores(); } if ($UNZERO_META_PREDICATES) { fixup_meta_predicates(); } $end = sub_gen2($end); # write stuff out print OUT $pre; print_gen(); print OUT $end; exit; sub read_rules { my $tmpf = "tmp/rules$$.pl"; system "../build/parse-rules-for-masses ". "-d \"$opt_cffile\" -s $opt_scoreset -o $tmpf" and die; require $tmpf; unlink $tmpf; } sub read_gascores { open (STDIN, "<$opt_new") or die "cannot open $opt_new"; while () { next unless /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/; my $name = $1; my $score = $2; # various things we should be concerned about if (!exists $rules{$name}) { warn "$name is not defined in tmp/rules.pl\n"; next; } if ($rules{$name}->{issubrule}) { warn "$name is an indirect sub-rule in tmp/rules.pl\n"; next; } if ($rules{$name} =~ /^__/) { warn "$name has an indirect sub-rule \"__\" prefix\n"; next; } if ($name eq '(null)') { warn "$name is (null)\n"; next; } $gascores{$name} = $score; } } sub read_oldscores { open (IN, "<$opt_old") or die "cannot open $opt_old"; # state of things my $where = "pre"; # region of original scores file that we're in my $seen_gen = 0; # have we seen the first tag? # read everything in while (my $line = ) { if ($line =~ /<\/gen:mutable>/) { $where = "end"; } if ($where eq "pre") { readline_fix($line); $pre .= $line; } elsif ($where eq "gen") { readline_gen($line); } elsif ($where eq "gen2") { readline_gen2($line); $end .= $line; } elsif ($where eq "end") { readline_fix($line); $end .= $line; } if ($line =~ //) { if ($seen_gen) { $where = "gen2"; } else { $where = "gen"; $seen_gen = 1; } } } } # used for both "pre" and "end" sub readline_fix { my ($line) = @_; my $comment; if ($line =~ s/\s*#\s*(.*)//) { $comment = $1; } if ($line =~ /^\s*score\s+(\S+)\s/) { my (undef, $name, @scores) = split(' ', $line); $fixed{$name}++; $comment{$name} = $comment if $comment; } } sub readline_gen { my ($line) = @_; my $comment; if ($line =~ s/\s*#\s*(.*)//) { $comment = $1; $comment =~ s/ n=$opt_scoreset//; } if ($line =~ /^\s*score\s+(\S+)\s/) { my (undef, $name, @scores) = split(' ', $line); for (my $i = 1; $i < $NUM_SCORESETS; $i++) { $scores[$i] = $scores[0] unless defined $scores[$i]; } @{$oldscores{$name}} = @scores; $comment{$name} = $comment if $comment; } } sub readline_gen2 { my ($line) = @_; my $comment; if ($line =~ s/\s*#\s*(.*)//) { $comment = $1; $comment =~ s/ n=$opt_scoreset//; } if ($line =~ /^\s*score\s+(\S+)\s/) { my (undef, $name, @scores) = split(' ', $line); for (my $i = 1; $i < $NUM_SCORESETS; $i++) { $scores[$i] = $scores[0] unless defined $scores[$i]; } @{$oldscores{$name}} = @scores; $comment{$name} = $comment if $comment; $gen2{$name}++; } } sub build_new_scores { # we just consider scores for this set that are in the input or were in the # "gen" region from the old scores, tmp/rules.pl is not considered here my %gen; # rules to be printed in "gen" region $gen{$_} = 1 for keys %gascores; # scores for this set from GA $gen{$_} = 1 for keys %oldscores; # original scores in "gen" region $gen{$_} = 1 for keys %range_zeroed_rules; # zeroed by range script # remove fixed scores for (keys %fixed) { delete $gen{$_}; } # sort all generated rules by name for my $name (sort keys %gen) { next if ($rules{$name}->{lang}); # "lang es" rules etc. next if ($rules{$name}->{issubrule}); # indirect sub-rules next if ($name eq 'AWL'); # dynamic score my @scores = (); my $comment = ''; $comment = $comment{$name} if defined $comment{$name}; # use the old scores if they existed @scores = @{$oldscores{$name}} if exists $oldscores{$name}; # set appropriate scoreset value if (defined $gascores{$name}) { $scores[$opt_scoreset] = $gascores{$name}; delete $oldscores{$name}; } elsif ($range_zeroed_rules{$name}) { if ($IGNORE_ZEROED_TEST_RULES && $name =~ /^T_/) { next; } # zeroed pre-evolver, by score-ranges-from-freqs $scores[$opt_scoreset] = 0; if (defined $oldscores{$name}) { $comment .= " n=$opt_scoreset"; } } else { # zero for current scoreset if there was no new score; # when the perceptron does this for mutable rules, it means # that score had a new score of 0 $scores[$opt_scoreset] = 0; if (defined $oldscores{$name}) { $comment .= " n=$opt_scoreset"; #warn "$name has no GA score, but had a score before\n"; } } # sort and unique comment tags my %unique; $unique{$_} = 1 for split(' ', $comment); $comment = join(' ', sort keys %unique); push (@gen_order, $name); $gen_lines{$name} = { scores => \@scores, comment => $comment }; } } sub new_score_line { my ($name) = @_; # create new score line my @scores = @{$gen_lines{$name}{scores}}; my $comment = $gen_lines{$name}{comment}; return sprintf("score %s %s%s", $name, join(" ", generate_scores($name, @scores)), ($comment) ? ' # ' . $comment : ''); } sub print_gen { print OUT "\n"; foreach my $name (@gen_order) { next if ($gen2{$name}); # will do that separately print OUT new_score_line($name), "\n"; } print OUT "\n"; } sub sub_gen2 { my $end = shift; foreach my $name (keys %gen2) { if ($end !~ s/^\s*score\s+${name}\s.+?$/ new_score_line($name); /em) { # we failed to sub it; output score in main gen:mutable block instead delete $gen2{$name}; } } $end; } sub generate_scores { my ($name, @scores) = @_; my $isnet = 0; my $islearn = 0; if (defined $rules{$name}->{tflags}) { $isnet = ($rules{$name}->{tflags} =~ /\bnet\b/); $islearn = ($rules{$name}->{tflags} =~ /\blearn\b/); } # set defaults if not already set if (!defined $scores[0]) { warn "$name does not have a default score\n"; $scores[0] ||= 0; } my $flag = 1; for (my $i = 1; $i < $NUM_SCORESETS; $i++) { $scores[$i] = $scores[0] unless defined $scores[$i]; $flag = 0 if ($scores[$i] != $scores[$i-1]); }; # enforce rule/scoreset rules. # net rules never have a non-zero score in sets 0 and 2 for (my $i = 0; $i < $NUM_SCORESETS; $i++) { if ($isnet && ($i & 1) == 0) { $scores[$i] = 0; $flag = 0 if ($i > 0 && $scores[$i] != $scores[$i-1]); } if ($islearn && ($i & 2) == 0) { $scores[$i] = 0; $flag = 0 if ($i > 0 && $scores[$i] != $scores[$i-1]); } } if ($flag) { splice @scores, 1; } return @scores; } sub fixup_miniscule_scores { my $num_fixed = 0; foreach my $name (@gen_order) { my @scores = @{$gen_lines{$name}{scores}}; if (abs($scores[$opt_scoreset]) < $MINISCULE_THRESHOLD) { $scores[$opt_scoreset] = 0; $num_fixed++; } @{$gen_lines{$name}{scores}} = @scores; } warn "zeroed $num_fixed scores for being 'miniscule'.\n"; } sub fixup_meta_predicates { # this is the opposite of t/meta.t while (my ($name, $info) = each %rules) { next if ($name eq '_scoreset'); my $type = $info->{type} || "unknown"; # look at meta rules that are not disabled next unless ($type eq "meta" && ($name =~ /^__/ || $info->{score} != 0)); next unless ($info->{depends}); # test rules should not impose requirements on release rules; ignore # any dependency requirements caused by T_ rules next if $name =~ /^T_/; for my $depend (@{ $info->{depends} }) { if (!exists $rules{$depend}) { warn "$name depends on $depend which is nonexistent\n"; next; } # if dependency is a predicate, it'll run next if $depend =~ /^__/; # not a generated rule? not our problem, then; t/meta.t will catch it next unless (exists $gen_lines{$depend}); # ignore "tflags net" and "tflags learn" rules -- it is OK # for those to have zero scores in some scoresets, for obvious # reasons. next if (defined $rules{$depend}->{tflags} && $rules{$depend}->{tflags} =~ /\b(?:net|learn)\b/); # if dependency has a non-zero score, it'll run my $depscore = $gen_lines{$depend}{scores}[$opt_scoreset]; next if (defined $depscore && $depscore != 0); warn "fix meta dep: $name depends on $depend with 0 score, fixing at non-0\n"; $gen_lines{$depend}{scores}[$opt_scoreset] = 0.001; } } }