#!/usr/bin/perl -w
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

logs-to-c - Convert a mass-check log into perceptron format

=head1 SYNOPSIS

logs-to-c [options]

 Options:
    -c,--cffile=path      Use path as the rules directory
    -s,--scoreset=n       Use scoreset n
    --spam=file           Location of spam mass-check log
    --ham=file            Location of ham mass-check log

=head1 DESCRIPTION

B<logs-to-c> will read the mass-check logs F<spam.log> and F<ham.log>,
or the files specified by the B<--spam> and B<--ham> options, and convert
them into the format needed by the perceptron.  This is a format that is
simple for the perceptron to parse, but is not very readable to humans.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<perceptron(1)>

=cut

use Getopt::Long qw(:config auto_help bundling);
use strict;

# Command-line options and their defaults.
our $opt_cffile = "../rules";
our $opt_spam = 'spam.log';
our $opt_ham = 'ham.log';
our $opt_scoreset = 0;

# The usage text advertises -c and -s.  With "bundling" enabled Getopt::Long
# does not abbreviate long names to a single dash + letter, so the short
# forms have to be declared explicitly as aliases.
GetOptions(
  "cffile|c=s"   => \$opt_cffile,
  "spam=s"       => \$opt_spam,
  "ham=s"        => \$opt_ham,
  "scoreset|s=i" => \$opt_scoreset,
) or die "logs-to-c: invalid command line, try --help\n";

my $is_spam = '';               # bit vector aligned with @tests_hit
my @tests_hit = ();             # one packed list of rule hits per message
my %mutable_tests = ();         # rule name => 1 if the perceptron may tune it

# %rules and %scores are filled in by the file that readscores() require()s,
# which is why they have to be package variables.
our (%rules, %allrules, %scores);

my (%ignored_rule, %range_lo, %range_hi);
my %rule_to_index;

# State for the freeze/thaw code below.  It is declared up here because the
# main sequence ends in "exit 0": an initializer placed after that statement
# would never be executed.  Index 0 is never handed out (new_short()
# pre-increments), so "$long_to_short{$name} || new_short($name)" is safe.
my $short_index = 0;
my %long_to_short;
my @short_to_long;

my ($num_tests, $num_spam, $num_ham);

readscores();

print "Reading per-message hit stat logs and scores...\n";
read_ranges();
readlogs();

print "Writing logs and current scores as C code...\n";
writescores_c();

# show memory usage before we exit
# print "Running \"ps aux\"...\n";
# open(PS, "ps aux|");
# while(<PS>) {
#   print if $. == 1 || /\b$$\b/;
# }
# close(PS);

exit 0;

# code to freeze/thaw test lines in as little space as possible
# this could be faster, but improves memory usage by a phenomenal
# amount over arrayrefs or strings of comma-separated-values

# Allocates the next short integer id for the rule name in $_[0], records
# the mapping in both directions and returns the new id.
sub new_short {
  my ($name) = @_;
  $short_index++;
  $long_to_short{$name} = $short_index;
  $short_to_long[$short_index] = $name;
  return $short_index;
}

# Packs the rule names in the array ref $_[0] into a string of BER
# compressed integers ("w" pack template), allocating ids as needed.
# uses less than half the memory of join on ',' and even better
# compared to Storable::freeze
sub freeze_tests {
  return pack("w*", map { $long_to_short{$_} || new_short($_) } @{$_[0]});
}

# Inverse of freeze_tests(): returns the list of rule names stored in the
# packed string $_[0].
sub thaw_tests {
  return map { $short_to_long[$_] } unpack("w*", $_[0]);
}

# Reads the spam and ham mass-check logs.  For every usable log line this
# appends the packed list of known, non-subrule hits to @tests_hit, sets the
# matching bit in $is_spam, and updates $num_spam / $num_ham / $num_tests.
# Dies if either log cannot be opened.
sub readlogs {
  my $count = 0;
  $num_spam = $num_ham = 0;

  # The spam flag is tied to the position in this list rather than to the
  # file name, so it stays correct even if both options name the same file.
  foreach my $source ([$opt_spam, 1], [$opt_ham, 0]) {
    my ($file, $isspam) = @$source;

    open(my $in, '<', $file)
      or die "Could not open file '$file': $!";

    while (defined(my $msgline = <$in>)) {
      # faster log-reading code from hit-frequencies.
      # A message without any hits has an empty rules field, i.e. two
      # consecutive spaces between the path and the "time=..." field.
      # split(' ') would swallow the empty field, so the first split only
      # peels off the first two fields and the rest of the line is split
      # on a literal single space instead.
      my ($caught, undef, $restofline) = split(' ', $msgline, 3);
      next unless (defined $caught && $caught =~ /^[Y\.]$/ && $restofline);

      # second field of the remainder is the comma-separated rule list
      my (undef, $rules) = split(/ /, $restofline, 3);
      $rules = '' unless defined $rules;

      # get tests, but ignore unknown tests and subrules
      my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} }
        split(/,/, $rules);

      if ($isspam) {
        $num_spam++;
        vec($is_spam, $count, 1) = 1;
      } else {
        $num_ham++;
        vec($is_spam, $count, 1) = 0;
      }

      # inlined for speed.
      # ORIGINAL: $tests_hit[$count] = freeze_tests(\@tests);
      $tests_hit[$count] = pack("w*", map {
          $long_to_short{$_} || new_short($_);
        } @tests);

      # TODO: benchmark using foreach(), map() is often slower

      $count++;                 # increment line
    }
    close($in);
  }
  $num_tests = $count;
}

# Runs ../build/parse-rules-for-masses on the rules directory and loads the
# Perl file it generates, which populates %rules and %scores.  Dies if the
# helper exits with a non-zero status or the generated file cannot be loaded.
sub readscores {
  print "Reading scores from \"$opt_cffile\"...\n";

  my $tmpf = "tmp/rules$$.pl";

  # List form: no shell is involved, so odd characters in the rules path
  # cannot be misinterpreted.
  system("../build/parse-rules-for-masses",
         "-d", $opt_cffile, "-s", $opt_scoreset, "-o", $tmpf) == 0
    or die "parse-rules-for-masses failed (wait status $?)";

  # "." is no longer in @INC on current perls, so a bare relative path
  # would not be found; an explicit "./" makes require() use it as given.
  require "./$tmpf";
  unlink $tmpf;

  %allrules = %rules;           # ensure it stays global
}

# Writes tmp/scores.data (one record per non-ignored rule: index, name,
# current score, mutability, range) and tmp/scores.h (C declarations sized
# to match), fills %rule_to_index, then calls writetests_c().
sub writescores_c {
  my $output = '';
  my $size = 0;
  my $mutable = 0;

  # jm: now, score-ranges-from-freqs has tflags to work from, so
  # it will always list all mutable tests.

  # Order matters: non-ignored rules come first and, among those, mutable
  # rules come first, so that the emitted indices are contiguous from 0.
  my @index_to_rule = sort {
      ($ignored_rule{$a} <=> $ignored_rule{$b}) ||
      ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
      ($a cmp $b)
    } (keys %scores);

  # largest number of mutable, non-ignored hits on any message, plus one
  my $max_hits_per_msg = 0;
  for my $file (0 .. $num_tests - 1) {
    my $hits = grep { (!$ignored_rule{$_}) && $mutable_tests{$_} }
      (thaw_tests($tests_hit[$file]));
    if (($hits + 1) > $max_hits_per_msg) {
      $max_hits_per_msg = $hits + 1;
    }
  }

  for my $i (0 .. $#index_to_rule) {
    my $name = $index_to_rule[$i];
    $rule_to_index{$name} = $i;

    next if $ignored_rule{$name};

    if ($mutable_tests{$name} == 0) {
      # immutable: the range collapses onto the current score
      $range_lo{$name} = $range_hi{$name} = $scores{$name};
    } else {
      $mutable++;
      if ($range_lo{$name} > $range_hi{$name}) {
        ($range_lo{$name}, $range_hi{$name}) =
          ($range_hi{$name}, $range_lo{$name});
      }
      #$range_lo{$name} ||= 0.1;
      #$range_hi{$name} ||= 1.5;

      # no default score found?  start from the middle of the range and let
      # the optimizer adjust it.  this seems to help avoid a load of really
      # good rules getting 1.0 scores
      if ($allrules{$name}->{no_score_found}) {
        $scores{$name} = ($range_hi{$name} + $range_lo{$name}) / 2.0;
      }
    }

    $output .= "." . $i . "\n" .
               "n" . $name . "\n" .
               "b" . $scores{$name} . "\n" .
               "m" . $mutable_tests{$name} . "\n" .
               "l" . $range_lo{$name} . "\n" .
               "h" . $range_hi{$name} . "\n";
    $size++;
  }

  open(my $dat, '>', 'tmp/scores.data')
    or die "Could not write 'tmp/scores.data': $!";
  print {$dat} "N$size\n", "M$mutable\n", # informational only
    $output;
  close($dat)
    or die "Could not finish writing 'tmp/scores.data': $!";

  open(my $out, '>', 'tmp/scores.h')
    or die "Could not write 'tmp/scores.h': $!";
  print {$out} <<"EOT";

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_scores = ${size};
int num_mutable = ${mutable};
unsigned char is_mutable[${size}];
double range_lo[${size}];
double range_hi[${size}];
double bestscores[${size}];
char *score_names[${size}];
double tmp_scores[${size}][2];
unsigned char ny_hit[${mutable}];
unsigned char yn_hit[${mutable}];
double lookup[${mutable}];

/* loadscores() is defined in tests.h */

EOT
  close($out)
    or die "Could not finish writing 'tmp/scores.h': $!";

  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
}

# Writes tmp/tests.h (C declarations followed by the C code stored after
# __DATA__) and tmp/tests.data (one record per distinct message pattern).
# Messages with the same spam flag and the same set of indexed, non-ignored
# hits are collapsed into a single record carrying a repeat count.
#   $_[0]: value to emit as max_hits_per_msg; also the limit that the number
#          of mutable hits per message must stay below.
sub writetests_c {
  my $max_hits_per_msg = $_[0];

  my %uniq_files = ();          # first message of each pattern => record no.
  my %count_keys = ();          # pattern key => number of messages
  my %file_key = ();            # first message of each pattern => pattern key

  for my $file (0 .. $num_tests - 1) {
    my @good_tests = grep {
        length($_) && (!$ignored_rule{$_}) && defined($rule_to_index{$_})
      } (thaw_tests($tests_hit[$file]));

    my @indices = sort { $a <=> $b } (map { $rule_to_index{$_} } @good_tests);
    my $uniq_key = vec($is_spam, $file, 1) . " " . join(" ", @indices);

    if (exists($count_keys{$uniq_key})) {
      $count_keys{$uniq_key}++;
    } else {
      $count_keys{$uniq_key} = 1;
      $file_key{$file} = $uniq_key;
      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
    }
  }

  my $num_nondup = scalar(keys(%uniq_files));

  open(my $top, '>', 'tmp/tests.h')
    or die "Could not write 'tmp/tests.h': $!";
  print {$top} <<"EOT";

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_tests = ${num_tests};
int num_nondup = ${num_nondup};
int num_spam = ${num_spam};
int num_ham = ${num_ham};
int max_hits_per_msg = ${max_hits_per_msg};
unsigned char num_tests_hit[${num_nondup}];
unsigned char is_spam[${num_nondup}];
unsigned short tests_hit[${num_nondup}][${max_hits_per_msg}];
double scores[${num_nondup}];
double tmp_total[${num_nondup}];
int tests_count[${num_nondup}];

EOT
  # append the C loader functions stored after __DATA__
  print {$top} <DATA>;
  close($top)
    or die "Could not finish writing 'tmp/tests.h': $!";

  open(my $dat, '>', 'tmp/tests.data')
    or die "Could not write 'tmp/tests.data': $!";

  foreach my $file (sort { $a <=> $b } (keys %uniq_files)) {
    print {$dat} "." . $uniq_files{$file} . "\n";

    my $out = "s" . vec($is_spam, $file, 1) . "\n";
    my $base_score = 0;
    my $num_tests_hit = 0;

    foreach my $test (thaw_tests($tests_hit[$file])) {
      next if $test eq '';

      # this is not a log-worthy event anymore, since we have a lot
      # of T_ test rules that are ignored during perceptron runs
      # warn "ignored rule $test got a hit in $file!\n";
      next if $ignored_rule{$test};

      if (!defined $rule_to_index{$test}) {
        warn "test with no C index: $test\n";
        next;
      }

      if ($mutable_tests{$test}) {
        $num_tests_hit++;
        $out .= "t" . $rule_to_index{$test} . "\n";

        if ($num_tests_hit >= $max_hits_per_msg) {
          die "Need to increase \$max_hits_per_msg";
        }
      } else {
        $base_score += $scores{$test};
      }
    }

    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
    $out .= "c" . $count_keys{$file_key{$file}} . "\n";

    print {$dat} "n" . $num_tests_hit . "\n" . $out;
  }
  close($dat)
    or die "Could not finish writing 'tmp/tests.data': $!";
}

# Returns 1 if the rule named $_[0] has the tflag $_[1] (matched as a whole
# word, case-insensitively), 0 otherwise.  A rule without tflags has none.
sub _has_tflag {
  my ($rule, $flag) = @_;
  my $tflags = $allrules{$rule}->{tflags};
  return 0 unless defined $tflags;
  return ($tflags =~ /\b\Q$flag\E\b/i) ? 1 : 0;
}

# For a rule flagged "nice", turns a score of exactly 1 into -1, and a
# score of exactly 0.01 on a T_ rule into -0.01.  Other scores are left
# untouched.
sub _negate_nice_default {
  my ($t) = @_;
  return unless _has_tflag($t, 'nice');

  if ($scores{$t} == 1) {
    $scores{$t} = -1;
  } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/)) {
    $scores{$t} = -0.01;
  }
  return;
}

# Reads tmp/ranges.data ("lo hi mutable name" per line) and derives, for
# every rule, whether it is ignored (%ignored_rule), whether it is mutable
# (%mutable_tests) and its score range (%range_lo / %range_hi).  Afterwards
# the scores in %scores are pulled into their ranges.  Dies if
# tmp/ranges.data does not exist or cannot be opened.
sub read_ranges {
  if (!-f 'tmp/ranges.data') {
    die "need to make 'tmp/ranges.data' first";
  }

  # read ranges, and mutableness, from ranges.data.
  open(my $in, '<', 'tmp/ranges.data')
    or die "Could not open file 'tmp/ranges.data': $!";

  while (defined(my $line = <$in>)) {
    next unless $line =~ /^(\S+) (\S+) (\d+) (\S+)$/;
    my ($lo, $hi, $mut, $t) = ($1 + 0, $2 + 0, $3 + 0, $4);

    $range_lo{$t} = $lo;
    $range_hi{$t} = $hi;

    if ($allrules{$t}->{issubrule}) {
      # warn "$t: ignoring, is sub-rule\n";   # no need to warn
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    if ($t =~ /^T_/) {
      # warn "$t: ignoring, is T_ test rule\n";   # no need to warn
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      $range_lo{$t} = 0.01;     # clamp to insignificant range
      $range_hi{$t} = 0.01;
      next;
    }
    if (($range_lo{$t} == $range_hi{$t}) && (!$range_lo{$t})) {
      warn "$t: ignoring, score and range == 0\n";
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }

    $ignored_rule{$t} = 0;

    # mutable only if ranges.data says so, the range is not a single point
    # and the rule is not a userconf rule
    if (!$mut) {
      $mutable_tests{$t} = 0;
    } elsif ($range_lo{$t} == $range_hi{$t}) {
      $mutable_tests{$t} = 0;
    } elsif (_has_tflag($t, 'userconf')) {
      $mutable_tests{$t} = 0;
    } else {
      $mutable_tests{$t} = 1;
    }

    unless ($mutable_tests{$t} || $scores{$t}) {
      warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
      $ignored_rule{$t} = 1;
    }
  }
  close($in);

  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
  foreach my $t (sort keys %allrules) {
    next if ($t eq '_scoreset');
    next if (exists($range_lo{$t}));

    if ($allrules{$t}->{issubrule}) {
      if (!$ignored_rule{$t}) {
        # warn "$t: ignoring, is sub-rule\n";   # no need to warn here
        $ignored_rule{$t} = 1;
      }
      $mutable_tests{$t} = 0;
      next;
    }
    if ($t =~ /^T_/) {
      if (!$ignored_rule{$t}) {
        # warn "$t: ignoring, is T_ test rule\n";   # no need to warn here
        $ignored_rule{$t} = 1;
        $range_lo{$t} = 0.01;   # clamp to insignificant range
        $range_hi{$t} = 0.01;
      }
      $mutable_tests{$t} = 0;
      next;
    }

    $ignored_rule{$t} = 0;

    unless (exists($mutable_tests{$t}) && !_has_tflag($t, 'userconf')) {
      $mutable_tests{$t} = 0;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      if (!$ignored_rule{$t}) {
        warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
        $ignored_rule{$t} = 1;
      }
    }
  }

  # pull the current scores into their ranges
  foreach my $t (keys %range_lo) {
    next if ($ignored_rule{$t});

    if ($mutable_tests{$t}) {
      _negate_nice_default($t);

      # keep mutable scores strictly inside the range
      if ($scores{$t} >= $range_hi{$t}) {
        $scores{$t} = $range_hi{$t} - 0.001;
      } elsif ($scores{$t} <= $range_lo{$t}) {
        $scores{$t} = $range_lo{$t} + 0.001;
      }
    } else {
      if (_has_tflag($t, 'userconf')) {
        next;
      } elsif ($range_lo{$t} == $range_hi{$t}) {
        $scores{$t} = $range_lo{$t};
        next;
      }

      _negate_nice_default($t);

      # immutable scores may sit on the range boundary
      if ($scores{$t} > $range_hi{$t}) {
        $scores{$t} = $range_hi{$t};
      } elsif ($scores{$t} < $range_lo{$t}) {
        $scores{$t} = $range_lo{$t};
      }
    }
  }
}

__DATA__

/* Loads tmp/tests.data into the arrays declared at the top of this file. */
void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];
  int file = 0;
  int tnum = 0;

  if (fin == NULL) {
    perror ("tmp/tests.data");
    exit (1);
  }

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);

    if (cmd == '.') {
      file = arg;
    } else if (cmd == 'n') {
      tnum = 0;
      num_tests_hit[file] = arg;
    } else if (cmd == 's') {
      is_spam[file] = arg;
    } else if (cmd == 'b') {
      scores[file] = argd;
    } else if (cmd == 't') {
      tests_hit[file][tnum] = arg; tnum++;
    } else if (cmd == 'c') {
      tests_count[file] = arg;
    }
  }
  fclose(fin);

  printf ("Read test results for %d messages (%d total).\n", file+1,
          num_tests);
}

/* Loads tmp/scores.data into the arrays declared in scores.h. */
void loadscores (void) {
  FILE *fin = fopen ("tmp/scores.data", "r");
  char buf[256];
  int snum = 0;

  if (fin == NULL) {
    perror ("tmp/scores.data");
    exit (1);
  }

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;
    char *str, *white;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);
    str = buf+1;

    while ((white = strchr (str, '\n')) != NULL) {
      *white = '\0';
    }

    if (cmd == '.') {
      snum = arg;
    } else if (cmd == 'b') {
      bestscores[snum] = argd;
    } else if (cmd == 'l') {
      range_lo[snum] = argd;
    } else if (cmd == 'h') {
      range_hi[snum] = argd;
    } else if (cmd == 'n') {
      score_names[snum] = strdup (str);	/* leaky leak ;) */
    } else if (cmd == 'm') {
      is_mutable[snum] = arg;
    }
  }
  fclose(fin);

  printf ("Read scores for %d tests.\n", num_scores);
}