#!/usr/bin/perl -w # $HOME/.corpus should contain settings for these values: # # corpus=/home/corpus/corpus # tree=/home/corpus/cvs/spamassassin # tmp=/home/corpus/tmp # tagtime=/home/corpus/log/tagtime # html=/home/html/root/users/corpus # username=joe # password=xyzzy use strict; use POSIX qw(nice); nice(15); my $configuration = "$ENV{HOME}/.corpus"; my %opt; my $revision = "unknown"; my %revision; my @files; my $skip = ''; &configure; &version; &init; &update; &locate; &rename; ¤t; &clean_up; sub version { my $line; if (open(TESTING, "$opt{tree}/rules/70_cvs_rules_under_test.cf")) { chomp($line = ); if ($line =~ m/.*Revision:\s*(\S+).*/) { $revision = $1; } close(TESTING); } } sub configure { # does rough equivalent of source open(C, $configuration) || die "open failed: $configuration: $!\n"; while() { chomp; s/#.*//; if (/^(.*?)\s*=\s*(.*)/) { $opt{$1} = $2; } } close(C); } sub clean_up { system "rm -f $opt{tmp}/*.$$"; } sub init { $SIG{INT} = \&clean_up; $SIG{TERM} = \&clean_up; $ENV{RSYNC_PASSWORD} = $opt{password}; $ENV{TIME} = '%e,%U,%S'; $ENV{TZ} = 'UTC'; } sub update { chdir $opt{corpus}; system "rsync -CPcvuzbt --timeout=60 $opt{username}" . '@rsync.spamassassin.org::corpus/* .'; } sub locate { chdir "$opt{tree}/masses"; opendir(CORPUS, $opt{corpus}); @files = sort readdir(CORPUS); closedir(CORPUS); @files = grep { /^(?:spam|nonspam|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files; @files = grep { my $time = 0; my $tag = 0; $revision{$_} = "unknown"; open(FILE, "$opt{corpus}/$_"); while (my $line = ) { last if $line !~ /^#/; $time++ if $line =~ /\b(?!08)\d\d:\d\d:\d\d\b/; $tag++ if $line =~ /CURRENT_CORPORA_SUBMIT_VERSION/; $tag++ if $line =~ /CURRENT_CORPORA_WEEKLY_VERSION/ && /-net-/; $revision{$_} = $1 if $line =~ m/CVS revision:\s*(\S+)/; } close(FILE); if (!$time) { $skip .= "# skipped $_: time is between 0800 UTC and 0900 UTC\n"; } if (!$tag) { $skip .= "# skipped $_: tag not CURRENT_CORPORA_SUBMIT_VERSION\n"; } ($time && $tag); } @files; } sub rename { use File::Copy qw(move); my $hour = (gmtime(time))[2]; if ($hour == 9) { chdir $opt{html}; opendir(HTML, $opt{html}); my @html = readdir(HTML); closedir(HTML); @html = grep { -f } @html; for (@html) { move($_, "last/$_"); } } } sub sort_all { my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); $a2 ||= ''; $b2 ||= ''; my $n = ($a1 cmp $b1) || ($a2 cmp $b2); $n -= 1000 if $a =~ /^OVERALL/; $n += 1000 if $b =~ /^OVERALL/; $n -= 100 if $a1 =~ /^\(all messages\)/; $n += 100 if $b1 =~ /^\(all messages\)/; $n -= 10 if $a1 =~ /^\(all messages as \%\)/; $n += 10 if $b1 =~ /^\(all messages as \%\)/; return $n; } sub current { for my $class ("DETAILS", "HTML", "NET") { for my $age ("new", "all", "1day", "2day", "7day") { my @ham = grep { /^(?:nonspam|ham)/ } @files; my @spam = grep { /^spam/ } @files; chdir $opt{corpus}; next if ($class eq "NET" && $age !~ /^(?:new|all|7day)$/); # net vs. local my @ham_net = grep { /-net-/ } @ham; my @spam_net = grep { /-net-/ } @spam; if ($class eq "NET") { @ham = @ham_net; @spam = @spam_net; } else { # remove duplicates for my $net (@ham_net) { @ham = grep { $_ ne $net } @ham; } for my $net (@spam_net) { @spam = grep { $_ ne $net } @spam; } } # age if ($class eq "NET" && $age ne "7day") { @ham = grep { -M "$_" < 10 } @ham; @spam = grep { -M "$_" < 10 } @spam; # find most recent CVS revision my $wanted = 0.0; for (@spam, @ham) { $wanted = $revision{$_} if ($revision{$_} > $wanted); } @spam = grep { $revision{$_} eq $wanted } @spam; @ham = grep { $revision{$_} eq $wanted } @ham; } elsif ($age =~ /^(?:new|all)$/) { @ham = grep { -M "$_" < -M $opt{tagtime} } @ham; @spam = grep { -M "$_" < -M $opt{tagtime} } @spam; @ham = grep { $revision{$_} eq $revision } @ham; @spam = grep { $revision{$_} eq $revision } @spam; } elsif ($age =~ /(\d+)day/) { my $mtime = $1; @ham = grep { -M "$_" < $mtime } @ham; @spam = grep { -M "$_" < $mtime } @spam; } open(OUT, "> $opt{html}/$class.$age"); print OUT "# ham results used: " . join(" ", @ham) . "\n"; print OUT "# spam results used: " . join(" ", @spam) . "\n"; for (@ham) { print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision; } for (@spam) { print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision; } my $flags = ""; $flags = "-t net -s 1" if $class eq "NET"; $flags = "-M HTML_MESSAGE" if $class eq "HTML"; if ($age eq "all") { my %spam; my %ham; my @output; for my $file (@spam) { $spam{$1} = $file if ($file =~ m/-(\w+)\.log$/); } for my $file (@ham) { $ham{$1} = $file if ($file =~ m/-(\w+)\.log$/); } unlink "$opt{tmp}/ham.log.$$"; unlink "$opt{tmp}/spam.log.$$"; next unless (scalar keys %spam && scalar keys %ham); for my $user (sort keys %spam) { next unless defined $ham{$user}; chdir "$opt{tree}/masses"; system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$"); system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$"); open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |"); while() { chomp; push @output, "$_:$user\n"; } close(IN); } open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); while() { push @output, $_; } close(IN); for (sort sort_all @output) { print OUT $_; } } elsif (@ham && @spam) { # get logs system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$"); system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$"); chdir "$opt{tree}/masses"; open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); while() { print(OUT); } close(IN); } close(OUT); } } }