#!/usr/bin/perl -w # settings are located in $HOME/.corpus use strict; use POSIX qw(nice); use constant MONTH => 60*60*24*30; nice(15); my $configuration = "$ENV{HOME}/.corpus"; my %opt; my $revision = "unknown"; my %revision; my @files; my $skip = ''; my $time_start = time; &configure; &version; &init; &update; &locate; &rename; ¤t; &clean_up; sub version { my $line; if (open(TESTING, "$opt{tree}/rules/70_testing.cf")) { chomp($line = ); if ($line =~ m/^#.*Rev(?:ision)?:\s*(\S+).*/) { $revision = $1; } close(TESTING); } } sub configure { # does rough equivalent of source open(C, $configuration) || die "open failed: $configuration: $!\n"; while() { chomp; s/#.*//; if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) { $opt{$1} = $2; } } close(C); } sub clean_up { system "rm -f $opt{tmp}/*.$$"; } sub init { $SIG{INT} = \&clean_up; $SIG{TERM} = \&clean_up; $ENV{RSYNC_PASSWORD} = $opt{password}; $ENV{TIME} = '%e,%U,%S'; $ENV{TZ} = 'UTC'; } sub update { chdir $opt{corpus}; # allow non-running of rsync under some circumstances if ($opt{rsync_command}) { system $opt{rsync_command}; } else { system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .'; } if (-f "rsync.last") { open(FIND, "find . -type f -newer rsync.last |"); my $files = ""; while() { $files .= $_; } close(FIND); if (! $files) { print STDERR "no new corpus files\n"; if (!$opt{always_update_html} && rand(24) > 1) { exit 0; } else { print STDERR "updating anyway\n"; } } } open(RSYNC, "> rsync.last"); close(RSYNC); system "chmod +r *.log"; } sub locate { chdir "$opt{tree}/masses"; opendir(CORPUS, $opt{corpus}); @files = sort readdir(CORPUS); closedir(CORPUS); @files = grep { /^(?:spam|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files; @files = grep { my $time = 0; my $tag = 0; $revision{$_} = "unknown"; open(FILE, "$opt{corpus}/$_"); while (my $line = ) { last if $line !~ /^#/; $time++ if $line =~ /\b(?!08)\d\d:\d\d:\d\d\b/; $revision{$_} = $1 if $line =~ m/(?:CVS|SVN) revision:\s*(\S+)/; } close(FILE); if (!$time) { $skip .= "# skipped $_: time is between 0800 UTC and 0900 UTC\n"; } $time; } @files; } sub rename { use File::Copy qw(move); my $hour = (gmtime($time_start))[2]; if ($hour == 9) { chdir $opt{html}; opendir(HTML, $opt{html}); my @html = readdir(HTML); closedir(HTML); @html = grep { -f } @html; for (@html) { move($_, "last/$_"); } } } sub sort_all { my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || '')); if ($a1 =~ /^OVERALL/) { $n -= 1000; } elsif ($a1 =~ /^\(all messages\)/) { $n -= 100; } elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; } if ($b1 =~ /^OVERALL/) { $n += 1000; } elsif ($b1 =~ /^\(all messages\)/) { $n += 100; } elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; } return $n; } sub time_filter { my ($after, $before) = @_; if (/time=(\d+)/) { return (($time_start - $1 >= MONTH * $after) && ($time_start - $1 < MONTH * $before)); } return 0; } sub current { my $classes = $opt{output_classes}; $classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age"; foreach my $entry (split(' ', $classes)) { $entry =~ /^(\S+)\.(\S+)$/; my $class = $1; my $age = $2; if (!$age) { warn "no age in $entry"; next; } gen_class ($class, $age); } } sub gen_class { my ($class, $age) = @_; print STDERR "generating $class.$age\n"; next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/); my @ham = grep { /^ham/ } @files; my @spam = grep { /^spam/ } @files; print STDERR "ham: " . join(' ', @ham) . "\n"; print STDERR "spam: " . join(' ', @spam) . "\n"; chdir $opt{corpus}; # net vs. local if ($class eq "NET") { @ham = grep { /-net-/ } @ham; @spam = grep { /-net-/ } @spam; print STDERR "ham: " . join(' ', @ham) . "\n"; print STDERR "spam: " . join(' ', @spam) . "\n"; } else { # if both net and local exist, use newer my %spam; my %ham; for my $file (@spam) { $spam{$1}++ if ($file =~ m/-(\w+)\.log$/); } for my $file (@ham) { $ham{$1}++ if ($file =~ m/-(\w+)\.log$/); } while (my ($user, $count) = each %ham) { if ($count > 1) { my $nightly = "ham-$user.log"; my $weekly = "ham-net-$user.log"; if ($revision{$nightly} >= $revision{$weekly}) { @ham = grep { $_ ne $weekly } @ham; } else { @ham = grep { $_ ne $nightly } @ham; } } } while (my ($user, $count) = each %spam) { if ($count > 1) { my $nightly = "spam-$user.log"; my $weekly = "spam-net-$user.log"; if ($revision{$nightly} >= $revision{$weekly}) { @spam = grep { $_ ne $weekly } @spam; } else { @spam = grep { $_ ne $nightly } @spam; } } } print STDERR "ham: " . join(' ', @ham) . "\n"; print STDERR "spam: " . join(' ', @spam) . "\n"; } # age if ($class eq "NET" && $age ne "7day") { @ham = grep { -M "$_" < 10 } @ham; @spam = grep { -M "$_" < 10 } @spam; # find most recent CVS revision my $wanted = 0.0; for (@spam, @ham) { $wanted = $revision{$_} if ($revision{$_} > $wanted); } @spam = grep { $revision{$_} eq $wanted } @spam; @ham = grep { $revision{$_} eq $wanted } @ham; print STDERR "ham: " . join(' ', @ham) . "\n"; print STDERR "spam: " . join(' ', @spam) . "\n"; } elsif ($age =~ /^(?:new|all|age)$/) { @ham = grep { -M "$_" < -M $opt{tagtime} } @ham; @spam = grep { -M "$_" < -M $opt{tagtime} } @spam; if (!$opt{ignore_revisions}) { @ham = grep { $revision{$_} eq $revision } @ham; @spam = grep { $revision{$_} eq $revision } @spam; } print STDERR "ham: " . join(' ', @ham) . "\n"; print STDERR "spam: " . join(' ', @spam) . "\n"; } elsif ($age =~ /(\d+)day/) { my $mtime = $1; @ham = grep { -M "$_" < $mtime } @ham; @spam = grep { -M "$_" < $mtime } @spam; print STDERR "ham: " . join(' ', @ham) . "\n"; print STDERR "spam: " . join(' ', @spam) . "\n"; } open(OUT, "> $opt{html}/$class.$age"); print OUT "# ham results used: " . join(" ", @ham) . "\n"; print OUT "# spam results used: " . join(" ", @spam) . "\n"; for (@ham) { print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision; } for (@spam) { print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision; } my $flags = ""; $flags = "-t net -s 1" if $class eq "NET"; $flags = "-M HTML_MESSAGE" if $class eq "HTML"; $flags = "-o" if $class eq "OVERLAP"; if ($age eq "all") { my %spam; my %ham; my @output; for my $file (@spam) { $spam{$1} = $file if ($file =~ m/-(\w+)\.log$/); } for my $file (@ham) { $ham{$1} = $file if ($file =~ m/-(\w+)\.log$/); } unlink "$opt{tmp}/ham.log.$$"; unlink "$opt{tmp}/spam.log.$$"; next unless (scalar keys %spam && scalar keys %ham); for my $user (sort keys %spam) { next unless defined $ham{$user}; chdir "$opt{tree}/masses"; system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$"); system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$"); open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |"); while() { chomp; push @output, "$_:$user\n"; } close(IN); } open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); while() { push @output, $_; } close(IN); for (sort sort_all @output) { print OUT $_; } } elsif ($age eq "age") { my @output; for my $which (("0-1", "1-3", "3-6")) { my ($after, $before) = split(/-/, $which); # get and filter logs chdir $opt{corpus}; for my $type (("ham", "spam")) { open(TMP, "> $opt{tmp}/$type.log.$$"); my @array = ($type eq "ham") ? @ham : @spam; for my $file (@array) { open(IN, $file); while () { print TMP $_ if time_filter($after, $before); } close(IN); } close (TMP); } # print out by age chdir "$opt{tree}/masses"; open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); while() { chomp; push @output, "$_:$which\n"; } close(IN); } for (sort sort_all @output) { print OUT $_; } } elsif (@ham && @spam) { # get logs system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$"); system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$"); chdir "$opt{tree}/masses"; open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); while() { print(OUT); } close(IN); } close(OUT); }