#!/local/perl586/bin/perl -w
#
# Nightly corpus processor: pulls mass-check logs via rsync, groups them
# by "daterev" (date + SVN revision), and generates hit-frequency report
# files (DETAILS.*, NET.*, etc.) under the ruleqa HTML tree.
#
# settings are located in $HOME/.corpus

use strict;
use Getopt::Long;
use vars qw( $realcorpusdir $opt_override $opt_tag );

GetOptions(
  "tag=s"      => \$opt_tag,
  "dir=s"      => \$realcorpusdir,
  "override=s" => \$opt_override,
);
$opt_override ||= '';
$opt_tag ||= 'n';               # nightly is the default

use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd qw(abs_path);
use POSIX qw(nice strftime);

use constant WEEK => 7*60*60*24;

nice(15);

# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
# the time of day when the mass-check tagging occurs; see
# http://wiki.apache.org/spamassassin/DateRev for more details.
use constant DATEREV_ADJ => - (8 * 60 * 60);

# what's the max age of mail we will accept data from? (in weeks)
# TODO: maybe this should be in ~/.corpus
my $OLDEST_HAM_WEEKS  = 52 * 5;         # 5 years
my $OLDEST_SPAM_WEEKS = 6 * 4;          # 6 months

# ---------------------------------------------------------------------------

my $configuration = "$ENV{HOME}/.corpus";
my %opt;
my %revision = ();
my %filesize = ();
my %dateline = ();
my %mtime = ();
my %logs_by_daterev = ();
my %is_net_daterev = ();
my %time = ();
my @files;
my @tmps = ();
my $skip = '';
my $time_start = time;
my $output_revpath;

&configure;
&init;

my $corpusdir;
my $make_links;
if ($realcorpusdir) {
  print "reading logs from '$realcorpusdir'\n";
  $make_links = 0;
  $corpusdir = $realcorpusdir;  # no need to take copies
}
else {
  $realcorpusdir = $opt{corpus};
  &update_rsync;

  # create a temp dir to hold hard links to the files we're working on.  This
  # is used so that the rsyncd can upload new source files, replacing our work
  # files, without affecting us.  use hard links for speed and efficiency
  $corpusdir = "$opt{tmp}/parse.$$";
  mkdir $corpusdir or die "cannot mkdir $corpusdir";
  push @tmps, $corpusdir;
  print "copying logs from '$realcorpusdir'\n";
  print "using $corpusdir to hold temporary copies\n";
  $make_links = 1;
}

&locate;
&current;       # NOTE(review): was garbled to "&curren;t" by entity decoding
&clean_up;
exit;

# ---------------------------------------------------------------------------

# Read key=value pairs from ~/.corpus into %opt; "--override" pairs
# (pipe-separated) are appended and therefore win.  Rough equivalent of
# shell "source".
sub configure {
  open(C, $configuration) || die "open failed: $configuration: $!\n";
  my $pwd = Cwd::getcwd;

  # add 'override' options
  my @lines = (<C>, split(/\|/, $opt_override));
  foreach $_ (@lines) {
    chomp;
    s/#.*//;
    if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
      my ($key, $val) = ($1, $2);
      $val =~ s/\$PWD/$pwd/gs;  # expand "$PWD" in config values
      $opt{$key} = $val;
    }
  }
  close(C);
}

# Remove all temporary files/dirs created during this run.  Also installed
# as the SIGINT/SIGTERM handler.
sub clean_up {
  chdir "/";
  system "rm -rf $opt{tmp}/*.$$ ".join(' ', @tmps);
}

# Install signal handlers and the environment rsync/TZ settings we rely on.
sub init {
  $SIG{INT} = \&clean_up;
  $SIG{TERM} = \&clean_up;

  $ENV{RSYNC_PASSWORD} = $opt{password};
  $ENV{TIME} = '%e,%U,%S';
  $ENV{TZ} = 'UTC';
}

# Pull the latest *.log mass-check files from the corpus rsync server into
# $realcorpusdir, then touch "rsync.last" as a timestamp marker.
sub update_rsync {
  chdir $realcorpusdir;

  # allow non-running of rsync under some circumstances
  if ($opt{rsync_command}) {
    system $opt{rsync_command};
  }
  else {
    system "rsync -CPcvuzt --timeout=300 $opt{username}" .
           '@rsync.spamassassin.org::corpus/*.log .';
  }

  # this block is no longer required -- we do sensible things with modtime
  # comparisons to work it out!
  if (0 && !$opt{always_update_html}) {
    if (-f "rsync.last") {
      open(FIND, "find . -type f -newer rsync.last |");
      my $files = "";
      while (<FIND>) { $files .= $_; }
      close(FIND);

      if (!$files) {
        print STDERR "no new corpus files\n";
        if (rand(24) > 1) {
          exit 0;
        }
        else {
          print STDERR "updating anyway\n";
        }
      }
    }
  }

  open(RSYNC, "> rsync.last");
  close(RSYNC);
  system "chmod +r *.log";
}

# Scan $realcorpusdir for recent ham/spam logs, parse their "#"-comment
# headers for timestamp and SVN revision, group them into %logs_by_daterev,
# and (optionally) hard-link them into the working dir.
sub locate {
  opendir(CORPUS, $realcorpusdir);
  @files = sort readdir(CORPUS);
  closedir(CORPUS);

  # only consider logs modified within the last 10 days
  @files = grep {
      /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/
        && -f "$realcorpusdir/$_" && -M _ < 10
    } @files;

  foreach my $file (@files) {
    my $tag = 0;
    my $headers = '';

    open(FILE, "$realcorpusdir/$file")
        or warn "cannot read $realcorpusdir/$file";
    while (my $line = <FILE>) {
      last if $line !~ /^#/;    # headers are the leading comment block only
      $headers .= $line;

      if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
        my ($datepre, $hh, $datepost) = ($1, $2, $3);
        my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
                        GMT => 1, PREFER_PAST => 1);
        $time{$file} = $timet;
      }
      elsif ($line =~ m/^# Date:\s*(\S+)/) {
        # a better way to do the above.  TODO: parse it instead
        $dateline{$file} = $1;
      }
      elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
        $revision{$file} = $1;
      }
    }
    close(FILE);

    my @s = stat("$realcorpusdir/$file");
    $filesize{$file} = $s[7];
    $mtime{$file} = $s[9];

    if (!defined $time{$file}) {
      warn "$realcorpusdir/$file: no time found, ignored\n";
      next;
    }
    if (!defined $revision{$file}) {
      warn "$realcorpusdir/$file: no revision found, ignored\n";
      next;
    }
    if ($revision{$file} eq 'unknown') {
      warn "$realcorpusdir/$file: not tagged with a revision, ignored\n";
      next;
    }

    my $daterev = mk_daterev($time{$file}, $revision{$file}, $opt_tag);
    $logs_by_daterev{$daterev} ||= [ ];
    push (@{$logs_by_daterev{$daterev}}, $file);

    # "-net-" logs are from --net mass-checks (scoreset 1)
    if ($file =~ /-net-/) {
      $is_net_daterev{$daterev} = 1;
      print "$realcorpusdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
    }
    else {
      print "$realcorpusdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
    }

    get_rulemetadata_for_revision($daterev, $revision{$file});

    if ($make_links) {
      link ("$realcorpusdir/$file", "$corpusdir/$file")
          or die "cannot ln $realcorpusdir/$file to $corpusdir";
    }
  }
}

# sort() comparator for hit-frequencies output lines: orders by rule name
# (ignoring a leading +/-) then by the ":user"/":agebucket" suffix, but
# forces the OVERALL / "(all messages)" summary lines to the top.
sub sort_all {
  my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
  my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
  $a1 =~ s/^[\+\-]//;
  $b1 =~ s/^[\+\-]//;

  my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));

  if ($a1 =~ /^OVERALL/)                  { $n -= 1000; }
  elsif ($a1 =~ /^\(all messages\)/)      { $n -= 100; }
  elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; }
  if ($b1 =~ /^OVERALL/)                  { $n += 1000; }
  elsif ($b1 =~ /^\(all messages\)/)      { $n += 100; }
  elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; }

  return $n;
}

# Copy log lines from @$fileary into $outname, keeping only lines whose
# "time=NNN" token falls between $after and $before (both in weeks before
# now; either may be undef for an open bound).
sub time_filter_fileset {
  my ($fileary, $outname, $after, $before) = @_;

  my $timet_before = (defined $before ?
                  ($time_start - ($before * WEEK)) : $time_start+1);
  my $timet_after  = (defined $after ?
                  ($time_start - ($after * WEEK)) : 0);

  open(TMP, "> $outname") or warn "cannot write $outname";
  for my $file (@{$fileary}) {
    open(IN, $file) or warn "cannot read $file";
    while (<IN>) {
      next unless /\btime=(\d+)/;
      next if ($1 < $timet_after || $1 > $timet_before);
      print TMP;
    }
    close IN;
  }
  close TMP or warn "failed to close $outname";
}

# Iterate over the configured output classes ("DETAILS.new", "NET.all", ...)
# and generate each one for every located daterev.
sub current {
  my $classes = $opt{output_classes};
  $classes ||= "DETAILS.new DETAILS.all DETAILS.age NET.new NET.all NET.age";

  foreach my $entry (split(' ', $classes)) {
    $entry =~ /^(\S+)\.(\S+)$/;
    my $class = $1;
    my $age = $2;

    if (!$age) {
      warn "no age in $entry"; next;
    }
    if ($class eq 'HTML') {
      warn "class HTML in $entry obsolete, ignored"; next;
    }

    foreach my $daterev (sort keys %logs_by_daterev) {
      my $rev;
      if ($daterev !~ /\/r(\d+)/) {
        warn "bad daterev: $daterev"; next;
      }
      $rev = $1;

      if ($class eq "NET") {
        # NET reports only make sense for daterevs with --net logs
        next unless $is_net_daterev{$daterev};
      }

      gen_class ($daterev, $rev, $class, $age);
    }
  }
}

# Generate one report file, $opt{html}/$daterev/$class.$age, from the ham
# and spam logs grouped under $daterev.  Skips the work make-style if the
# target is already fresher than all of its source logs.
sub gen_class {
  my ($daterev, $rev, $class, $age) = @_;

  print STDERR "\ngenerating: $opt{html}/$daterev/$class.$age\n";

  return if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);

  my @ham  = grep { /^ham/ }  @{$logs_by_daterev{$daterev}};
  my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}};

  print STDERR "input h: " . join(' ', @ham) . "\n";
  print STDERR "input s: " . join(' ', @spam) . "\n";

  chdir $corpusdir;

  # net vs. local
  if ($class eq "NET") {
    @ham  = grep { /-net-/ } @ham;
    @spam = grep { /-net-/ } @spam;
  }

  # age
  if ($age =~ /(\d+)day/) {
    my $mtime = $1;
    @ham  = grep { -M $_ < $mtime } @ham;
    @spam = grep { -M $_ < $mtime } @spam;
  }

  print STDERR "selected h: " . join(' ', @ham) . "\n";
  print STDERR "selected s: " . join(' ', @spam) . "\n";

  # we cannot continue if we have no files that match the criteria...
  # demand at least 1 ham and 1 spam file
  if (scalar @spam <= 0 || scalar @ham <= 0) {
    warn "not enough files found matching criteria ($daterev $class $age)\n";
    return;
  }

  my $dir = create_outputdir($daterev);
  my $fname = "$dir/$class.$age";

  # now, if the target file already exists, check to see if it's newer
  # than all the sources, make-style; if not, don't re-create it
  if (-f $fname) {
    my $targetfreshness = (-M $fname);
    my $needsrebuild = 0;
    foreach my $srcfile (@spam, @ham) {
      my $srcfreshness = (-M $srcfile);
      if ($targetfreshness > $srcfreshness) {   # src is fresher
        print "$fname is older than $srcfile: ".
                "$targetfreshness > $srcfreshness\n";
        $needsrebuild = 1;
        last;
      }
    }
    if (!$needsrebuild) {
      print "existing: $fname, fresher than sources\n";
      return;
    }
  }

  my $when = scalar localtime time;
  print qq{creating: $fname
  started $when...
};

  my $bytes = 0;

  if ($class eq 'LOGS') {
    foreach my $f (@ham, @spam) {
      $f =~ s/[^-\._A-Za-z0-9]+/_/gs;   # sanitize!
      my $zf = "$fname-$f.gz";
      system("gzip -c < $f > $zf.$$");
      if ($? >> 8 != 0) {
        warn "gzip -c < $f > $zf.$$ failed";
      }
      rename("$zf.$$", $zf) or warn "cannot rename $zf.$$ to $zf";
      $bytes += (-s $zf);
    }
  }
  else {
    my $tmpfname = "$fname.$$";
    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
    print OUT "# ham results used for $daterev $class $age: " .
                                      join(" ", @ham) . "\n";
    print OUT "# spam results used for $daterev $class $age: " .
                                      join(" ", @spam) . "\n";
    print OUT "# ".log_metadata_xml($daterev, @ham, @spam)."\n";
    push (@tmps, abs_path($tmpfname));

    my $flags = "";
    $flags = "-t net -s 1" if $class eq "NET";
    $flags = "-o" if $class eq "OVERLAP";
    $flags = "-S" if $class eq "SCOREMAP";

    if ($opt{rules_dir}) {
      $flags .= " -c '$opt{rules_dir}'";
    }

    # are we analyzing --net mass-check logs?  if so, use scoreset 1
    if (join(" ", @ham) =~ /-net-/) {
      $flags .= " -s 1" if $class eq "NET";
    }

    # catch an odd error condition, where hit-frequencies creates output
    # with no log lines included at all
    my $no_msgs = 0;

    if ($age eq "all") {
      # per-user breakdown: pair each user's ham log with their spam log,
      # run hit-frequencies per user, then once more on the combined logs
      my %spam;
      my %ham;
      my @output;
      for my $file (@spam) {
        $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
      }
      for my $file (@ham) {
        $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
      }

      unlink "$opt{tmp}/ham.log.$$";
      unlink "$opt{tmp}/spam.log.$$";

      if (scalar keys %spam <= 0 || scalar keys %ham <= 0) {
        warn "no files found for $class.$age";
        return;
      }

      for my $user (sort keys %spam) {
        next unless $ham{$user};
        time_filter_fileset([ "$corpusdir/$ham{$user}" ],
                "$opt{tmp}/ham.log.$$", $OLDEST_HAM_WEEKS, undef);
        time_filter_fileset([ "$corpusdir/$spam{$user}" ],
                "$opt{tmp}/spam.log.$$", $OLDEST_SPAM_WEEKS, undef);

        start_freqs($rev, "$flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$");
        while (<FREQS>) {
          chomp;
          push @output, "$_:$user\n";
        }
        close(FREQS);

        system("cat $opt{tmp}/ham.log.$$ >> $opt{tmp}/hamall.log.$$");
        system("cat $opt{tmp}/spam.log.$$ >> $opt{tmp}/spamall.log.$$");
      }

      if (-z "$opt{tmp}/hamall.log.$$" && -z "$opt{tmp}/spamall.log.$$") {
        warn "time_filter_fileset() returned empty logs. not creating freqs!";
        return;     # we'll try again later
      }

      start_freqs($rev,
            "$flags $opt{tmp}/spamall.log.$$ $opt{tmp}/hamall.log.$$");
      while (<FREQS>) {
        /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_msgs = 1;
        push @output, $_;
      }
      close(FREQS);

      for (sort sort_all @output) {
        print OUT;
      }
    }
    elsif ($age eq "age") {
      # bucket by message age: 0-1, 1-2, 2-3, 3-6 weeks
      my @output;
      for my $which (("0-1", "1-2", "2-3", "3-6")) {
        my ($before, $after) = split(/-/, $which);

        # get and filter logs
        chdir $corpusdir;
        time_filter_fileset(\@ham, "$opt{tmp}/ham.log.$$", $after, $before);
        time_filter_fileset(\@spam, "$opt{tmp}/spam.log.$$", $after, $before);

        # print out by age
        start_freqs($rev, "$flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$");
        while (<FREQS>) {
          chomp;
          push @output, "$_:$which\n";
        }
        close(FREQS);
      }
      for (sort sort_all @output) {
        print OUT;
      }
    }
    elsif (@ham && @spam) {
      # get logs
      time_filter_fileset(\@ham, "$opt{tmp}/ham.log.$$",
                $OLDEST_HAM_WEEKS, undef);
      time_filter_fileset(\@spam, "$opt{tmp}/spam.log.$$",
                $OLDEST_SPAM_WEEKS, undef);

      if (-z "$opt{tmp}/ham.log.$$" && -z "$opt{tmp}/spam.log.$$") {
        warn "time_filter_fileset() returned empty logs. not creating freqs!";
        return;     # we'll try again later
      }

      start_freqs($rev, "$flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$");
      while (<FREQS>) {
        /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_msgs = 1;
        print(OUT);
      }
      close(FREQS);
    }

    $bytes = (-s OUT);
    close(OUT);

    if ($no_msgs) {
      warn "ERROR: no data in freqs! aborting, leaving tmp file as $tmpfname";
      return;
    }

    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";

    # compress for certain classes
    if ($class eq "OVERLAP") {
      $fname =~ s/'//gs;
      system ("gzip '$fname'");   # takes care of keeping the original
                                  # around so we don't have to
      if ($? >> 8 != 0) {
        warn "gzip '$fname' failed";
      }
    }
  }

  $when = scalar localtime time;
  print qq{created: $bytes bytes, finished at $when
URL: $opt{ruleqa_url}$output_revpath
};
}

# Build a daterev string, e.g. "20060429/r239832-n", from an epoch time,
# SVN revision and tag letter.  Times are shifted back 8 hours so the day
# boundary matches the 0800 UTC tagging time (see DATEREV_ADJ).
sub mk_daterev {
  my ($timet, $rev, $tag) = @_;
  return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
}

# Ensure $opt{html}/$revpath exists (mode from $opt{html_mode}) and record
# the "looks nicer" dash form in the global $output_revpath.
sub create_outputdir {
  my ($revpath) = @_;
  my $dir = $opt{html} .'/'. $revpath;
  # print "output dir: $dir\n";
  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }
  $output_revpath = $revpath;     # set the global
  $output_revpath =~ s/\//-/;     # looks nicer
  return $dir;
}

# Return a one-line XML fragment describing each log file's metadata.
# NOTE(review): the element names below were reconstructed after the
# original markup was stripped by HTML-entity mangling -- verify against
# the gen_info_xml script that consumes this.
sub log_metadata_xml {
  my ($daterev, @files) = @_;
  my $str = '';

  # this is extracted into the info.xml file later by the gen_info_xml script
  foreach my $f (@files) {
    $str .= qq{
      <mclogmd file="$f">
        <daterev>$daterev</daterev>
        <rev>$revision{$f}</rev>
        <fsize>$filesize{$f}</fsize>
        <mcstartdate>$dateline{$f}</mcstartdate>
        <mtime>$mtime{$f}</mtime>
      </mclogmd>
    };
  }
  $str =~ s/\s+/ /gs;       # on a single line please
  return '<mclogmds>'.$str.'</mclogmds>';
}

# Ensure $opt{html}/rulemetadata/$rev exists and return its path.
sub create_rulemetadata_dir {
  my $rev = shift;
  my $dir = "$opt{html}/rulemetadata/$rev";
  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }
  return $dir;
}

# Run the get-rulemetadata-for-revision helper for $rev, writing into the
# rulemetadata output dir.
sub get_rulemetadata_for_revision {
  my ($daterev, $rev) = @_;
  my $dir = create_rulemetadata_dir($rev);

  # argh.  this is silly; ~bbmass/.corpus specifies "$PWD" in its
  # "tree" path, so we have to ensure we're in the 'masses' dir
  # for this to work!
  chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";

  my $cmd = "$opt{tree}/masses/rule-qa/get-rulemetadata-for-revision ".
        "--rev=$rev --outputdir='$dir'";
  system($cmd);
  if ($? >> 8 != 0) {
    warn "'$cmd' failed";
  }
}

# Check out / build the rules tree for r$rev under a per-revision work dir,
# then open the FREQS filehandle onto a running "./hit-frequencies $args"
# pipe for the caller to read.
sub start_freqs {
  my ($rev, $args) = @_;

  $rev ||= 'HEAD';
  my $hfdir = "$opt{tmp}/hfdir/r$rev";
  print "setting up hit-frequencies for r$rev in $hfdir\n";

  (-d "$opt{tmp}/hfdir") or system("mkdir $opt{tmp}/hfdir");
  if (!-d $hfdir) {
    system ("cp -pr $opt{tree} $hfdir");
    ($?>>8 != 0) and die "cp $opt{tree}/masses $hfdir failed";
  }

  chdir "$hfdir" or die "cannot chdir $hfdir";
  system("svn up -r$rev; svn up -r$rev rulesrc");

  # ensure these are rebuilt
  system("rm -f rules/70_sandbox.cf rules/72_active.cf");

  # do this twice in case Makefile.PL is rebuilt
  system ("( make build_rules; make build_rules ) < /dev/null");

  chdir "$hfdir/masses" or die "cannot chdir $hfdir/masses";
  open (FREQS, "./hit-frequencies -TxpaP $args |")
        or die "cannot run ./hit-frequencies $args |";
}