#!/local/perl586/bin/perl -w

# settings are located in $HOME/.corpus

use strict;
use Getopt::Long;
use vars qw( $corpusdir $opt_override $opt_tag );

GetOptions(
  "tag=s" => \$opt_tag,
  "dir=s" => \$corpusdir,
  "override=s" => \$opt_override,
);
$opt_override ||= '';
$opt_tag ||= 'n';       # nightly is the default

use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd;
use POSIX qw(nice strftime);

use constant WEEK => 60*60*24*7;
nice(15);

# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just
# before the time of day when the mass-check tagging occurs; see
# http://wiki.apache.org/spamassassin/DateRev for more details.
use constant DATEREV_ADJ => - (8 * 60 * 60);

my $configuration = "$ENV{HOME}/.corpus";
my %opt;
my %revision = ();
my %filesize = ();
my %dateline = ();
my %mtime = ();
my %logs_by_daterev = ();
my %is_net_daterev = ();
my %time = ();
my @files;
my @tmps = ();
my $skip = '';
my $time_start = time;
my $output_revpath;

&configure;
&init;

if ($corpusdir) {
  print "reading logs from '$corpusdir'\n";
}
else {
  $corpusdir = $opt{corpus};
  &update_rsync;
}

&locate;
&current;
&clean_up;

sub configure {
  # does a rough equivalent of shell "source" on the settings file
  open(C, $configuration) || die "open failed: $configuration: $!\n";
  my $pwd = getcwd;

  # add 'override' options
  my @lines = (<C>, split(/\|/, $opt_override));

  foreach $_ (@lines) {
    chomp;
    s/#.*//;
    if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
      my ($key, $val) = ($1, $2);
      $val =~ s/\$PWD/$pwd/gs;
      $opt{$key} = $val;
    }
  }
  close(C);
}

sub clean_up {
  system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps);
}

sub init {
  $SIG{INT} = \&clean_up;
  $SIG{TERM} = \&clean_up;
  $ENV{RSYNC_PASSWORD} = $opt{password};
  $ENV{TIME} = '%e,%U,%S';
  $ENV{TZ} = 'UTC';
}

sub update_rsync {
  chdir $corpusdir;

  # allow non-running of rsync under some circumstances
  if ($opt{rsync_command}) {
    system $opt{rsync_command};
  }
  else {
    system "rsync -CPcvuzt --timeout=300 $opt{username}" .
           '@rsync.spamassassin.org::corpus/*.log .';
  }

  # this block is no longer required -- we do sensible things with modtime
  # comparisons to work it out!
  if (0 && !$opt{always_update_html}) {
    if (-f "rsync.last") {
      open(FIND, "find . -type f -newer rsync.last |");
      my $files = "";
      while (<FIND>) {
        $files .= $_;
      }
      close(FIND);

      if (!$files) {
        print STDERR "no new corpus files\n";
        if (rand(24) > 1) {
          exit 0;
        }
        else {
          print STDERR "updating anyway\n";
        }
      }
    }
  }

  open(RSYNC, "> rsync.last");
  close(RSYNC);

  system "chmod +r *.log";
}

sub locate {
  opendir(CORPUS, $corpusdir);
  @files = sort readdir(CORPUS);
  closedir(CORPUS);

  @files = grep {
    /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/
      && -f "$corpusdir/$_" && -M _ < 10
  } @files;

  foreach my $file (@files) {
    my $tag = 0;
    my $headers = '';

    open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
    while (my $line = <FILE>) {
      last if $line !~ /^#/;
      $headers .= $line;

      if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
        my ($datepre, $hh, $datepost) = ($1, $2, $3);
        my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
                                        GMT => 1, PREFER_PAST => 1);
        $time{$file} = $timet;
      }
      elsif ($line =~ m/^# Date:\s*(\S+)/) {
        # a better way to do the above.
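        # (for context: a mass-check log header carries lines along the
        #  lines of "# Date: 2006-04-29T08:00:00Z" and
        #  "# SVN revision: 239832" -- illustrative values echoing the
        #  daterev example near the top of this script; the regexes in
        #  this loop simply capture whatever token follows the field name.)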
        # TODO: parse it instead
        $dateline{$file} = $1;
      }
      elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
        $revision{$file} = $1;
      }
    }
    close(FILE);

    my @s = stat("$corpusdir/$file");
    $filesize{$file} = $s[7];
    $mtime{$file} = $s[9];

    if (!defined $time{$file}) {
      warn "$corpusdir/$file: no time found, ignored\n";
      next;
    }
    if (!defined $revision{$file}) {
      warn "$corpusdir/$file: no revision found, ignored\n";
      next;
    }
    if ($revision{$file} eq 'unknown') {
      warn "$corpusdir/$file: not tagged with a revision, ignored\n";
      next;
    }

    my $daterev = mk_daterev($time{$file}, $revision{$file}, $opt_tag);
    $logs_by_daterev{$daterev} ||= [ ];
    push (@{$logs_by_daterev{$daterev}}, $file);

    if ($file =~ /-net-/) {
      $is_net_daterev{$daterev} = 1;
      print "$corpusdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
    }
    else {
      print "$corpusdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
    }

    get_rulemetadata_for_revision($daterev, $revision{$file});
  }
}

# sort comparator for hit-frequencies output lines: order by rule name
# (and optional ":suffix"), floating the OVERALL and "(all messages)"
# summary lines to the top
sub sort_all {
  my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
  my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);

  $a1 =~ s/^[\+\-]//;
  $b1 =~ s/^[\+\-]//;

  my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));

  if ($a1 =~ /^OVERALL/)                   { $n -= 1000; }
  elsif ($a1 =~ /^\(all messages\)/)       { $n -= 100; }
  elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; }
  if ($b1 =~ /^OVERALL/)                   { $n += 1000; }
  elsif ($b1 =~ /^\(all messages\)/)       { $n += 100; }
  elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; }

  return $n;
}

# true if the current log line's "time=" field falls between $after and
# $before WEEKs before the time this script started
sub time_filter {
  my ($after, $before) = @_;

  if (/time=(\d+)/) {
    return (($time_start - $1 >= WEEK * $after)
            && ($time_start - $1 < WEEK * $before));
  }
  return 0;
}

# generate every requested output class/age combination for each daterev
sub current {
  my $classes = $opt{output_classes};
  $classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age";

  foreach my $entry (split(' ', $classes)) {
    $entry =~ /^(\S+)\.(\S+)$/;
    my $class = $1;
    my $age = $2;
    if (!$age) {
      warn "no age in $entry";
      next;
    }

    foreach my $daterev (sort keys %logs_by_daterev) {
      my $rev;
      if ($daterev !~ /\/r(\d+)/) {
        warn "bad daterev: $daterev";
        next;
      }
      $rev = $1;

      print STDERR "generating: $class.$age for $daterev\n";

      if ($class eq "NET") {
        next unless $is_net_daterev{$daterev};
      }

      gen_class ($daterev, $rev, $class, $age);
    }
  }
}

sub gen_class {
  my ($daterev, $rev, $class, $age) = @_;

  print STDERR "\ngenerating $daterev $class.$age:\n";

  return if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);

  my @ham = grep { /^ham/ } @{$logs_by_daterev{$daterev}};
  my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}};

  print STDERR "input h: " . join(' ', @ham) . "\n";
  print STDERR "input s: " . join(' ', @spam) . "\n";

  chdir $corpusdir;

  # net vs. local
  if ($class eq "NET") {
    @ham = grep { /-net-/ } @ham;
    @spam = grep { /-net-/ } @spam;
  }

  # age
  if ($age =~ /(\d+)day/) {
    my $mtime = $1;
    @ham = grep { -M $_ < $mtime } @ham;
    @spam = grep { -M $_ < $mtime } @spam;
  }

  print STDERR "selected h: " . join(' ', @ham) . "\n";
  print STDERR "selected s: " . join(' ', @spam) . "\n";

  # we cannot continue if we have no files that match the criteria...
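  # (for instance, a NET.* class when no "-net-" logs were uploaded, or an
  #  Nday age class where every log is older than N days, will leave @ham
  #  and/or @spam empty at this point)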
  # demand at least 1 ham and 1 spam file
  if (scalar @spam <= 0 || scalar @ham <= 0) {
    warn "not enough files found matching criteria ($daterev $class $age)\n";
    return;
  }

  my $dir = create_outputdir($daterev);
  my $fname = "$dir/$class.$age";

  # now, if the target file already exists, check to see if it's newer
  # than all the sources, make-style; if not, don't re-create it
  if (-f $fname) {
    my $targetfreshness = (-M $fname);
    my $needsrebuild = 0;

    foreach my $srcfile (@spam, @ham) {
      my $srcfreshness = (-M $srcfile);
      if ($targetfreshness > $srcfreshness) {   # src is fresher
        print "$fname is older than $srcfile: $targetfreshness > $srcfreshness\n";
        $needsrebuild = 1;
        last;
      }
    }

    if (!$needsrebuild) {
      print "existing: $fname, fresher than sources\n";
      return;
    }
  }

  my $when = scalar localtime time;
  print qq{creating: $fname
    started $when...
  };

  my $bytes = 0;

  if ($class eq 'LOGS') {
    foreach my $f (@ham, @spam) {
      $f =~ s/[^-\._A-Za-z0-9]+/_/gs;   # sanitize!
      my $zf = "$fname-$f.gz";
      system("gzip -c < $f > $zf.$$");
      if ($? >> 8 != 0) {
        warn "gzip -c < $f > $zf.$$ failed";
      }
      rename("$zf.$$", $zf) or warn "cannot rename $zf.$$ to $zf";
      $bytes += (-s $zf);
    }
  }
  else {
    my $tmpfname = "$fname.$$";
    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
    print OUT "# ham results used for $daterev $class $age: " . join(" ", @ham) . "\n";
    print OUT "# spam results used for $daterev $class $age: " . join(" ", @spam) . "\n";
    print OUT "# ".log_metadata_xml($daterev, @ham, @spam)."\n";
    push (@tmps, $tmpfname);

    my $flags = "";
    $flags = "-t net -s 1" if $class eq "NET";
    $flags = "-M HTML_MESSAGE" if $class eq "HTML";
    $flags = "-o" if $class eq "OVERLAP";
    $flags = "-S" if $class eq "SCOREMAP";

    if ($opt{rules_dir}) {
      $flags .= " -c '$opt{rules_dir}'";
    }

    # are we analyzing --net mass-check logs?  if so, use scoreset 1
    if (join(" ", @ham) =~ /-net-/) {
      $flags .= " -s 1" if $class eq "NET";
    }

    if ($age eq "all") {
      my %spam;
      my %ham;
      my @output;

      for my $file (@spam) {
        $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
      }
      for my $file (@ham) {
        $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
      }

      unlink "$opt{tmp}/ham.log.$$";
      unlink "$opt{tmp}/spam.log.$$";

      if (scalar keys %spam <= 0 || scalar keys %ham <= 0) {
        warn "no files found for $class.$age";
        return;
      }

      chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";

      for my $user (sort keys %spam) {
        next unless $ham{$user};
        system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
        system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");

        open(IN, "./hit-frequencies -TxpaP $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
        while (<IN>) {
          chomp;
          push @output, "$_:$user\n";
        }
        close(IN);
      }

      open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
      while (<IN>) {
        push @output, $_;
      }
      close(IN);

      for (sort sort_all @output) {
        print OUT;
      }
    }
    elsif ($age eq "age") {
      my @output;

      for my $which (("0-1", "1-2", "2-3", "3-6")) {
        my ($after, $before) = split(/-/, $which);

        # get and filter logs
        chdir $corpusdir;
        for my $type (("ham", "spam")) {
          open(TMP, "> $opt{tmp}/$type.log.$$");
          my @array = ($type eq "ham") ?
            @ham : @spam;

          for my $file (@array) {
            open(IN, $file) or warn "cannot read $file";
            while (<IN>) {
              print TMP $_ if time_filter($after, $before);
            }
            close(IN);
          }
          close (TMP);
        }

        # print out by age
        chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
        open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
        while (<IN>) {
          chomp;
          push @output, "$_:$which\n";
        }
        close(IN);
      }

      for (sort sort_all @output) {
        print OUT;
      }
    }
    elsif (@ham && @spam) {
      # get logs
      system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
      system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");

      chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
      open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
      while (<IN>) {
        print(OUT);
      }
      close(IN);
    }

    $bytes = (-s OUT);
    close(OUT);
    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
  }

  $when = scalar localtime time;
  print qq{created: $bytes bytes, finished at $when
    URL: $opt{ruleqa_url}$output_revpath

  };
}

# build the "YYYYMMDD/rNNNNNN-tag" daterev string for a log's timestamp
sub mk_daterev {
  my ($timet, $rev, $tag) = @_;
  return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
}

sub create_outputdir {
  my ($revpath) = @_;

  my $dir = $opt{html} .'/'. $revpath;
  # print "output dir: $dir\n";

  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }

  $output_revpath = $revpath;     # set the global
  $output_revpath =~ s/\//-/;     # looks nicer

  return $dir;
}

sub log_metadata_xml {
  my ($daterev, @files) = @_;
  my $str = '';

  # this is extracted into the info.xml file later by the gen_info_xml
  # script.  (the element names below are assumed; they must match
  # whatever gen_info_xml expects to find)
  foreach my $f (@files) {
    $str .= qq{
      <mclogmd file="$f">
        <daterev>$daterev</daterev>
        <rev>$revision{$f}</rev>
        <fsize>$filesize{$f}</fsize>
        <mcstartdate>$dateline{$f}</mcstartdate>
        <mtime>$mtime{$f}</mtime>
      </mclogmd>
    };
  }

  $str =~ s/\s+/ /gs;     # on a single line please
  return '<mclogmds>'.$str.'</mclogmds>';
}

sub create_rulemetadata_dir {
  my $rev = shift;
  my $dir = "$opt{html}/rulemetadata/$rev";

  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }

  return $dir;
}

sub get_rulemetadata_for_revision {
  my ($daterev, $rev) = @_;
  my $dir = create_rulemetadata_dir($rev);

  # argh.  this is silly; ~bbmass/.corpus specifies "$PWD" in its
  # "tree" path, so we have to ensure we're in the 'masses' dir
  # for this to work!
  chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";

  my $cmd = "$opt{tree}/masses/rule-qa/get-rulemetadata-for-revision ".
            "--rev=$rev --outputdir='$dir'";
  system($cmd);
  if ($? >> 8 != 0) {
    warn "'$cmd' failed";
  }
}
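# For reference, the ~/.corpus settings file read by configure() above is a
# plain "key = value" list; "#" starts a comment, and the literal string
# $PWD is replaced with the current working directory.  The keys shown are
# the ones this script consults; every value below is illustrative only:
#
#   corpus          = $PWD/corpus
#   tmp             = /tmp
#   tree            = $PWD/trunk
#   html            = $PWD/html
#   html_mode       = 0755
#   username        = bbmass
#   password        = xxxxxxxx
#   ruleqa_url      = http://ruleqa.spamassassin.org/
#   # optional overrides:
#   #rules_dir      = $PWD/trunk/rules
#   #output_classes = DETAILS.new DETAILS.all DETAILS.age
#   #rsync_command  = (full rsync command line to run instead of the default)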