#!/local/perl586/bin/perl -w

# settings are located in $HOME/.corpus

use strict;
use Getopt::Long;
use vars qw( $opt_override $opt_tag );

GetOptions(
  "tag=s"      => \$opt_tag,
  "override=s" => \$opt_override,
);
$opt_override ||= '';
$opt_tag ||= 'n';       # nightly is the default

use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd qw(abs_path);
use POSIX qw(nice strftime);

use constant WEEK => 7*60*60*24;

nice(15);

# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
# the time of day when the mass-check tagging occurs; see
# http://wiki.apache.org/spamassassin/DateRev for more details.
use constant DATEREV_ADJ => - (8 * 60 * 60);

# what's the max age of mail we will accept data from? (in weeks)
# TODO: maybe this should be in ~/.corpus
my $OLDEST_HAM_WEEKS  = 72 * 4;     # 72 months = 6 years
my $OLDEST_SPAM_WEEKS = 2 * 4;      # 2 months

# ---------------------------------------------------------------------------

sub runcmd;

my $configuration = "$ENV{HOME}/.corpus";
my %cf;
my %revision = ();
my %filesize = ();
my %dateline = ();
my %mtime = ();
my %logs_by_daterev = ();
my %is_net_daterev = ();
my %time = ();
my @tmps = ();
my $time_start = time;
my $output_revpath;
my $perl_path = $^X;

configure();
init();

my $logsdir = "$cf{html}/logs";
print "reading logs from '$logsdir'\n";

locate_input();
generate_logs();
clean_up();
exit;

# ---------------------------------------------------------------------------

sub configure {
  # does a rough equivalent of a shell "source" of the config file
  open(C, $configuration) || die "open failed: $configuration: $!\n";
  my $pwd = Cwd::getcwd;

  # add 'override' options
  my @lines = (<C>, split(/\|/, $opt_override));
  foreach (@lines) {
    chomp;
    s/#.*//;
    if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
      my ($key, $val) = ($1, $2);
      $val =~ s/\$PWD/$pwd/gs;
      $cf{$key} = $val;
    }
  }
  close(C);

  $cf{output_classes} ||= "DETAILS.new DETAILS.all DETAILS.age NET.new NET.all NET.age";
}
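
# For reference: configure() reads ~/.corpus as simple "key = value" lines
# ('#' comments are stripped, and a literal "$PWD" in a value expands to the
# current working directory).  Based on the keys this script uses, a config
# might look roughly like the following -- all values are illustrative only:
#
#   tree           = $PWD/trunk          # SVN working copy containing masses/
#   html           = /var/www/ruleqa     # output root; logs live in html/logs
#   html_mode      = 0755                # mode for created output dirs
#   tmp            = /tmp/ruleqa         # scratch space for filtered logs
#   password       = secret              # exported as RSYNC_PASSWORD
#   ruleqa_url     = http://example.com/ruleqa?daterev=
#   rules_dir      = $PWD/trunk/rules    # optional; passed to hit-frequencies -c
#   output_classes = DETAILS.new DETAILS.all DETAILS.age NET.new NET.all NET.age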

# ---------------------------------------------------------------------------

sub clean_up {
  chdir "/";
  runcmd "rm -rf $cf{tmp}/*.$$ ".join(' ', @tmps);
}

# ---------------------------------------------------------------------------

sub init {
  $SIG{INT} = \&clean_up;
  $SIG{TERM} = \&clean_up;

  $ENV{RSYNC_PASSWORD} = $cf{password};
  $ENV{TIME} = '%e,%U,%S';
  $ENV{TZ} = 'UTC';
}

# ---------------------------------------------------------------------------

sub locate_input {
  opendir(CORPUS, $logsdir);
  my @files = sort readdir(CORPUS);
  closedir(CORPUS);

  @files = grep {
    /^(?:spam|ham)-(?:net-)?\S+\.log$/ && -f "$logsdir/$_" && -M _ < 10
  } @files;

  foreach my $file (@files) {
    my $tag = 0;
    my $headers = '';

    open(FILE, "$logsdir/$file") or warn "cannot read $logsdir/$file";
    while (my $line = <FILE>) {
      last if $line !~ /^#/;
      $headers .= $line;

      if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
        my ($datepre, $hh, $datepost) = ($1,$2,$3);
        my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
                                    GMT => 1, PREFER_PAST => 1);
        $time{$file} = $timet;
      }
      elsif ($line =~ m/^# Date:\s*(\S+)/) {
        # a better way to do the above.  TODO: parse it instead
        $dateline{$file} = $1;
        if (!defined $time{$file}) {
          # if the time line was unparseable (localized?), use this instead
          my ($yyyy, $mm, $dd, $h, $m, $s) = $dateline{$file} =~
                    /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
          my $timet = Time::ParseDate::parsedate(
                    "${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
                    GMT => 1, PREFER_PAST => 1);
          $time{$file} = $timet;
        }
      }
      elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
        $revision{$file} = $1;
      }
    }
    close(FILE);

    my @s = stat("$logsdir/$file");
    $filesize{$file} = $s[7];
    $mtime{$file} = $s[9];

    if (!defined $time{$file}) {
      warn "$logsdir/$file: no time found, ignored\n"; next;
    }
    if (!defined $revision{$file}) {
      warn "$logsdir/$file: no revision found, ignored\n"; next;
    }
    if ($revision{$file} eq 'unknown') {
      warn "$logsdir/$file: not tagged with a revision, ignored\n"; next;
    }

    my $daterev = mk_daterev($time{$file}, $revision{$file}, $opt_tag);
    $logs_by_daterev{$daterev} ||= [ ];
    push (@{$logs_by_daterev{$daterev}}, $file);

    if ($file =~ /-net-/) {
      $is_net_daterev{$daterev} = 1;
      print "$logsdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
    } else {
      print "$logsdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
    }

    get_rulemetadata_for_revision($daterev, $revision{$file});
  }
}

# ---------------------------------------------------------------------------

sub sort_all {
  my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
  my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
  $a1 =~ s/^[\+\-]//;
  $b1 =~ s/^[\+\-]//;

  my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));

  if ($a1 =~ /^OVERALL/)                   { $n -= 1000; }
  elsif ($a1 =~ /^\(all messages\)/)       { $n -= 100; }
  elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; }
  if ($b1 =~ /^OVERALL/)                   { $n += 1000; }
  elsif ($b1 =~ /^\(all messages\)/)       { $n += 100; }
  elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; }

  return $n;
}

# ---------------------------------------------------------------------------

sub time_filter_fileset {
  my ($fileary, $outname, $after, $before) = @_;

  my $timet_before = (defined $before ?
                        ($time_start - ($before * WEEK)) : $time_start+1);
  my $timet_after  = (defined $after ?
                        ($time_start - ($after * WEEK)) : 0);

  open(TMP, ">$outname") or warn "cannot write $outname";
  for my $file (@{$fileary}) {
    open(IN, $file) or warn "cannot read $file";
    while (<IN>) {
      next unless /\btime=(\d+)/;
      next if ($1 < $timet_after || $1 > $timet_before);
      print TMP;
    }
    close IN;
  }
  close TMP or warn "failed to close $outname";
}

# ---------------------------------------------------------------------------

sub generate_logs {
  foreach my $entry (split(' ', $cf{output_classes})) {
    $entry =~ /^(\S+)\.(\S+)$/;
    my $class = $1;
    my $rtype = $2;
    if (!$rtype) {
      warn "no rtype in $entry"; next;
    }
    if ($class eq 'HTML') {
      warn "class HTML in $entry obsolete, ignored"; next;
    }

    foreach my $daterev (reverse sort keys %logs_by_daterev) {
      my $rev;
      if ($daterev !~ /\/r(\d+)/) {
        warn "bad daterev: $daterev"; next;
      }
      $rev = $1;

      if ($class eq "NET") {
        next unless $is_net_daterev{$daterev};
      }

      gen_class ($daterev, $rev, $class, $rtype);
    }
  }
}
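
# Each $cf{output_classes} entry is "CLASS.rtype".  For an illustrative
# daterev of "20060429/r239832-n", "DETAILS.new" makes gen_class() write
# $cf{html}/20060429/r239832-n/DETAILS.new; "NET.*" reports are built only
# for daterevs that include --net mass-check logs.  The rtype selects the
# report style: "new" is the basic time-filtered freqs, "all" breaks results
# down per submitter, and "age" breaks them down into week-sized age buckets.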

# ---------------------------------------------------------------------------

my ($tmp_h, $tmp_s, $no_messages_in_freqs, $hf_flags);

sub gen_class {
  my ($daterev, $rev, $class, $rtype) = @_;

  return if ($class eq "NET" && $rtype !~ /^(?:new|all|age|7day)$/);

  chdir $logsdir;
  print STDERR "\ngenerating: $cf{html}/$daterev/$class.$rtype\n";

  my @ham = grep { /^ham/ } @{$logs_by_daterev{$daterev}};
  print STDERR "input h: " . join(' ', @ham) . "\n";
  my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}};
  print STDERR "input s: " . join(' ', @spam) . "\n";

  # net vs. local
  if ($class eq "NET") {
    @ham = grep { /-net-/ } @ham;
    @spam = grep { /-net-/ } @spam;
  }

  # age
  if ($rtype =~ /(\d+)day/) {
    my $mtime = $1;
    @ham = grep { -M $_ < $mtime } @ham;
    @spam = grep { -M $_ < $mtime } @spam;
  }

  print STDERR "selected h: " . join(' ', @ham) . "\n";
  print STDERR "selected s: " . join(' ', @spam) . "\n";

  # we cannot continue if we have no files that match the criteria...
  # demand at least 1 ham and 1 spam file
  if (scalar @spam <= 0 || scalar @ham <= 0) {
    warn "not enough files found matching criteria ($daterev $class $rtype)\n";
    return;
  }

  my $dir = create_outputdir($daterev);
  my $fname = "$dir/$class.$rtype";

  # now, if the target file already exists, check to see if it's newer
  # than all the sources, make-style; if not, don't re-create it
  if (-f $fname) {
    my $targetfreshness = (-M $fname);
    my $needsrebuild = 0;
    foreach my $srcfile (@spam, @ham) {
      my $srcfreshness = (-M $srcfile);
      if ($targetfreshness > $srcfreshness) {     # src is fresher
        print "need rebuild, $fname is older than $srcfile: $targetfreshness > $srcfreshness\n";
        $needsrebuild = 1;
        last;
      }
    }
    if (!$needsrebuild) {
      print "existing: $fname, fresher than sources\n";
      return;
    }
  }

  my $when = scalar localtime time;
  print qq{creating: $fname ($class) started $when...
};

  my $bytes = 0;
  my $tmpfname = "$fname.$$";

  if ($class eq 'LOGS') {
    $bytes = gen_report_logs($fname, \@ham, \@spam);
  }
  elsif ($class eq 'CORPUS') {
    push (@tmps, abs_path($tmpfname));
    my $cmd = "$perl_path $cf{tree}/masses/logs-to-corpus-report ".
        join(" ", @ham)." ".join(" ", @spam)." > $tmpfname";
    runcmd $cmd;
    ($? >> 8 == 0) or warn "failed to run logs-to-corpus-report";
    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
    $bytes = (-s $fname);
  }
  else {
    push (@tmps, abs_path($tmpfname));
    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
    print OUT "# ham results used for $daterev $class $rtype: " .
                                            join(" ", @ham) . "\n";
    print OUT "# spam results used for $daterev $class $rtype: " .
                                            join(" ", @spam) . "\n";
    print OUT "# ".log_metadata_xml($daterev, @ham, @spam)."\n";

    $hf_flags = "";
    $hf_flags = "-t net -s 1" if $class eq "NET";
    $hf_flags = "-o"          if $class eq "OVERLAP";
    $hf_flags = "-S"          if $class eq "SCOREMAP";

    if ($cf{rules_dir}) {
      $hf_flags .= " -c '$cf{rules_dir}'";
    }

    # are we analyzing --net mass-check logs?  if so, use scoreset 1
    if (join(" ", @ham) =~ /-net-/) {
      $hf_flags .= " -s 1" if $class eq "NET";
    }

    # catch an odd error condition, where hit-frequencies creates output
    # with no log lines included at all
    $no_messages_in_freqs = 0;

    $tmp_h = "$cf{tmp}/ham.log.$$";
    $tmp_s = "$cf{tmp}/spam.log.$$";

    if ($rtype eq "all") {
      gen_report_freqs_all($tmpfname, \@ham, \@spam, $rev);
    }
    elsif ($rtype eq "age") {
      gen_report_freqs_age($tmpfname, \@ham, \@spam, $rev);
    }
    elsif (@ham && @spam) {
      gen_report_freqs_basic($tmpfname, \@ham, \@spam, $rev);
    }

    $bytes = (-s OUT);
    close(OUT);

    if ($no_messages_in_freqs) {
      warn "ERROR: no data in freqs! aborting, leaving tmp file as $tmpfname";
      return;
    }

    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";

    # compress for certain classes
    if ($class eq "OVERLAP") {
      $fname =~ s/'//gs;
      runcmd ("rm '$fname.gz'; gzip '$fname'");
      # takes care of keeping the original around so we don't have to
      if ($? >> 8 != 0) {
        warn "gzip '$fname' failed";
      }
    }
  }

  $when = scalar localtime time;
  print qq{created: $bytes bytes, finished at $when
URL: $cf{ruleqa_url}$output_revpath

};
}

# ---------------------------------------------------------------------------

sub mk_daterev {
  my ($timet, $rev, $tag) = @_;
  return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
}
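
# Illustrative example of the mapping above: DATEREV_ADJ shifts the timestamp
# back by eight hours before the date is taken, so a mass-check timestamped
# 2006-04-29 07:30 UTC at r239832 with the default tag 'n' yields
# "20060428/r239832-n", while one timestamped 2006-04-29 09:30 UTC yields
# "20060429/r239832-n" -- i.e. each daterev "day" runs from 0800 UTC to
# 0800 UTC, matching the alignment described at the top of this script.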

# ---------------------------------------------------------------------------

sub create_outputdir {
  my ($revpath) = @_;

  my $dir = $cf{html} .'/'. $revpath;
  # print "output dir: $dir\n";

  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }

  $output_revpath = $revpath;       # set the global
  $output_revpath =~ s/\//-/;       # looks nicer

  return $dir;
}

# ---------------------------------------------------------------------------

sub log_metadata_xml {
  my ($daterev, @files) = @_;

  my $str = '';
  # this is extracted into the info.xml file later by the gen_info_xml script
  foreach my $f (@files) {
    $str .= qq{
      $daterev $revision{$f} $filesize{$f} $dateline{$f} $mtime{$f}
    };
  }
  $str =~ s/\s+/ /gs;       # on a single line please
  return ''.$str.'';
}

# ---------------------------------------------------------------------------

sub create_rulemetadata_dir {
  my $rev = shift;
  my $dir = "$cf{html}/rulemetadata/$rev";
  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }
  return $dir;
}

# ---------------------------------------------------------------------------

sub get_rulemetadata_for_revision {
  my ($daterev, $rev) = @_;
  my $dir = create_rulemetadata_dir($rev);

  # argh.  this is silly; ~bbmass/.corpus specifies "$PWD" in its
  # "tree" path, so we have to ensure we're in the 'masses' dir
  # for this to work!
  chdir "$cf{tree}/masses" or die "cannot chdir $cf{tree}/masses";

  my $cmd = "$cf{tree}/masses/rule-qa/get-rulemetadata-for-revision ".
        "--rev=$rev --outputdir='$dir'";
  runcmd($cmd);
  if ($? >> 8 != 0) {
    warn "'$cmd' failed";
  }

  chdir $logsdir;
}

# ---------------------------------------------------------------------------

sub start_hit_frequencies_at_rev {
  my ($rev, $args) = @_;
  $rev ||= 'HEAD';

  (-d "$cf{tmp}/hfdir") or runcmd("mkdir -p $cf{tmp}/hfdir");
  my $hfdir = "$cf{tmp}/hfdir/r$rev";
  my $expected_svn_file = "$hfdir/Makefile.PL";

  print "setting up hit-frequencies for r$rev in $hfdir\n";

  my $needs_checkout = 0;
  if (-d $hfdir && chdir $hfdir) {
    eval {
      # "svn up" has been observed to wedge on the ruleqa zone VM, so put
      # a timeout on it so we can recover
      local $SIG{ALRM} = sub { die "alarm\n" };   # NB: \n required
      alarm 60*60;      # an hour should be generous enough
      runcmd("svn up -r$rev");
      alarm 0;
    };
    if ($@ || $?>>8 != 0 || !-f $expected_svn_file) {
      print "simple 'svn update' failed. performing full checkout instead...\n";
      $needs_checkout = 1;
    }
  }
  else {
    $needs_checkout = 1;
  }

  if ($needs_checkout) {
    my $svnurl = get_svn_url();
    runcmd("rm -rf $hfdir");
    eval {
      local $SIG{ALRM} = sub { die "alarm\n" };   # NB: \n required
      alarm 60*60;      # an hour should be generous enough
      runcmd("svn co $svnurl\@$rev $hfdir");
      alarm 0;
    };
    if ($@ || $?>>8 != 0 || !-f $expected_svn_file) {
      die "svn co failed";
    }
  }

  chdir "$hfdir" or die "cannot chdir $hfdir";

  # ensure these are rebuilt
  runcmd "rm -f rules/70_sandbox.cf rules/72_active.cf";

  # do this twice in case Makefile.PL is rebuilt
  runcmd "( make build_rules || ( $perl_path Makefile.PL; make build_rules ) )";

  # run hit-frequencies from the freshly built checkout and leave its output
  # readable on the FREQS filehandle for the callers below.  (the exact
  # invocation and option set used here are assumed; $args carries the
  # per-report flags and the log file paths)
  open (FREQS, "$perl_path masses/hit-frequencies $args |")
        or die "cannot run hit-frequencies $args";
}

# ---------------------------------------------------------------------------

sub get_svn_url {
  # ask svn for the repository URL of the configured tree
  # (the "svn info" target path here is assumed)
  my $svnurl;
  open (SVNINFO, "svn info $cf{tree} |") or die "cannot run svn info";
  while (<SVNINFO>) {
    /URL: (.*)$/ and $svnurl = $1;
  }
  close SVNINFO or die "cannot close svn info";
  return $svnurl;
}

# ---------------------------------------------------------------------------

sub gen_report_logs {
  my ($fname, $hamref, $spamref) = @_;

  my $bytes = 0;
  foreach my $f (@$hamref, @$spamref) {
    $f =~ s/[^-\._A-Za-z0-9]+/_/gs;       # sanitize!
    my $zf = "$fname-$f.gz";
    runcmd("gzip -c < $f > $zf.$$");
    if ($? >> 8 != 0) {
      warn "gzip -c < $f > $zf.$$ failed";
    }
    rename("$zf.$$", $zf) or warn "cannot rename $zf.$$ to $zf";
    $bytes += (-s $zf);
  }

  # this is just so we won't recompress these logs if re-run
  open TOUCH, ">$fname" or warn "cannot write to $fname";
  close TOUCH;

  return $bytes;
}

# ---------------------------------------------------------------------------

sub gen_report_freqs_all {
  my ($tmpfname, $hamref, $spamref, $rev) = @_;

  my %spam;
  my %ham;
  my @output;

  for my $file (@$spamref) {
    my $u = extract_username_from_log_filename($file);
    $spam{$u} = $file;
    print "username in spam log: $u\n";
  }
  for my $file (@$hamref) {
    my $u = extract_username_from_log_filename($file);
    $ham{$u} = $file;
    print "username in ham log: $u\n";
  }

  if (scalar keys %spam <= 0 && scalar keys %ham <= 0) {
    warn "no files found";
    return;
  }

  my $tmp_h_all = "$cf{tmp}/hamall.log.$$";
  my $tmp_s_all = "$cf{tmp}/spamall.log.$$";
  unlink $tmp_h_all, $tmp_s_all;

  my %alluserkeys;
  for my $k (keys %spam, keys %ham) {
    next if exists $alluserkeys{$k};
    undef $alluserkeys{$k};
  }

  for my $user (sort keys %alluserkeys) {
    my $files_h = [];
    if ($ham{$user}) { $files_h = [ "$logsdir/$ham{$user}" ]; }
    my $files_s = [];
    if ($spam{$user}) { $files_s = [ "$logsdir/$spam{$user}" ]; }

    time_filter_fileset($files_h, $tmp_h, $OLDEST_HAM_WEEKS, undef);
    time_filter_fileset($files_s, $tmp_s, $OLDEST_SPAM_WEEKS, undef);

    start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
    while (<FREQS>) {
      chomp;
      push @output, "$_:$user\n";
    }
    close(FREQS);

    runcmd("cat $tmp_h >> $tmp_h_all");
    runcmd("cat $tmp_s >> $tmp_s_all");
  }

  if (-z $tmp_h_all && -z $tmp_s_all) {
    warn "time_filter_fileset() returned empty logs. not creating freqs!";
    return;     # we'll try again later
  }

  start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s_all $tmp_h_all");
  while (<FREQS>) {
    /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/
          and $no_messages_in_freqs = 1;
    push @output, $_;
  }
  close(FREQS);

  for (sort sort_all @output) {
    print OUT;
  }
}
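
# In the "all" report built above, each hit-frequencies line from a
# per-submitter pass is suffixed with ":username", while lines from the final
# combined pass are left unsuffixed, so every rule appears once per submitter
# plus once for the merged logs; sort_all() then groups those lines by rule
# name while forcing the OVERALL and "(all messages)" summary lines to the top.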

# ---------------------------------------------------------------------------

sub extract_username_from_log_filename {
  my $u = shift;
  # spam-someuser.log, spam-net-bb-jm.20090518-r775863-n.log
  $u =~ s/\.log$//;
  $u =~ s/.*\///;
  $u =~ s/^(h|sp)am-(?:net-)?//;
  $u =~ s/\.\d{8}-r\d+-[a-z]//;     # daterev
  return $u;
}

# ---------------------------------------------------------------------------

sub gen_report_freqs_age {
  my ($tmpfname, $hamref, $spamref, $rev) = @_;

  my @output;
  for my $which (("0-1", "1-2", "2-3", "3-6")) {
    my ($before, $after) = split(/-/, $which);

    time_filter_fileset($hamref, $tmp_h, $after, $before);
    time_filter_fileset($spamref, $tmp_s, $after, $before);

    # print out by age
    start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
    while (<FREQS>) {
      chomp;
      push @output, "$_:$which\n";
    }
    close(FREQS);
  }

  for (sort sort_all @output) {
    print OUT;
  }
}

# ---------------------------------------------------------------------------

sub gen_report_freqs_basic {
  my ($tmpfname, $hamref, $spamref, $rev) = @_;

  time_filter_fileset($hamref, $tmp_h, $OLDEST_HAM_WEEKS, undef);
  time_filter_fileset($spamref, $tmp_s, $OLDEST_SPAM_WEEKS, undef);

  if (-z $tmp_h && -z $tmp_s) {
    warn "time_filter_fileset() returned empty logs. not creating freqs!";
    return;     # we'll try again later
  }

  start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
  while (<FREQS>) {
    /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/
          and $no_messages_in_freqs = 1;
    print(OUT);
  }
  close(FREQS);
}

sub runcmd {
  my ($cmd) = @_;
  print "[$cmd]\n";
  system $cmd;
}
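
# Example invocation (the script name and values here are purely
# illustrative):
#
#   ./generate-reports --tag n --override 'tmp=/tmp/ruleqa|html=$PWD/html'
#
# --tag sets the daterev tag suffix (default 'n', for nightly), and
# --override accepts "key=value" pairs separated by '|', which configure()
# applies on top of the settings read from ~/.corpus; a literal "$PWD" in a
# value is replaced with the current working directory.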