#!/local/perl586/bin/perl -w # settings are located in $HOME/.corpus use strict; use Getopt::Long; use File::Path; use File::Copy; use Time::ParseDate; use Cwd; use POSIX qw(nice strftime); sub usage { die " usage: mk-ruleqa-reports [--tag=(n|b)] [--dir=/path/to/files] [--copylogs] [--reports='...'] [--datadir=/path/to/data] [--daterev=daterev] --copylogs copies log files to data (log, report) storage. One of --copylogs or --reports is required. Both are ok. reports can be one or more of: DETAILS.new DETAILS.age DETAILS.all NET.new NET.all NET.age SCOREMAP.new see http://wiki.apache.org/spamassassin/DateRev for info on daterevs. "; } use vars qw( $opt_dir $opt_reports $opt_tag $opt_copylogs $opt_datadir $opt_daterev ); GetOptions( "tag=s" => \$opt_tag, "dir=s" => \$opt_dir, "daterev=s" => \$opt_daterev, "reports=s" => \$opt_reports, "datadir=s" => \$opt_datadir, 'copylogs' => \$opt_copylogs ); $opt_tag ||= 'n'; # nightly is the default usage() unless ($opt_reports or $opt_copylogs); nice(15); # what's the max age of mail we will accept data from? (in weeks) # TODO: maybe this should be in ~/.corpus my $OLDEST_HAM_WEEKS = 52 * 5; # 5 years my $OLDEST_SPAM_WEEKS = 6 * 4; # 6 months # daterevs -- e.g. "20060429-r239832-r" -- are aligned to 0800 UTC, just before # the time of day when the mass-check tagging occurs; see # http://wiki.apache.org/spamassassin/DateRev for more details. use constant DATEREV_ADJ => -(8 * 60 * 60); use constant WEEK => (7 * 60 * 60 * 24); use constant OLDEST_DATEREV_SEARCHED => (24 * 60 * 60 * 30); # 1 month # --------------------------------------------------------------------------- my %conf = (); $ENV{TZ} = 'UTC'; configure("$ENV{HOME}/.corpus"); $opt_dir ||= $conf{corpus}; $opt_datadir ||= $conf{html}; # logfile metadata my %logmd = (); my $time_start = time; my $output_revpath; my %logs_by_daterev = (); my %is_net_daterev = (); my @tmps = (); $SIG{INT} = sub { clean_up(); warn "SIGINT"; exit 23; }; $SIG{TERM} = sub { clean_up(); warn "SIGTERM"; exit 23; }; $ENV{TIME} = '%e,%U,%S'; if ($opt_copylogs) { copylogs(); %logs_by_daterev = (); } mk_reports(); clean_up(); exit; # --------------------------------------------------------------------------- sub configure { my $configuration = shift; # does rough equivalent of source open(C, $configuration) || die "open failed: $configuration: $!\n"; my $pwd = Cwd::getcwd; while () { chomp; s/#.*//; if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) { my ($key, $val) = ($1, $2); $val =~ s/\$PWD/$pwd/gs; $conf{$key} = $val; } } close(C); $conf{tmp} ||= "/tmp"; } # --------------------------------------------------------------------------- sub copylogs { print "copying logs from: $opt_dir\n"; opendir(LOGDIR, $opt_dir); my @files = readdir(LOGDIR); closedir(LOGDIR); @files = grep { /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/i && -f "$opt_dir/$_" # && -M _ < 10 } @files; foreach my $file (sort @files) { parse_file_metadata($opt_dir, $file, 0); } foreach my $daterev (sort keys %logs_by_daterev) { my @ham = grep { /^ham/ } @{$logs_by_daterev{$daterev}}; my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}}; my $drdir = $daterev; $drdir =~ s/-/\//; print STDERR "\ncopying to: $opt_datadir/$drdir/LOGS.all\n"; print STDERR "input h: " . join(' ', @ham) . "\n"; print STDERR "input s: " . join(' ', @spam) . "\n"; my $dir = create_outputdir($daterev); my $bytes = 0; foreach my $f (@ham, @spam) { $f =~ s/[^-\._A-Za-z0-9]+/_/gs; # sanitize! my $srcf = "$opt_dir/$f"; my $zf = "$dir/LOGS.all-$f.gz"; my $targetfreshness = (-M $zf); my $srcfreshness = (-M $srcf); if (!defined ($srcfreshness)) { warn "no srcfile found! $srcf"; } if (defined($targetfreshness) && $targetfreshness < $srcfreshness && !-z $zf) { print "already copied: $zf\n"; next; } my $when = scalar localtime time; print "copying: $zf\nstarted $when...\n"; system("gzip -c < '$srcf' > '$zf.$$'"); if ($? >> 8 != 0) { warn "gzip -c < '$srcf' > '$zf.$$' failed"; } rename("$zf.$$", $zf) or warn "cannot rename $zf.$$ to $zf"; $bytes += (-s $zf); } my $when = scalar localtime time; print "copied: $bytes bytes, finished at $when\n"; } } # --------------------------------------------------------------------------- sub mk_reports { # find each daterev's data directory: # /home/automc/corpus/html/20070609/r545837-b/ my $threshold_date = strftime("%Y%m%d", gmtime(time - OLDEST_DATEREV_SEARCHED)); foreach my $datedir (<$opt_datadir/*>) { next unless ($datedir =~ /\/(2\d+)$/); next if ($threshold_date && $1 < $threshold_date); foreach my $revdir (<$datedir/r*>) { if ($opt_daterev) { $revdir =~ m!/(2\d+/r\d+-\S+)$!; my $dr = $1; $dr =~ s!/!-!g; if ($dr ne $opt_daterev) { warn "skipping daterev: $dr != $opt_daterev\n"; next; } } next unless -d $revdir; report_dir($revdir); } } } sub report_dir { my $dir = shift; opendir(LOGDIR, $dir); my @files = readdir(LOGDIR); closedir(LOGDIR); @files = grep { # LOGS.all-spam-mc-fast.log.gz /^LOGS\.all-(?:spam|ham)-\S+\.log\.gz$/i && -f "$dir/$_" # && -M _ < 10 } @files; foreach my $file (sort @files) { parse_file_metadata($dir, $file, 1); } foreach my $daterev (sort keys %logs_by_daterev) { my $rev; if ($daterev !~ /-r(\d+)/) { warn "bad daterev: $daterev"; next; } $rev = $1; foreach my $entry (split(' ', $opt_reports)) { my ($class, $age) = ($entry =~ /^(\S+)\.(\S+)$/); if (!$age) { warn "no age: $entry"; next; } if ($class eq 'HTML') { warn "class HTML obsolete: $entry"; next; } if ($class eq "NET" && !$is_net_daterev{$daterev}) { next; } eval { gen_class ($daterev, $rev, $class, $age); 1; } or warn "gen_class failed: $@"; } } } our $no_msgs; sub gen_class { my ($daterev, $rev, $class, $age) = @_; my @ham = grep { /LOGS\.all-ham/ } @{$logs_by_daterev{$daterev}}; my @spam = grep { /LOGS\.all-spam/ } @{$logs_by_daterev{$daterev}}; if ($class eq "NET") { @ham = grep { /-net-/ } @ham; @spam = grep { /-net-/ } @spam; } if ($opt_tag eq 'b') { @ham = grep { /-mc-/ } @ham; @spam = grep { /-mc-/ } @spam; } else { @ham = grep { !/-mc-/ } @ham; @spam = grep { !/-mc-/ } @spam; } my $drdir = $daterev; $drdir =~ s/-/\//; print STDERR "\ngenerating: $opt_datadir/$drdir/$class.$age\n"; print STDERR "input h: " . join(' ', @ham) . "\n"; print STDERR "input s: " . join(' ', @spam) . "\n"; # we cannot continue if we have no files that match the criteria... # demand at least 1 ham and 1 spam file if (scalar @spam <= 0 || scalar @ham <= 0) { die "not enough files found matching criteria ($daterev $class $age)\n"; } $daterev =~ /r(\d+)/; get_rulemetadata_for_revision($daterev, $1); # not really creating here; it already exists by now due to copylogs() my $dir = create_outputdir($daterev); my $fname = "$dir/$class.$age"; # now, if the target file already exists, check to see if it's newer # than all the sources, make-style; if not, don't re-create it if (-f $fname) { my $targetfreshness = (-M $fname); my $needsrebuild = 0; foreach my $srcfile (@spam, @ham) { my $srcfreshness = (-M "$dir/$srcfile"); if (!defined ($srcfreshness)) { warn "no srcfile found! $dir/$srcfile"; last; } if (defined($targetfreshness) && defined($srcfreshness) && $targetfreshness > $srcfreshness) { # src is fresher print "$fname is older than $dir/$srcfile: $targetfreshness > $srcfreshness\n"; $needsrebuild = 1; last; } } if (!$needsrebuild) { print "existing: $fname, fresher than sources\n"; return; } } my $when = scalar localtime time; print qq{creating: $fname started $when... }; my $bytes = 0; my $tmpfname = "$fname.$$"; open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname"; push @tmps, $tmpfname; print OUT log_metadata($daterev, $class, $age, \@ham, \@spam); my @pathspam = map { "$dir/$_" } @spam; my @pathham = map { "$dir/$_" } @ham; my $flags = ""; $flags = "-t net -s 1" if $class eq "NET"; $flags = "-o" if $class eq "OVERLAP"; $flags = "-S" if $class eq "SCOREMAP"; if ($conf{rules_dir}) { $flags .= " -c '$conf{rules_dir}'"; } # are we analyzing --net mass-check logs? if so, use scoreset 1 # jm: always use set 1 if the logs allow it if (join(" ", @ham) =~ /-net-/) { $flags .= " -s 1"; # if $class eq "NET"; } # catch an odd error condition, where hit-frequencies creates output # with no log lines included at all $no_msgs = 0; my $tmp_h = "$conf{tmp}/ham.log.$$"; my $tmp_s = "$conf{tmp}/spam.log.$$"; unlink $tmp_h; unlink $tmp_s; my @output = (); if ($age eq "all") { my %spam; my %ham; # LOGS.all-spam-bb-jm.log.gz for my $f (@pathspam) { $spam{$1} = $f if ($f =~ m/LOGS.all-spam-(\w[-\w]+)\.log/); } for my $f (@pathham) { $ham{$1} = $f if ($f =~ m/LOGS.all-ham-(\w[-\w]+)\.log/); } if (scalar keys %spam <= 0 || scalar keys %ham <= 0) { warn "no files found for $class.$age"; return; } my $tmp_all_h = $tmp_h.".all.$$"; my $tmp_all_s = $tmp_s.".all.$$"; for my $user (sort keys %spam) { next unless $ham{$user}; time_filter_fileset([ "$dir/$ham{$user}" ], $tmp_h, $OLDEST_HAM_WEEKS, undef); time_filter_fileset([ "$dir/$spam{$user}" ], $tmp_s, $OLDEST_SPAM_WEEKS, undef); push @output, run_hit_frequencies($flags, $user, $tmp_s, $tmp_h); system("cat $tmp_h >> $tmp_all_h"); system("cat $tmp_s >> $tmp_all_s"); } push @output, run_hit_frequencies($flags, undef, $tmp_all_s, $tmp_all_h); for (sort sort_all @output) { print OUT; } unlink ($tmp_all_h, $tmp_all_s); } elsif ($age eq "age") { for my $which (("0-1", "1-2", "2-3", "3-6")) { my ($before, $after) = split(/-/, $which); time_filter_fileset(\@pathham, $tmp_h, $after, $before); time_filter_fileset(\@pathspam, $tmp_s, $after, $before); push @output, run_hit_frequencies($flags, $which, $tmp_s, $tmp_h); } for (sort sort_all @output) { print OUT; } } elsif ($age eq "new") { time_filter_fileset(\@pathham, $tmp_h, $OLDEST_HAM_WEEKS, undef); time_filter_fileset(\@pathspam, $tmp_s, $OLDEST_SPAM_WEEKS, undef); print OUT run_hit_frequencies($flags, undef, $tmp_s, $tmp_h); } else { warn "bad age $age"; } if ($no_msgs) { warn "ERROR: no data in freqs! aborting, leaving tmp file as $tmpfname"; return; } $bytes = (-s OUT); close(OUT); rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname"; unlink ($tmp_h, $tmp_s); # compress for certain classes if ($class eq "OVERLAP") { $fname =~ s/'//gs; system ("gzip '$fname'"); # takes care of keeping the original around so we don't have to if ($? >> 8 != 0) { warn "gzip '$fname' failed"; } } $when = scalar localtime time; print qq{created: $bytes bytes, finished at $when URL: $conf{ruleqa_url}$output_revpath }; } sub run_hit_frequencies { my ($flags, $suffix, $spamf, $hamf) = @_; if (-z $hamf && -z $spamf) { warn "time_filter_fileset() returned empty logs. not creating freqs!"; return; # we'll try again later } my @output = (); my $origwd = Cwd::getcwd; { chdir "$conf{tree}/masses" or die "cannot chdir $conf{tree}/masses"; print "[hit-frequencies -TxpaP $flags '$spamf' '$hamf']\n"; open(IN, "./hit-frequencies -TxpaP $flags '$spamf' '$hamf' |"); while() { chomp; /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_msgs = 1; push @output, $_ . ($suffix ? ":$suffix" : "") . "\n"; } close(IN) or die "hit-frequencies failed"; chdir $origwd or die "cannot return to $origwd"; } return @output; } sub log_metadata { my ($daterev, $class, $age, $ham, $spam) = @_; # LOGS.all-ham-bb-jm.log.gz => ham-bb-jm.log my @h_nice = map { my $x = $_; $x=~s/^LOGS\.all-//; $x=~s/\.gz$//; $x } @$ham; my @s_nice = map { my $x = $_; $x=~s/^LOGS\.all-//; $x=~s/\.gz$//; $x } @$spam; my $out = ""; $out .= "# ham results used for $daterev $class $age: ".join(" ", @h_nice)."\n"; $out .= "# spam results used for $daterev $class $age: ".join(" ", @s_nice)."\n"; $out .= "# ".log_metadata_xml($daterev, @$ham, @$spam)."\n"; return $out; } # --------------------------------------------------------------------------- sub sort_all { my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); $a1 =~ s/^[\+\-]//; $b1 =~ s/^[\+\-]//; my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || '')); if ($a1 =~ /^OVERALL/) { $n -= 1000; } elsif ($a1 =~ /^\(all messages\)/) { $n -= 100; } elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; } if ($b1 =~ /^OVERALL/) { $n += 1000; } elsif ($b1 =~ /^\(all messages\)/) { $n += 100; } elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; } return $n; } sub time_filter_fileset { my ($fileary, $outname, $after, $before) = @_; my $timet_before = (defined $before ? ($time_start - ($before * WEEK)) : $time_start+1); my $timet_after = (defined $after ? ($time_start - ($after * WEEK)) : 0); open(TMP, "> $outname") or warn "cannot write $outname"; for my $file (@{$fileary}) { if ($file =~ /\.gz$/) { open(IN, "gunzip -cd < '$file' |") or warn "cannot gunzip $file"; } else { open(IN, $file) or warn "cannot read $file"; } while () { next unless /\btime=(\d+)/; next if ($1 < $timet_after || $1 > $timet_before); print TMP; } close IN; } close TMP or warn "failed to close $outname"; } # --------------------------------------------------------------------------- sub parse_file_metadata { my $dir = shift; my $file = shift; my $usecache = shift; my @s = stat("$dir/$file"); my $fsize = $s[7]; my $mtime = $s[9]; my $cached = { }; if ($usecache && open (CACHE, "<$dir/LOGS.metadata")) { while () { /^\Q$file\E\|(.*)$/ or next; foreach my $kvpair (split (/\|/, $1)) { $kvpair =~ /^(.*?)=(.*)$/; $cached->{$1} = $2; } last; # use the first cached set } close CACHE; } if ($cached->{time_t} && $cached->{fsize} == $fsize && $cached->{mtime} == $mtime) { # hooray, we can use the cached copy $logmd{$file} = $cached; } else { # read it from the gzipped logfile $logmd{$file}->{fsize} = $fsize; $logmd{$file}->{mtime} = $mtime; if ($file =~ /\.gz$/) { open(FILE, "gunzip -cd < '$dir/$file' |") or warn "cannot read $dir/$file"; } else { open(FILE, "$dir/$file") or warn "cannot read $dir/$file"; } while (my $line = ) { last if $line !~ /^#/; if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) { my ($datepre, $hh, $datepost) = ($1,$2,$3); my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost, GMT => 1, PREFER_PAST => 1); $logmd{$file}->{time_t} = $timet; } elsif ($line =~ m/^# Date:\s*(\S+)/) { # a better way to do the above. TODO: parse it instead $logmd{$file}->{startdate} = $1; } elsif ($line =~ m/^# SVN revision:\s*(\S+)/) { $logmd{$file}->{revision} = $1; } } close(FILE); # and write to the cache if ($usecache) { my $line = "$file"; foreach my $k (keys %{$logmd{$file}}) { $line .= "|".$k."=".$logmd{$file}->{$k}; } if (open (CACHE, ">>$dir/LOGS.metadata")) { print CACHE $line."\n"; close CACHE or unlink "$dir/LOGS.metadata"; } } } if (!defined $logmd{$file}->{time_t}) { warn "$dir/$file: no time found, ignored\n"; return; } if (!defined $logmd{$file}->{revision}) { warn "$dir/$file: no revision found, ignored\n"; return; } if ($logmd{$file}->{revision} eq 'unknown') { warn "$dir/$file: not tagged with a revision, ignored\n"; return; } my $daterev = mk_daterev($logmd{$file}->{time_t}, $logmd{$file}->{revision}, $opt_tag); $logs_by_daterev{$daterev} ||= [ ]; push @{$logs_by_daterev{$daterev}}, $file; if ($file =~ /-net-/) { $is_net_daterev{$daterev} = 1; print "$dir/$file: rev=$daterev time=$logmd{$file}->{time_t} (set 1)\n"; } else { print "$dir/$file: rev=$daterev time=$logmd{$file}->{time_t} (set 0)\n"; } } # --------------------------------------------------------------------------- sub mk_daterev { my ($timet, $rev, $tag) = @_; return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "-r$rev-$tag"; } sub create_outputdir { my ($revpath) = @_; $revpath =~ s/-/\//; my $dir = $opt_datadir .'/'. $revpath; # print "output dir: $dir\n"; if (!-d $dir) { my $prevu = umask 0; mkpath([$dir], 0, (oct($conf{html_mode}) || 0755)) or warn "failed to mkdir $dir"; umask $prevu; } $output_revpath = $revpath; # set the global $output_revpath =~ s/\//-/; # looks nicer return $dir; } # --------------------------------------------------------------------------- sub clean_up { chdir "/"; system "rm -rf $conf{tmp}/*.$$ '".join("' '", @tmps)."'"; } # --------------------------------------------------------------------------- sub log_metadata_xml { my ($daterev, @files) = @_; my $str = ''; # this is extracted into the info.xml file later by the gen_info_xml script foreach my $f (@files) { my $nice_f = $f; # LOGS.all-ham-bb-jm.log.gz => ham-bb-jm.log $nice_f =~ s/^LOGS\.all-//; $nice_f =~ s/\.gz$//; $str .= qq{ $daterev $logmd{$f}->{revision} $logmd{$f}->{fsize} $logmd{$f}->{startdate} $logmd{$f}->{mtime} }; } $str =~ s/\s+/ /gs; # on a single line please return ''.$str.''; } sub get_rulemetadata_for_revision { my ($daterev, $rev) = @_; my $dir = create_rulemetadata_dir($rev); # argh. this is silly; ~bbmass/.corpus specifies "$PWD" in its # "tree" path, so we have to ensure we're in the 'masses' dir # for this to work! { my $origwd = Cwd::getcwd; chdir "$conf{tree}/masses" or die "cannot chdir $conf{tree}/masses"; my $cmd = "rule-qa/get-rulemetadata-for-revision ". "--rev=$rev --outputdir='$dir'"; system($cmd); if ($? >> 8 != 0) { warn "'$cmd' failed"; } chdir $origwd or die "cannot return to $origwd"; } } sub create_rulemetadata_dir { my $rev = shift; my $dir = "$opt_datadir/rulemetadata/$rev"; if (!-d $dir) { my $prevu = umask 0; mkpath([$dir], 0, (oct($conf{html_mode}) || 0755)) or warn "failed to mkdir $dir"; umask $prevu; } return $dir; }