#!/usr/bin/perl
#
# Corpus housekeeping script.  Given a corpus directory (--dir) it:
#   1. renames freshly submitted "ham-*.log"/"spam-*.log" files so the SVN
#      revision found in their header becomes part of the file name,
#   2. indexes the renamed logs by revision and header date,
#   3. deletes every log of a revision once that revision is older than the
#      retention period (9 days when the revision has a "-net-" log,
#      3 days otherwise),
#   4. e-mails a list of logs whose header date is earlier than 09:00 on
#      their own day.

#my $email_to = 'pds@apache.org';
my $email_to = 'ruleqa@spamassassin.apache.org';

use strict;
use warnings;
use Getopt::Long;

our ( $corpusdir );
GetOptions(
    "dir=s" => \$corpusdir,
);

use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd;
use POSIX qw(nice strftime);

nice(15);

# Without a directory every path below would be built from undef; fail loudly
# instead of silently doing nothing.
die "usage: $0 --dir=<corpus directory>\n"
    unless defined $corpusdir && length $corpusdir;

my %revision        = ();    # log file name => SVN revision from its header
my %logs_by_rev     = ();    # SVN revision  => [ log file names ]
my %is_net_revision = ();    # SVN revision  => 1 if any of its logs is "-net-"
my %dateline        = ();    # log file name => raw "# Date:" header value
my %time            = ();    # log file name => epoch of 09:00 on the header date
my @files;
my $time_start      = time;
my %revision_date   = ();    # SVN revision  => earliest %time value of its logs
my %before_nine     = ();    # log file name => header epoch, if before 09:00

my $delete_weekly  = 60*60*24*9;    # retention (seconds) for net revisions
my $delete_nightly = 60*60*24*3;    # retention (seconds) for all other revisions

rename_corpus();
read_files();
cleanup_old();
email_beforenine();

# parse_header_date($date_line)
#
# Parses a "# Date:" header value of the form YYYYMMDDThhmmssZ.
# Returns a two-element list: (epoch of the stamp itself, epoch of 09:00:00
# on the same day), both as returned by Time::ParseDate::parsedate.
# Returns the empty list when the value does not have the expected shape.
sub parse_header_date {
    my ($date_line) = @_;

    my ($yyyy, $mm, $dd, $h, $m, $s) =
        $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
    return unless defined $yyyy;

    # parsedate() returns a (seconds, remainder) pair in list context, so
    # each call is deliberately assigned to a scalar.
    my $stamp = Time::ParseDate::parsedate(
        "${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
        GMT => 1, PREFER_PAST => 1);
    my $nine_am = Time::ParseDate::parsedate(
        "${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
        GMT => 1, PREFER_PAST => 1);

    return ($stamp, $nine_am);
}

# rename_corpus()
#
# Looks for recent (modified less than 10 days ago) ham/spam logs in
# $corpusdir that do not yet carry a ".r<revision>" suffix, reads their
# leading "#" header lines and:
#   - records the file in %before_nine when its "# Date:" stamp is earlier
#     than 09:00 on that day,
#   - renames "name.log" to "name.r<rev>.log" when a "# SVN revision:" header
#     was found.
# Note that %before_nine is keyed by the name the file had before renaming.
sub rename_corpus {
    opendir(my $dh, $corpusdir)
        or die "cannot open directory $corpusdir: $!\n";
    my @rfiles = sort readdir($dh);
    closedir($dh);

    @rfiles = grep {
        /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/
            && !(/\.r[0-9]+\.log$/)
            && -f "$corpusdir/$_"
            && -M _ < 10
    } @rfiles;

    foreach my $file (@rfiles) {
        my $rev;

        my $fh;
        unless (open($fh, '<', "$corpusdir/$file")) {
            warn "cannot read $corpusdir/$file: $!";
            next;    # nothing to read; do not touch this file
        }

        while (my $line = <$fh>) {
            last if $line !~ /^#/;    # header ends at the first non-comment line

            if ($line =~ m/^# Date:\s*(\S+)/) {
                my ($timet, $timetgt) = parse_header_date($1);
                if (defined $timet && defined $timetgt && $timet < $timetgt) {
                    $before_nine{$file} = $timet;
                }
            }
            if ($line =~ m/^# SVN revision:\s*(\S+)/) {
                $rev = $1;
            }
        }
        close($fh);

        if ($rev) {
            my $newfile = $file;
            $newfile =~ s/\.log$/.r$rev.log/;
            rename("$corpusdir/$file", "$corpusdir/$newfile")
                or warn "cannot rename $corpusdir/$file to $corpusdir/$newfile: $!";
        }
    }
}

# read_files()
#
# Scans $corpusdir for recent (modified less than 10 days ago) logs that
# already carry a ".r<revision>" suffix and fills the file-level indexes
# (%dateline, %time, %revision, %logs_by_rev, %is_net_revision) from their
# header lines.  %revision_date ends up holding, per revision, the earliest
# %time value among that revision's logs.
sub read_files {
    opendir(my $dh, $corpusdir)
        or die "cannot open directory $corpusdir: $!\n";
    @files = sort readdir($dh);
    closedir($dh);

    @files = grep {
        /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/
            && -f "$corpusdir/$_"
            && -M _ < 10
    } @files;

    foreach my $file (@files) {
        my $fh;
        unless (open($fh, '<', "$corpusdir/$file")) {
            warn "cannot read $corpusdir/$file: $!";
            next;
        }

        while (my $line = <$fh>) {
            last if $line !~ /^#/;    # header ends at the first non-comment line

            if ($line =~ m/^# Date:\s*(\S+)/) {
                # raw value is kept in case the time line is unparseable
                # (localized?)
                $dateline{$file} = $1;

                # Only the 09:00 epoch of the header's day is stored, so all
                # logs from the same day compare equal.
                my (undef, $timetgt) = parse_header_date($dateline{$file});
                $time{$file} = $timetgt if defined $timetgt;
            }
            if ($line =~ m/^# SVN revision:\s*(\S+)/) {
                my $rev = $1;
                $revision{$file} = $rev;
                push @{ $logs_by_rev{$rev} }, $file;
                if ($file =~ /-net-/) {
                    $is_net_revision{$rev} = 1;
                }
            }
        }
        close($fh);

        if ($time{$file} && $revision{$file}) {
            my $rev = $revision{$file};
            # keep the earliest date seen for this revision
            if (!defined $revision_date{$rev}
                || $time{$file} < $revision_date{$rev})
            {
                $revision_date{$rev} = $time{$file};
            }
        }
    }
}

# cleanup_old()
#
# Deletes every indexed log belonging to a revision whose earliest date is
# older than its retention period, measured back from the script start time:
# $delete_weekly for revisions flagged in %is_net_revision, $delete_nightly
# for the rest.
sub cleanup_old {
    my @stale;

    foreach my $rev (keys %revision_date) {
        my $max_age = $is_net_revision{$rev} ? $delete_weekly : $delete_nightly;
        my $target_date = $time_start - $max_age;

        if ($revision_date{$rev} < $target_date) {
            push @stale, @{ $logs_by_rev{$rev} };
        }
    }

    foreach my $path (map { "$corpusdir/$_" } @stale) {
        # A path may be listed twice; only complain when the file is
        # actually still there after the attempt.
        next if unlink($path);
        warn "cannot delete $path: $!" if -e $path;
    }
}

# email_beforenine()
#
# Sends one plain-text mail to $email_to (through "sendmail -t") listing every
# file recorded in %before_nine together with its header time.  Does nothing
# when no such file was seen.
sub email_beforenine {
    return unless %before_nine;

    my $from    = 'automc@sa-vm1.apache.org';
    my $subject = '[corpus-cleanup] Early runners';

    my $message = "The following files were submitted by early runners:\n\n";
    foreach my $file (sort keys %before_nine) {
        # The fields come from gmtime(), so the offset is written literally;
        # strftime's %z would report the host's local zone instead.
        my $time = strftime("%Y-%m-%d %H:%M:%S +0000",
                            gmtime($before_nine{$file}));
        $message .= "$file - Started at $time\n";
    }
    $message .= "\nPlease run automasscheck after 0900 UTC";

    my $mail;
    unless (open($mail, '|-', '/usr/sbin/sendmail', '-t')) {
        warn "cannot run /usr/sbin/sendmail: $!";
        return;
    }

    # Email Header
    print {$mail} "To: $email_to\n";
    print {$mail} "From: $from\n";
    print {$mail} "Subject: $subject\n";
    print {$mail} "MIME-Version: 1.0\n";
    print {$mail} "Content-Type: text/plain; charset=UTF-8\n";
    print {$mail} "Content-Transfer-Encoding: 8bit\n";
    print {$mail} "\n";

    # Email Body
    print {$mail} $message;

    close($mail)
        or warn "sendmail did not complete cleanly: " . ($! || "exit status $?");
}