#!/usr/bin/perl -w use strict; # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # sub aidbg; sub usage { my $status = shift; my $out = $status ? \*STDERR : \*STDOUT; print $out < -j=jobs specify the number of processes to run simultaneously --net turn on network checks! --mid report Message-ID from each message --debug=LIST report debugging information (default is all facilities, LIST is a comma-separated list of facilities) --progress show progress updates during check --noisy show noisier progress updates during check --showdots print a dot for each scanned message --rewrite=OUT save rewritten message to OUT (default is /tmp/out) --rules=RE Only test rules matching the given regexp RE --restart=N restart all of the children after processing N messages --deencap=RE Extract SpamAssassin-encapsulated spam mails only if they were encapsulated by servers matching the regexp RE (default = extract all SpamAssassin-encapsulated mails) --lint check rules for syntax before running --cf='config line' Additional line of configuration client/server mode options --server host:port use server mode, running on the given hostname and port --client host:port use client mode, connecting to the given hostname and port --cs_max N at most, only ever request (client)/give out (server) a maximum of N messages (defaults to 1000) --cs_timeout N in client mode, try to connect to the server every N seconds defaults to 120 in server mode, timeout messages after N seconds defaults to 300 --cs_paths_only only used in client mode. when making requests of the server, only ask for paths to the messages and not the messages themselves. useful when the client and server have the same paths to the corpus data. log options -o write all logs to stdout --loghits log the text hit for patterns (useful for debugging) --loguris log the URIs found --logmem log the memory delta (only on Linux) --hamlog=log use as ham log ('ham.log' is default) --spamlog=log use as spam log ('spam.log' is default) message selection options -n no date sorting or spam/ham interleaving --cache use cache information when selecting messages --cachedir=dir write cache info for --cache in this directory tree --all don't skip big messages message selection options, can be specified for each target --after=N only test mails received after time_t N (negative values are an offset from current time, e.g. -86400 = last day) or after date as parsed by Time::ParseDate (e.g. '-6 months') --before=N same as --after, except received times are before time_t N --scanprob=N probability of scanning a message, range 0.0 - 1.0 (default: 1.0) message selection options, can be specified for each target class --head=N only check first N ham and N spam (N messages if -n used) --tail=N only check last N ham and N spam (N messages if -n used) simple target options (implies -o and no ham/spam classification) --dir subsequent targets are directories --file subsequent targets are files in RFC 822 format --mbox subsequent targets are mbox files --mbx subsequent targets are mbx files Just left over functions we should remove at some point: --bayes report score from Bayesian classifier options used during score generation process --learn=N learn N% of messages as spam or ham --reuse reuse network checks if X-Spam-Status: is present in messages non-option arguments are used as target names (mail files and folders), the target format is: :: is "spam" or "ham" is "dir", "file", "mbx", "mbox", or "detect" (see 'perldoc Mail::SpamAssassin::ArchiveIterator) is a file or directory name. globbing of ~ and * is supported EOF exit($status); } ########################################################################### use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits $opt_mid $opt_net $opt_nosort $opt_progress $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart $opt_loguris $opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap $opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy $opt_cf $total_messages $statusevery $opt_cachedir $opt_scanprob $opt_client $opt_cs_max $opt_cs_timeout $opt_cs_paths_only $opt_server %postdata %real $svn_revision $tmpfd %reuse %orig_conf %reuse_conf $reuse_rules_loaded_p); use FindBin; # use "blib" so that we can use e.g. @@LOCAL_STATE_DIR@@; "lib" doesn't # have that stuff substituted :( use lib too, though, as a backup, # since some users might be running mass-check without "make" first use lib "$FindBin::Bin/../lib"; use lib "$FindBin::Bin/../blib/lib"; eval "use bytes"; use IO::Select; use IO::Socket; use Mail::SpamAssassin::ArchiveIterator; use Mail::SpamAssassin; use Mail::SpamAssassin::Logger; use File::Copy; use File::Spec; use Getopt::Long; use POSIX qw(strftime); use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; }; use constant HAS_IO_ZLIB => eval { require IO::Zlib; }; use Config; # default settings $opt_c = "$FindBin::Bin/../rules"; $opt_p = "$FindBin::Bin/spamassassin"; $opt_j = 1; $opt_head = 0; $opt_tail = 0; $opt_net = 0; $opt_hamlog = "ham.log"; $opt_spamlog = "spam.log"; $opt_learn = 0; $reuse_rules_loaded_p = 0; $opt_cf = []; my @ORIG_ARGV = @ARGV; GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug:s", "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net", "progress!", "rewrite:s", "showdots", "spamlog=s", "tail=i", "rules=s", "restart=i", "loguris", "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache", "cachedir=s", "noisy", "scanprob=f", "server=s", "cs_max=i", "cs_timeout=i", "cs_paths_only", "client=s", "before=s" => \&deal_with_before_after, "after=s" => \&deal_with_before_after, 'cf=s' => \@{$opt_cf}, "dir" => sub { $opt_format = "dir"; }, "file" => sub { $opt_format = "file"; }, "mbox" => sub { $opt_format = "mbox"; }, "mbx" => sub { $opt_format = "mbx"; }, "help" => sub { usage(0); }, '<>' => \&target) or usage(1); # We need IO::Zlib for client-server mode! if ( ($opt_client || $opt_server) && ! HAS_IO_ZLIB ) { die "IO::Zlib required for client/server mode!\n"; } # rules.pl is for the --reuse option, score set doesn't matter if ($opt_reuse) { my $rules_path = "$FindBin::Bin/tmp/rules.pl"; if (! -f $rules_path) { # some people specify paths relatively, whereas this needs an absolute path, # so "do the right thing"(tm). my $abs_opt_c = File::Spec->rel2abs($opt_c); system("cd $FindBin::Bin; perl ../build/parse-rules-for-masses -d $abs_opt_c"); } require $rules_path; } if ($opt_noisy) { $opt_progress = 1; # implies --progress } $opt_debug ||= 'all' if defined $opt_debug; my $user_prefs = "$opt_p/user_prefs"; # --lint # In theory we could probably use the same spamtest object as below, # but since it's probably not expecting that, and we don't want # strange things happening, create a local object. if ($opt_lint) { my $spamlint = new Mail::SpamAssassin ({ 'debug' => $opt_debug, 'rules_filename' => $opt_c, 'userprefs_filename' => $user_prefs, 'site_rules_filename' => "$opt_p/local.cf", 'userstate_dir' => "$opt_p", 'save_pattern_hits' => $opt_loghits, 'dont_copy_prefs' => 1, 'local_tests_only' => $opt_net ? 0 : 1, 'only_these_rules' => $opt_rules, 'ignore_safety_expire_timeout' => 1, 'post_config_text' => join("\n", @{$opt_cf})."\n", PREFIX => '', DEF_RULES_DIR => $opt_c, # TODO: it would be nicer for mass-check to not have to specify # this, or to use the same compiler as spamassassin, sa-update etc. LOCAL_RULES_DIR => '', }); $spamlint->debug_diagnostics(); my $res = $spamlint->lint_rules(); $spamlint->finish(); warn "lint: $res issues detected, please rerun with debug enabled for more information\n" if ($res); exit 1 if $res; } # test messages for the mass-check my @targets; if (!$opt_client) { if ($opt_f) { open(F, $opt_f) || die "cannot read target $opt_f: $!"; push(@targets, map { chomp; $_ } ); close(F); } usage(1) if !@targets; } my $spamtest = new Mail::SpamAssassin ({ 'debug' => $opt_debug, 'rules_filename' => $opt_c, 'userprefs_filename' => $user_prefs, 'site_rules_filename' => "$opt_p/local.cf", 'userstate_dir' => "$opt_p", 'save_pattern_hits' => $opt_loghits, 'dont_copy_prefs' => 1, 'local_tests_only' => $opt_net ? 0 : 1, 'only_these_rules' => $opt_rules, 'ignore_safety_expire_timeout' => 1, 'post_config_text' => join("\n", @{$opt_cf})."\n", PREFIX => '', DEF_RULES_DIR => $opt_c, LOCAL_RULES_DIR => '', }); $spamtest->compile_now(1); $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); # generated user_prefs if ($opt_reuse) { # copy current prefs if it exists $spamtest->copy_config(undef, \%orig_conf); # zeroed scores to mass_prefs my @zero = sort grep { defined $reuse{$_}->{skip} } keys %reuse; open(PREFS, ">> $opt_p/mass_prefs") || die "Unable to open $opt_p/mass_prefs: $!\nNeeded for --reuse to work properly"; for my $zero (@zero) { print PREFS "score $zero 0\n"; } close(PREFS); $spamtest->read_scoreonly_config("$opt_p/mass_prefs"); $spamtest->copy_config(undef, \%reuse_conf); $reuse_rules_loaded_p = 1; } my $who = `id -un 2>/dev/null`; my $where = `uname -n 2>/dev/null`; my $when = `date -u`; my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost'; chomp $who; chomp $where; chomp $when; chomp $host; $svn_revision = get_current_svn_revision(); # when displaying the commandline, quote any arguments which have # "questionable" characters such as spaces, pipes, etc. my $cmdline = join(' ',map { m@[^A-Za-z0-9_/\\.-]@ ? qq/"$_"/ : $_ } @ORIG_ARGV); $cmdline =~ s/\s+/ /gs; my $isowhen = strftime("%Y%m%dT%H%M%SZ", gmtime(time)); # better my $log_header = "# mass-check results from $who\@$where, on $when\n" . "# M:SA version ".$spamtest->Version()."\n" . "# SVN revision: $svn_revision\n" . "# Date: $isowhen\n" . "# Perl version: $] on $Config{archname}\n" . "# Switches: '$cmdline'\n"; my $updates = ($opt_noisy ? 100 : 10); my $total_count = 0; my $spam_count = 0; my $ham_count = 0; my $init_results = 0; my $showdots_active = ($opt_showdots || $opt_noisy); my $showdots_counter = 0; my $showdots_every = ($opt_showdots ? 1 : 20); my $AIopts = { 'opt_all' => $opt_all, }; if (!$opt_client) { # Deal with --rewrite if (defined $opt_rewrite) { my $rewrite = ($opt_rewrite ? $opt_rewrite : "/tmp/out"); open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!"; } # ArchiveIterator options for non-client mode $AIopts->{'opt_scanprob'} = $opt_scanprob; $AIopts->{'opt_cache'} = $opt_cache; $AIopts->{'opt_cachedir'} = $opt_cachedir; $AIopts->{'opt_after'} = $opt_after; $AIopts->{'opt_before'} = $opt_before; $AIopts->{'scan_progress_sub'} = \&showdots_blip; $AIopts->{'opt_want_date'} = ! $opt_n; # ensure that scanprob stuff is predictable and reproducable if (defined $opt_scanprob && $opt_scanprob < 1.0) { srand(1); } } else { # ArchiveIterator options for client mode -- tends to be simple $opt_n = 1; $AIopts->{'opt_want_date'} = 0; } ########################################################################### ## SCAN MODE my $iter = new Mail::SpamAssassin::ArchiveIterator($AIopts); # setup the AI functions if ($opt_client) { $iter->set_functions(\&wanted, \&result_client); } elsif ($opt_server) { $iter->set_functions(\&wanted_server, \&result); } else { $iter->set_functions(\&wanted, \&result); } my $messages; # normal mode as well as a server do scan mode and get a temp file if (!$opt_client) { status('starting scan stage') if ($opt_progress); # Make a temp file and delete it my $tmpf; ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die 'mass-check: failed to create temp file' unless $tmpf; unlink $tmpf or die "mass-check: unlink '$tmpf': $!"; # having opt_j or server mode means do scan in a separate process if ($opt_server || $opt_j) { if ($tmpf = fork()) { # parent waitpid($tmpf, 0); } elsif (defined $tmpf) { # child -- process using message_array generate_queue(\@targets, $tmpfd); exit; } else { die "mass-check: cannot fork: $!"; } } else { # we get here if opt_j == 0, so scan in this process generate_queue(\@targets, $tmpfd); } # we now have a temporary file with the messages to process seek($tmpfd, 0, 0); # the first line is the number of messages $total_messages = read_line($tmpfd); if (!$total_messages) { die "mass-check: no messages to process\n"; } status("completed scan stage, $total_messages messages") if ($opt_progress); } ########################################################################### ## RUN MODE if ($opt_client) { client_mode(); } else { status('starting run stage') if ($opt_progress); if ($opt_server) { server_mode(); } else { run_through_messages(); } status('completed run stage') if ($opt_progress); } # Even though we're about to exit, let's clean up after ourselves close($tmpfd) if ($tmpfd); showdots_finish(); if (defined $opt_rewrite) { close(REWRITE); } $spamtest->finish(); # exit status: did we check at least one message correctly? exit(!($ham_count || $spam_count)); ########################################################################### sub target { my ($target) = @_; # message-selection options; these can now be specified separately # for each target my %selopts = ( opt_scanprob => $opt_scanprob, opt_after => $opt_after, opt_before => $opt_before ); if (!defined($opt_format)) { push(@targets, { %selopts, target => $target }); } else { $opt_o = 1; push(@targets, { %selopts, target => "spam:$opt_format:$target" }); } } ########################################################################### sub init_results { $init_results = 1; showdots_finish(); # now, showdots only happens if --showdots was used $showdots_active = $opt_showdots; if ($opt_progress) { # round up since 100% will be caught at end already $statusevery = int($total_messages / $updates + 1); # if $messages < $updates, just give a status line per msg. $statusevery ||= 1; } return if $opt_client; if ($opt_o) { autoflush STDOUT 1; print STDOUT $log_header; } else { open(HAM, "> $opt_hamlog") || die "open of $opt_hamlog failed: $!"; open(SPAM, "> $opt_spamlog") || die "open of $opt_spamlog failed: $!"; autoflush HAM 1; autoflush SPAM 1; print HAM $log_header; print SPAM $log_header; } } sub result { my ($class, $result, $time) = @_; # don't open results files until we get here to avoid overwriting files init_results() if !$init_results; if ($class eq "s") { if ($opt_o) { print STDOUT $result; } else { print SPAM $result; } $spam_count++; } elsif ($class eq "h") { if ($opt_o) { print STDOUT $result; } else { print HAM $result; } $ham_count++; } $total_count++; if ($opt_progress) { progress($time); } } sub wanted { my ($class, $id, $time, $dataref, $format) = @_; my $out = ''; # if origid is defined, it'll be the message number from server mode my $origid; # client mode is a little crazy because we need to kluge around the fact # that the information needed to do the run is different than the # information that goes into the results. if ($opt_client) { if ($opt_cs_paths_only) { # the server message number $origid = $real{$id}; } else { # if we're a non-paths_only client, change the format and id to the real # version, make sure to remember the server's message number $origid=$id; $origid =~ s/^.+?(\d+)$/$1/; $format = $real{$id}->[2]; $id = $real{$id}->[3]; } } memory_track_start() if ($opt_logmem); # parse the message, and force it to complete my $ma = $spamtest->parse($dataref, 1); # remove SpamAssassin markup, if present and the mail was spam my $header = $ma->get_header("Received"); my $x_spam_status; if ($opt_reuse) { # get X-Spam-Status: header for rule hit resue $x_spam_status = $ma->get_header("X-Spam-Status"); } # previous hits my @previous; if ($x_spam_status) { $x_spam_status =~ s/,\s+/,/gs; if ($x_spam_status =~ m/tests=(\S*)/ && $x_spam_status !~ /\bshortcircuit=(?:ham|spam|default)\b/) { push @previous, split(/,/, $1); # we found previous tests, so move the reuse config into place unless ($reuse_rules_loaded_p) { $spamtest->copy_config(\%reuse_conf, undef); $reuse_rules_loaded_p = 1; } } } elsif ($opt_reuse) { if ($reuse_rules_loaded_p) { $spamtest->copy_config(\%orig_conf, undef); $reuse_rules_loaded_p = 0; } } if ($header && $header =~ /\bwith SpamAssassin\b/) { if (!$opt_deencap || message_should_be_deencapped($ma)) { my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1); $ma->finish(); $ma = $new_ma; } } # plugin hook to cause us to skip messages my $skip = $spamtest->call_plugins("mass_check_skip_message", { class => $class, 'time' => $time, 'id' => $id, msg => $ma }); if ($skip) { $ma->finish(); return; } # log-uris support my $status; my @uris; my $before; my $after; if ($opt_loguris) { my $pms = Mail::SpamAssassin::PerMsgStatus->new($spamtest, $ma); @uris = $pms->get_uri_list(); $pms->finish(); } else { $before = time; $status = $spamtest->check($ma); $after = time; } my @extra; # sample-based learning if ($opt_learn > 0) { my $spam; # spam learned as ham = 0.05% if ($class eq 's' && rand(100) < 0.05) { $spam = 0; } # ham learned as spam = 0.01% elsif ($class eq 'h' && rand(100) < 0.01) { $spam = 1; } # spam/ham learned correctly elsif (rand(100) < $opt_learn) { if ($class eq 's') { $spam = 1; } elsif ($class eq 'h') { $spam = 0; } else { die "unknown class, learning failed"; } } if (defined $spam) { my $result = ($spam ? "spam" : "ham"); my $status = $spamtest->learn($ma, undef, $spam, 0); my $learned = $status->did_learn(); $result = "undef" if !defined $learned; push(@extra, "learn=".$result); } } if (defined($time)) { push(@extra, "time=".$time); } if ($status && defined $status->{bayes_score}) { push(@extra, "bayes=".sprintf("%06f", $status->{bayes_score})); } if ($opt_mid) { my $mid = $ma->get_header("Message-Id"); if ($mid) { # message contains a Message-Id: while($mid =~ s/\([^\(\)]*\)//s) {}; # remove comments and $mid =~ s/^\s+|\s+$//sg; # leading and trailing spaces $mid =~ s/\s.*$//s; # keep only the first token } else { # it doesn't have a Message-Id: $mid = $id; # so build one from the id $mid =~ s,^.*/,,; # remove the path $mid = "<$mid\@$host.masses.spamassassin.org>"; # and put it together } $mid =~ tr/-A-Za-z0-9_!#%&=~<@>/./c; # replace dangerous chars with . (so regexp search just works) push(@extra, "mid=$mid"); } push(@extra, "scantime=" . ($after - $before)); push(@extra, "format=$format"); if ($opt_logmem) { my $mem = memory_track_finish(); if ($mem) { push(@extra, $mem); } } if ($reuse_rules_loaded_p) { push(@extra, "reuse=yes"); } else { push(@extra, "reuse=no"); } if ($opt_client) { push(@extra, "host=$where"); } my $yorn; my $score; my $tests; my $extra; if ($opt_loguris) { $yorn = '.'; $score = 0; $tests = join(" ", sort @uris); $extra = ''; } else { $yorn = $status->is_spam() ? 'Y' : '.'; # don't bother adjusting scores for reuse $score = $status->get_score(); # list of tests hit my @tests; push @tests, split(/,/, $status->get_names_of_tests_hit()); push @tests, split(/,/, $status->get_names_of_subtests_hit()); # hit reuse if ($x_spam_status) { # generate mapping of hits to remove hits that are marked as skip @tests = grep { !$reuse{$_}->{skip} } @tests; # add hits from previous for (@previous) { push(@tests, $reuse{$_}->{reuse}) if $reuse{$_}->{reuse}; } } $tests = join(",", sort(@tests)); $extra = join(",", @extra); } if (defined $opt_rewrite) { print REWRITE $status->rewrite_mail(); } $id =~ s/\s/_/g; # if we have an origid set, it'll be the server mode's message number, so # attach it to our result appropriately. if (defined $origid) { $out = "$origid "; } $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra); if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) { $out .= logkilled($ma, $id, "possible virus"); } if ($opt_loghits) { my $log = ''; foreach my $t (sort keys %{$status->{pattern_hits}}) { $_ = $status->{pattern_hits}->{$t}; $_ ||= ''; s/\r/\\r/gs; # fix unprintables s/\n/\\n/gs; $log .= "$t=\"$_\" "; } if ($log) { chomp $log; $out .= "# $log\n"; } } if (defined $status) { $status->finish(); } $ma->finish(); undef $ma; # clean 'em up undef $status; showdots_blip(); # print ">>>> out = $out\n"; return $out; } sub showdots_blip { return unless ($showdots_active); $showdots_counter++; if ($showdots_counter % $showdots_every == 0) { print STDERR '.'; if ($showdots_counter % (60 * $showdots_every) == 0) { print STDERR "\n"; } } } sub showdots_finish { print STDERR "\n" if ($showdots_active); $showdots_counter = 0; } # ick. We have to go grovelling through the body parts to see if a message # is a report_safe-marked-up message, because a local scanner will overwrite # any remote scanner's X-Spam-Checker-Version header. # sub message_should_be_deencapped { my ($ma) = @_; # not sure why this is undefined, but it is sometimes if (defined $ma->{body_parts} && scalar @{$ma->{body_parts}} > 0) { my $firstpart = $ma->{body_parts}->[0]; if (!$firstpart->{headers}->{'content-type'} || $firstpart->{headers}->{'content-type'} ne 'text/plain') { return 0; # not a 'report_safe' encapsulation } if (scalar @{$firstpart->{raw}} < 3) { return 0; } # too short to be a report # grab first 2 lines my $text = $firstpart->{raw}->[0] . $firstpart->{raw}->[1]; $text =~ s/\s+/ /gs; if ($text =~ /^Spam detection software, running on the system \"(\S+)\"/) { my $hname = $1; if ($hname =~ /$opt_deencap/io) { return 1; } } } return 0; # a different host marked it up. pass it through! } sub logkilled { my ($ma, $id, $reason) = @_; my $from = $ma->get_header("From") || 'undef'; my $to = $ma->get_header("To") || 'undef'; my $subj = $ma->get_header("Subject") || 'undef'; my $mid = $ma->get_header("Message-Id") || 'undef'; chomp ($from); chomp ($to); chomp ($subj); chomp ($mid); return "# skipped killfiled message ($reason): from=$from to=$to subj=$subj mid=$mid id=$id\n"; } sub progress { my ($time) = @_; $time ||= 0; # Are we at the end or otherwise at a point we should print status? Then do it. if ($total_messages == $total_count || $total_count % $statusevery == 0) { my $time = strftime("%Y-%m-%d", localtime($time)); status(sprintf("%3d%% ham: %-6d spam: %-6d date: %s", int(($total_count / $total_messages) * 100), $ham_count, $spam_count, $time)); } } sub status { my($str) = @_; my $now = strftime("%Y-%m-%d %X", localtime(time)); printf STDERR "status: %-48s now: %s\n", $str, $now; } ########################################################################### our ($mem_size, $mem_rss, $mem_shared); sub memory_track_start { if ($^O =~ /linux/i) { if (open (IN, "; close IN; if ($statm =~ /^(\d+) (\d+) (\d+) /) { $mem_size = $1; $mem_rss = $2; $mem_shared = $3; } } } } sub memory_track_finish { my $str = ''; if ($^O =~ /linux/i) { if (open (IN, "; close IN; if ($statm =~ /^(\d+) (\d+) (\d+) /) { my $size = $1; my $rss = $2; my $shared = $3; $str = sprintf ("memsz=%d,memrss=%d,memshr=%d", ($size - $mem_size), ($rss - $mem_rss), ($shared - $mem_shared)); } } } return $str; } sub get_current_svn_revision { my $revision; # this is usually "${TOPDIR}/masses" my $dir = $FindBin::Bin || "."; if (-d "$dir/.svn" || -f "$dir/svninfo.tmp") { if (-f "$dir/svninfo.tmp") { # created by build/automc/buildbot_ready for chrooted mass-checks open (SVNINFO, "< $dir/svninfo.tmp"); } else { # note, ".." since we want to pick up changes outside 'masses' # too! open (SVNINFO, "( svn info --non-interactive $dir/.. || svn info $dir/.. ) 2>&1 |"); } while () { # Revision: 383822 next unless /^Revision: (\d+)/; $revision = $1; last; } close SVNINFO; return $revision if $revision; } # this probably will never work due to Rules Project changes TODO if (open(TESTING, "$opt_c/70_testing.cf")) { chomp($revision = ); $revision =~ s/.*\$Rev:\s*(\S+).*/$1/; close(TESTING); return $revision if $revision; } return $revision || "unknown"; } ############################################################################ ## children processors, start and process, used when opt_j > 1 sub start_children { my ($count, $child, $pid, $socket) = @_; my $io = IO::Socket->new(); my $parent; # create children for (my $i = 0; $i < $count; $i++) { ($child->[$i],$parent) = $io->socketpair(AF_UNIX,SOCK_STREAM,PF_UNSPEC) or die "mass-check: socketpair failed: $!"; if ($pid->[$i] = fork) { close $parent; # disable caching for parent<->child relations my ($old) = select($child->[$i]); $|++; select($old); $socket->add($child->[$i]); aidbg "mass-check: starting new child $i (pid ".$pid->[$i].")\n"; next; } elsif (defined $pid->[$i]) { my $result; my $line; close $tmpfd if defined $tmpfd; close $child->[$i]; select($parent); $| = 1; # print to parent by default, turn off buffering send_line($parent,"START"); while ($line = read_line($parent)) { if ($line eq "exit") { close $parent; exit; } my($class, $format, $date, $where, $result) = $iter->_run_message($line); $result ||= ''; # If determine_receive_date is not set, the original input date # wasn't calculated, but run_message would have done so, so reset # the packed version if possible ... use defined for date since # it could == 0. if (!$iter->{determine_receive_date} && $class && $format && defined $date && $where) { $line = Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $where); } send_line($parent,"$result\0RESULT $line"); } exit; } else { die "mass-check: cannot fork: $!"; } } } ## handling killing off the children sub reap_children { my ($count, $socket, $pid) = @_; # If the child died, sending it the exit will generate a SIGPIPE, but we # don't really care since the readline will go undef (which is fine), # then we do the waitpid which will finish it off. So we end up in the # right state, in theory. local $SIG{'PIPE'} = 'IGNORE'; for (my $i = 0; $i < $count; $i++) { aidbg "mass-check: killing child $i (pid ",$pid->[$i],")\n"; send_line($socket->[$i],"exit"); # tell the child to die. close $socket->[$i]; waitpid($pid->[$i], 0); # wait for the signal ... } } # in server mode, this gets called to read in the HTTP request from a given # socket, then return the information the client sent to us. sub handle_http_request { my $socket = shift; my $headers = {}; my $postdata = {}; # read in the request (POST / HTTP/1.0) my $line = $socket->getline(); $line ||= ''; $line =~ s/\r\n$//; my ($type, $URI, $VERS) = $line =~ /^([a-zA-Z]+)\s+(\S+)(?:\s*(\S+))/; unless ($type && $URI && $VERS) { $type ||= ''; $URI ||= ''; return ($type, $URI, $headers, $postdata); } $type = uc $type; # read in headers, "key: value" up to a blank line do { $line = $socket->getline(); last unless defined $line; $line =~ s/\r\n$//; if ($line) { my ($k,$v) = split(/:\s*/, $line, 2); $headers->{lc $k} = $v; } } while ($line !~ /^$/); # if this is a POST request w/ content-length, there'll be a payload, deal # with it. if ($type eq 'POST' && $headers->{'content-length'}) { my $pd; $socket->read($pd, $headers->{'content-length'}); $pd =~ s/[\r\n]+$//; # a hack for manual requests/telnet/etc # key1=value1&key2=value2... %{$postdata} = map { my($k,$v) = split(/=/, $_, 2); # we need to decode the key and value $k =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg; $v =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg; $k => $v; } split(/\&/, $pd); } return($type, $URI, $headers, $postdata); } # in server mode, generate a gzip compressed data stream with the messages and # return the path to the compressed file which the server will read and pass # to the client. # # Input: # - Number of messages to generate (scalar) # - Hash of Arrays of outstanding requests (reference to hash of array refs) # timestamp# -> [ num1, num2, ... ] # Used to quickly find outstanding/timed out messages to send to client. # - Hash of outstanding messages and associated data (ref to hash of hash refs) # num1 -> { data => 'binary data from scan mode', timestamp => timestamp# } # Used later on to specify the timestamp entry to remove the entry from. # - Paths only? If true, just include the original message data in the gzip # file. Otherwise, include the message data. Useful if the client has the # corpus available via the same paths as originally specified. # # Returns: scalar path to gzip file # sub generate_messages { my($msgs, $timestamps, $msgsout, $paths_only) = @_; # Hold the message numbers we'll be sending out my @tosend = (); # Find out if any of the messages we sent out before need to be sent out # again because we haven't seen a response within the timeout. my $tooold = time - $opt_cs_timeout; foreach (sort { $a <=> $b } keys %{$timestamps}) { # since we're going in numeric order, if the current entry is newer than # the timeout value, the rest will be too, so stop looking. last if ($_ > $tooold); # how many messages do we still need to fulfill the request? my $wanted = $msgs - @tosend; if (@{$timestamps->{$_}} > $wanted) { # there are more entries in the timestamp list than we want, so just # grab that many off the list. push(@tosend, splice @{$timestamps->{$_}}, 0, $wanted); } else { # there are just enough, or not enough entries on the timestamp list to # satisfy our request, so take them all and we'll loop around. push(@tosend, @{$timestamps->{$_}}); delete $timestamps->{$_}; } # Ok, we have enough messages so we can stop now. last if (@tosend == $msgs); } # if we still have the temp file with the input messages open, we'll fillup # out message output queue with messages from there. if ($tmpfd) { while (@tosend < $msgs) { my $msg = read_line($tmpfd); # no more messages from the temp file, close it out unless ($msg) { delete $msgsout->{'curnum'}; close $tmpfd; undef $tmpfd; last; } # we got a result, so assign it a number (curnum) and store the data # appropriately, then add the new number to the queue. my $num = $msgsout->{'curnum'}++; $msgsout->{$num}->{'data'} = $msg; push(@tosend, $num); } } # ok, at this point, @tosend ought to have a list of numbers, pointers into # %{$msgsout}. turn that into a tar file. return '' unless @tosend; my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die "Can't make tempfile, exiting" unless $gzpath; close($gzfd); $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!"; # first line is the number of messages included in the file send_line($gzfd, scalar @tosend) || die "mass-check: error when writing to gz temp file\n"; # Generate an archive in the temp file foreach my $num (@tosend) { # Archive format, gzip compressed file w/ 3 parts per message: # 1- server message number in text format # 2- server index string, binary packed format # 3- message content -- unless paths_only send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n"; my $data = $msgsout->{$num}->{'data'}; send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n"; if (!$paths_only) { my $msg = ($iter->_run_message($data))[4]; send_line($gzfd, join('', @{$msg})) || die "mass-check: error when writing to gz temp file\n"; } } $gzfd->close; # update timestamp entries my $ts = time; foreach (@tosend) { $msgsout->{$_}->{'timestamp'} = $ts; } # conveniently, this list should be the only thing sent out w/ this # timestamp, so just set the reference appropriately. :) $timestamps->{$ts} = \@tosend; if ($opt_noisy) { print "generated ".scalar(@tosend)." messages\n"; } return $gzpath; } # we've gotten results posted, so clean up msgsout and timestamp hashes and # process result... sub handle_post_results { my($postdata, $timestamps, $msgsout) = @_; # local version to batch the removals my %timestamps = (); # $msgsout->{num}->{data|timestamp} # $timestamp{num} = [ msgout_nums ... ] # $postdata{num} = result_string while( my($k,$v) = each %{$postdata} ) { # message run results will be \d+ => log entry next if ($k !~ /^\d+$/); # if we've been waiting for this result, process it, otherwise throw it on # the ground. multiple clients could have been given the same messages to # process, and we take whatever the first responder sends us. if (exists $msgsout->{$k}) { # the result_sub will need parts of the message data, so get it ready my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msgsout->{$k}->{'data'}); # go ahead and do the result &{$iter->{result_sub}}($d[1], $v, $d[0]); # prep to get rid of the cached entries $timestamps{$msgsout->{$k}->{'timestamp'}}->{$k} = 1; delete $msgsout->{$k}; } } # if we got any results, clean out the results from the timestamp arrays while ( my($k,$v) = each %timestamps ) { # trim out the result list from the timestamp sent list my @temp = grep(!exists $v->{$_}, @{$timestamps->{$k}}); # if there are results left for a specific timestamp, update the array # pointer. otherwise, delete the timestamp entry since it's empty. if (@temp) { $timestamps->{$k} = \@temp; } else { delete $timestamps->{$k}; } } } # This function reads from $tmpfd and processes the message as appropriate wrt # $opt_j, $opt_restart, etc. # sub run_through_messages { # do everything in one process if ($opt_j <= 1 && !defined $opt_restart) { my $message; my $messages; my $total_count = 0; while (($total_messages > $total_count) && ($message = read_line($tmpfd))) { my($class, undef, $date, undef, $result) = $iter->_run_message($message); if ($result) { &{$iter->{result_sub}}($class, $result, $date); } $total_count++; } } # more than one process or one process with restarts else { my $select = IO::Select->new(); my $total_count = 0; my $needs_restart = 0; my @child = (); my @pid = (); my $messages; # start children processes start_children($opt_j, \@child, \@pid, $select); # feed childen, make them work for it, repeat while ($select->count()) { foreach my $socket ($select->can_read()) { my $line = read_line($socket); # some error happened during the read! if (!defined $line) { $needs_restart = 1; warn "mass-check: readline failed, attempting to recover\n"; $select->remove($socket); } elsif ($line =~ /^([^\0]*)\0RESULT (.+)$/s) { my $result = $1; my ($date,$class,$type) = Mail::SpamAssassin::ArchiveIterator::_index_unpack($2); aidbg "mass-check: $class, $type, $date\n"; if (defined $opt_restart && ($total_count % $opt_restart) == 0) { $needs_restart = 1; } # if messages remain, and we don't need to restart, send message if (($total_messages > $total_count) && !$needs_restart) { send_line($socket, read_line($tmpfd)); $total_count++; aidbg "mass-check: $total_messages $total_count\n"; } else { # stop listening on this child since we're done with it aidbg "mass-check: $needs_restart $total_messages $total_count\n"; $select->remove($socket); } # deal with the result we received if ($result) { &{$iter->{result_sub}}($class, $result, $date); } } elsif ($line eq "START") { if ($total_messages > $total_count) { # we still have messages, send one to child send_line($socket, read_line($tmpfd)); $total_count++; aidbg "mass-check: $total_messages $total_count\n"; } else { # no more messages, so stop listening on this child aidbg "mass-check: $needs_restart $total_messages $total_count\n"; $select->remove($socket); } } else { $needs_restart = 1; warn "mass-check: bad line from readline: $line\n"; $select->remove($socket); } } aidbg "mass-check: out of loop, $total_messages $total_count $needs_restart ".$select->count()."\n"; # If there are still messages to process, and we need to restart # the children, and all of the children are idle, let's go ahead. if ($needs_restart && $select->count == 0 && $total_messages > $total_count) { $needs_restart = 0; aidbg "mass-check: needs restart, $total_messages total, $total_count done\n"; reap_children($opt_j, \@child, \@pid); @child=(); @pid=(); start_children($opt_j, \@child, \@pid, $select); } } # reap children reap_children($opt_j, \@child, \@pid); } } # send an HTTP response to a socket based on the input result, headers, and # data values. sub http_response { my($socket, $result, $headers, $data) = @_; print $socket "HTTP/1.0 $result\r\n", "Pragma: no-cache\r\n", "Server: mass-check/$svn_revision\r\n", map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}; print $socket "\r\n"; print $socket $data; } # the client needs to make a request to the server on a given socket. sub http_make_request { my($socket, $type, $uri, $headers, $data) = @_; print $socket "$type $uri HTTP/1.0\r\n", "User-Agent: mass-check/$svn_revision\r\n", map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}; print $socket "\r\n"; print $socket $data; # parse the response that the server sends us my $line = $socket->getline() || ''; my(undef, $code, $string) = split(/\s+/, $line, 3); return unless $code == 200; my %headers = (); do { $line = $socket->getline(); last unless defined $line; $line =~ s/\r\n$//; if ($line) { my ($k,$v) = split(/:\s*/, $line, 2); $headers{lc $k} = $v; } } while ($line !~ /^$/); # the server has sent us notification that it's going to exit, so let's # follow suit. return 'finished' if ($headers{'finished'}); my $gzpath = ''; if ($headers{'content-length'}) { my $gzfd; ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die "Can't make tempfile, exiting" unless $gzpath; my $rd; $socket->read($rd, $headers{'content-length'}) || die "mass-check: error reading in data from server\n"; print $gzfd $rd; close $gzfd; } $socket->close(); return $gzpath; } # Be conservative -- encode most things. # we could encode spaces to plusses, then decode that later, but... sub post_encode { my $string = shift; $string =~ s/([^a-zA-Z0-9_,.\/\\-])/sprintf "%%%02x",unpack("C",$1)/egx; return $string; } # remove all of the files in a given directory, non-recursive sub clean_dir { my $dir = shift; unless (opendir(DIR, $dir)) { warn "error: can't opendir $dir: $!\n"; return; } while(my $file = readdir(DIR)) { $file =~ /^(.+)$/; # untaint $file = $1; my $path = File::Spec->catfile($dir, $file); next unless (-f $path); if (!unlink $path) { warn "error: can't remove file $path: $!\n"; closedir(DIR); return; } } closedir(DIR); return 1; } ############################################################################ # four bytes in network/vax format (little endian) as length of message # the rest is the actual message sub read_line { my $fd = shift; my($length,$msg); # read in the 4 byte length and unpack $fd->read($length, 4) || return; $length = unpack("V", $length); return unless $length; # read in the rest of the single message $fd->read($msg, $length) || return; return $msg; } sub send_line { my $fd = shift; foreach ( @_ ) { my $length = pack("V", length $_); $fd->print($length.$_) || return 0; } return 1; } ############################################################################ # this is the function that implemented server mode. basically, sit and wait # for connections to come in. when a client sends in a request, deal with any # results that the client sent, then generate a response and send it back, # and then go back to waiting. lather, rinse, repeat. sub server_mode { $opt_cs_max ||= 1000; $opt_cs_timeout ||= 60 * 5; my $serv_socket = IO::Socket::INET->new( LocalAddr => $opt_server, Proto => 'tcp', Listen => 5, ReuseAddr => 1, ); die "Could not create socket: $!\n" unless $serv_socket; if ($opt_progress) { status('server ready for connections'); } # Setup out "what messages have been sent out" hashes my $timestamps = {}; my $msgsout = { 'curnum' => 0 }; # Generate an IO::Select object and put the server socket on the queue my $select = IO::Select->new( $serv_socket ); # We'll keep looping while there's something to pay attention to while ($select->count()) { # Sit and block until there's something for us to read from foreach my $socket ($select->can_read()) { if ($socket == $serv_socket) { # it's the server socket, go ahead and accept the connection and add # it to the queue. $select->add($serv_socket->accept); } else { # it's some client, so deal with the request my($type, $URI, $headers, $postdata) = handle_http_request($socket); # we don't do GET, so just send something back if ($type eq 'GET') { if ($opt_noisy) { print "GET request from ".$socket->peerhost."\n"; } http_response($socket, "200 OK", { 'Content-type' => 'text/plain', }, "Your GET request came from IP Address: ".$socket->peerhost."\n"); } elsif ($type eq 'POST') { # ooh, POST. deal with any results that the client sent handle_post_results($postdata, $timestamps, $msgsout); if ($opt_noisy) { print "POST request from ".$socket->peerhost."\n"; } # based on the number of messages that the client requested, # generate a gzip file with the appropriate data in it my $messages = ''; if ($postdata->{'max_messages'}) { my $msgnum = $postdata->{'max_messages'}; if ($msgnum > $opt_cs_max || $msgnum < 1) { $msgnum = $opt_cs_max; } if ($opt_noisy) { print "client requested ".$postdata->{'max_messages'}." messages\n"; } $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'}); } # $messages will contain the path to the gzip file if there are # messages to send out. if ($messages && open(MSG, $messages)) { binmode(MSG); local $/ = undef; # go go slurp mode # send the response http_response($socket, "200 OK", { 'Content-Type' => 'application/x-gzip', 'Content-Encoding' => 'x-gzip', "Content-Length" => (-s $messages), }, scalar ); close(MSG); # we don't need the file anymore, so get rid of it unlink $messages; } elsif (!keys %{$msgsout} && !defined $tmpfd) { # we have no more outstanding messages and our original queue of # messages to process is empty, so tell the client to exit. http_response($socket, "200 OK", { "Content-type" => "text/plain", "Finished" => 1, }, 'We are all done'); } else { # when in doubt, treat this like a GET http_response($socket, "200 OK", { "Content-type" => "text/plain", }, "Your POST request (sans max_messages) came from IP Address: ".$socket->peerhost."\n"); } } else { # for error, "501 Not Implemented" http_response($socket, '501 Not Implemented', {}, ''); } # ok, we don't do keepalive, so get rid of the socket $select->remove($socket); $socket->close; } } if ($opt_noisy) { print scalar(keys %{$msgsout})." messages outstanding\n"; } #print "msgs waiting: ".join(" ", keys %{$msgsout})."\n"; #print "tmpfd defined? ".(defined $tmpfd ? "yes" : "no")."\n"; # we're not awaiting responses and we've exhausted the input file, so # drop the server socket. :) $select->remove($serv_socket) if (!keys %{$msgsout} && !defined $tmpfd); } } # this is the function that implements client mode. generally, in a loop: # make a request of the server for some max number of messages, and send our # results back at the same time. based on the results of that request, put # messages into a temp dir and process them. prep the results and loop. # lather, rinse, repeat. sub client_mode { $opt_cs_max ||= 1000; $opt_cs_timeout ||= 60 * 2; my($host, $uri); if ($opt_client =~ /^http:\/\/([^\/]+)(\/.*)?/) { ($host, $uri) = ($1,$2); } else { $host = $opt_client; if ($host =~ /^:/) { $host = 'localhost'.$host; } } my($http_host) = split(/:/, $host); die "No host found in opt_client" unless $host; $uri ||= "/"; # use this to track how many messages we ought to be requesting # start at 100 to get warmed up my $msgnum = $opt_cs_max > 100 ? 100 : $opt_cs_max; my $tmpdir; # if we're not doing paths_only, create a temp dir where we'll put the # incoming messages to process. if (!$opt_cs_paths_only) { $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir(); die "Can't create tempdir" unless $tmpdir; } # keep going until something stops us. while (1) { # if the number of messages to request is too much, bring it down $msgnum = $opt_cs_max if ($msgnum > $opt_cs_max); # prep the POST request $postdata{'max_messages'} = $msgnum; $postdata{'paths_only'} = 1 if ($opt_cs_paths_only); # the actual POST data string my $POSTDATA = join('&', map { post_encode($_) . '=' . post_encode($postdata{$_}) } keys %postdata); # connect to server my $socket = IO::Socket::INET->new($host); # last if connection fails last unless ($socket); print "Requesting $msgnum messages from server\n" if ($opt_noisy); # make request, include and then drop results if there are any my $result = http_make_request($socket, 'POST', $uri, { 'Host' => $http_host, 'Content-Type' => 'application/x-www-form-urlencoded', 'Content-Length' => length($POSTDATA), }, $POSTDATA ); %postdata = (); undef $POSTDATA; # If we received messages to run through, go ahead and do it. # otherwise, just sleep for the timeout length and try again if (!defined $result) { # we got an error?!? abort! last; } elsif ($result eq 'finished') { # the server said that we're done print "Server states that there is no more work, exiting.\n" if ($opt_noisy); last; } elsif ($result eq '') { # no messages means the server may give us more work down the road. # sleep for client_timeout seconds and try the request again print "Received no messages from server, waiting $opt_cs_timeout seconds\n" if ($opt_noisy); sleep $opt_cs_timeout; } else { # we got messages, so deal with them. my $time_start = time; # postdata will hold our results, real will hold the original message # data from the server's scan mode. %postdata = (); %real = (); $init_results = $total_count = $spam_count = $ham_count = 0; # we got a result, so do things with it! my $gzfd = IO::Zlib->new($result, "rb"); die "Can't open temp result file: $!" unless $gzfd; # used for the temp queue file my $tmppath; ($tmppath, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die "Can't make tempfile, exiting" unless $tmppath; unlink $tmppath; # if we have a temp directory, clean it out for this run clean_dir($tmpdir) if ($tmpdir); # Archive format, gzip compressed file w/ 3 parts per message: # 1- server message number in text format # 2- server index string, binary packed format # 3- message content, if not doing paths_only # number of messages $msgnum = $total_messages = read_line($gzfd) || die "mass-check: error reading from gzip message file\n"; status("server gave us $total_messages messages") if ($opt_progress); # loop through and prep all of the messages the server sent for(my $i = 0 ; $i < $total_messages; $i++ ) { my $num = read_line($gzfd); last unless defined $num; my $index = read_line($gzfd); last unless defined $index; # if we're doing paths_only, there'll be no message content if (!$opt_cs_paths_only) { my $msg = read_line($gzfd); last unless defined $msg; # it's going to be a dir of file formatted messages if (open(OUT, ">$tmpdir/$num")) { print OUT $msg; close(OUT); # this is a little tricky -- we need to process the files in the # path and format we've created, but the original data is needed # to create a proper result later, so deal with that here. my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($index); $real{"$tmpdir/$num"} = \@d; send_line($tmpfd, Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$tmpdir/$num")) || die "mass-check: error writing out temp file in client mode\n"; } else { warn "Can't create/write $tmpdir/$num: $!"; } } else { # in paths_only mode, there's no kluging between formats since we're # reading the same corpus, however we do still need to track server # message number to message data so our results will be useable. my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($index); $real{$d[3]} = $num; send_line($tmpfd, $index) || die "mass-check: error writing out temp file in client mode\n"; } } $gzfd->close; unlink $result; if ($opt_progress) { status('starting run stage'); } # we're about to start running, so go back to the start of the file seek $tmpfd, 0, 0; run_through_messages(); # we're done with the temp file -- bye bye close($tmpfd); # figure out new max messages, try keeping ~cs_timeout between runs my $time_end = time; # if we only requested a small number of messages, it may take <1s to # run through them, so fake it and say it took 1s. if ($time_end == $time_start) { $time_end++; } if ($opt_progress) { status('completed run stage'); } print "Completed run in ".($time_end-$time_start)." seconds\n" if ($opt_noisy); $msgnum = int($msgnum * $opt_cs_timeout / ($time_end-$time_start)) || 1; } } # if we were using a temp dir, clean it out and then remove it if ($tmpdir) { clean_dir($tmpdir); rmdir $tmpdir; } } ############################################################################ # in server mode, just return the ref to the message data sub wanted_server { my ($class, $id, $time, $dataref, $format) = @_; return $dataref; } # very similar to result() except the result has the message number at the # front, so strip it off and then set the POST data appropriately. sub result_client { my ($class, $result, $time) = @_; # don't open results files until we get here to avoid overwriting files init_results() if !$init_results; if ($class eq "s") { $spam_count++; } elsif ($class eq "h") { $ham_count++; } $total_count++; if ($opt_progress) { progress($time); } if ($result =~ s/^(\d+)\s+//m) { $postdata{$1} = $result; } else { warn ">> WTH!? result is not in the correct format: $result\n"; } } sub aidbg { if (would_log("dbg", "mass-check") == 2) { dbg (@_); } } sub deal_with_before_after { my($which, $time) = @_; if ($time && $time =~ /^-\d+$/) { $time = time + $time; } elsif ($time && $time !~ /^-?\d+$/) { if (HAS_TIME_PARSEDATE) { $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1); } else { die "You need Time::ParseDate if you use either the --before or --after option."; } } if ($which eq 'before') { $opt_before = $time; } else { $opt_after = $time; } if ($opt_before && $opt_after && $opt_after >= $opt_before) { die "--before ($opt_before) <= --after ($opt_after) -- conflict!"; } } sub generate_queue { my ($targets, $tmpfd) = @_; # scan the targets and get the number and list of messages $iter->_scan_targets($targets, sub { my($self, $date, $class, $format, $mail) = @_; push(@{$self->{$class}}, Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $mail)); } ); # deal with opt_head and opt_tail top_and_tail_messages($iter->{h}); top_and_tail_messages($iter->{s}); my $messages; if ($opt_n) { # OPT_N == 1 means don't bother sorting on message receive date # for ease of memory, we'll play with pointers $messages = $iter->{s}; undef $iter->{s}; push(@{$messages}, @{$iter->{h}}); undef $iter->{h}; } else { # OPT_N == 0 means sort on message receive date # Sort the spam and ham groups by date my @s = @{$iter->{s}}; undef $iter->{s}; my @h = @{$iter->{h}}; undef $iter->{h}; # interleave ordered spam and ham if (@s && @h) { my $ratio = @s / @h; while (@s && @h) { push @{$messages}, (@s / @h > $ratio) ? (shift @s) : (shift @h); } } # push the rest onto the end push @{$messages}, @s, @h; } # head or tail < 0 means crop the total list, negate the value appropriately if ($opt_tail < 0) { splice(@{$messages}, 0, $opt_tail); } if ($opt_head < 0) { splice(@{$messages}, -$opt_head); } my $num = $Mail::SpamAssassin::ArchiveIterator::MESSAGES = scalar(@{$messages}); # Dump out the number of messages and the message index info to # the temp file send_line($tmpfd, $num, @{$messages}); } sub top_and_tail_messages { my ($ary) = @_; if ($opt_n) { # OPT_N == 1 means don't bother sorting on message receive date # head or tail > 0 means crop each list if ($opt_tail > 0) { splice(@{$ary}, 0, -$opt_tail); } if ($opt_head > 0) { splice(@{$ary}, min ($opt_head, scalar @{$ary})); } } else { # OPT_N == 0 means sort on message receive date # Sort the spam and ham groups by date my @s = sort { $a cmp $b } @{$ary}; # head or tail > 0 means crop each list if ($opt_tail > 0) { splice(@s, 0, -$opt_tail); } if ($opt_head > 0) { splice(@s, min ($opt_head, scalar @s)); } @{$ary} = @s; } } sub min { return ($_[0] < $_[1] ? $_[0] : $_[1]); }