#!/usr/bin/perl -w use strict; # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # sub aidbg; sub usage { my $status = shift; my $out = $status ? \*STDERR : \*STDOUT; print $out < -j=jobs specify the number of processes to run simultaneously --net turn on network checks --mid report Message-ID from each message --debug[=LIST] report debugging information (default is all facilities, LIST is a comma-separated list of facilities) --rewrite=OUT save rewritten message to OUT (default is /tmp/out) --rules=RE Only test rules matching the given regexp RE --restart=N restart all of the children after processing N messages --deencap=RE Extract SpamAssassin-encapsulated spam mails only if they were encapsulated by servers matching the regexp RE (default = extract all SpamAssassin-encapsulated mails) --lint check rules for syntax before running --cf='config line' Additional line of configuration --pre='config line' Additional line of ".pre" (prepended to configuration) --run_post_scan='command' Run the named command after the 'scan' stage, before starting the 'run' stage verbosity options --progress show progress updates during check --noisy show noisier progress updates during check --showdots print a dot for each scanned message client/server mode options --server=host:port use server mode, running on the given hostname and port --client=host:port use client mode, connecting to the given hostname and port --cs_conn_retries=N only used in client mode. set the number of times to retry the initial connection to the server, while waiting 60 seconds between connection attempts, default is 60 retries --cs_max=N at most, only ever request (client)/give out (server) a maximum of N messages (defaults to 1000) --cs_timeout=N in client mode, try to connect to the server every N seconds defaults to 120 in server mode, timeout messages after N seconds defaults to 300 --cs_paths_only only used in client mode. when making requests of the server, only ask for paths to the messages and not the messages themselves. useful when the client and server have the same paths to the corpus data. --cs_ssl use SSL to encrypt on-the-wire client-server traffic (requires IO::Socket::SSL, see http://wiki.apache.org/spamassassin/SslMassCheck for more) --cs_verbose Log network bandwidth utilization figures and other statistics --cs_schedule_cache Distribute messages so that they are checked on clients who have the messages in their local message cache, implies --cs_cache; if --cs_schedule_cache is not enabled, but --cs_cache is, clients running in --cs_paths_only mode will opportunistically use the messages from their local cache --cs_cache in client mode, enable the local message cache in server mode, allows the clients to use cached messages and/or add to their local caches --cs_cachedir=dir write cache info for --cs_cache in this directory tree --cs_max_tries=N maximum number of attempts to have a client scan a message defaults to 3 log options -o write all logs to stdout --loghits log the text hit for patterns (useful for debugging) --loguris log the URIs found --logmem log the memory delta (only on Linux) --hamlog=log use as ham log ('ham.log' is default) --spamlog=log use as spam log ('spam.log' is default) message selection options -n no date sorting or spam/ham interleaving --cache use cache information when selecting messages --cachedir=dir write cache info for --cache in this directory tree --all don't skip big messages message selection options, can be specified for each target --after=N only test mails received after time_t N (negative values are an offset from current time, e.g. -86400 = last day) or after date as parsed by Time::ParseDate (e.g. '-6 months') --before=N same as --after, except received times are before time_t N --scanprob=N probability of scanning a message, range 0.0 - 1.0 (default: 1.0) message selection options, can be specified for each target class --head=N only check first N ham and N spam (N messages if -n used) --tail=N only check last N ham and N spam (N messages if -n used) simple target options (implies -o and no ham/spam classification) --dir subsequent targets are directories --file subsequent targets are files in RFC 822 format --mbox subsequent targets are mbox files --mbx subsequent targets are mbx files Just left over functions we should remove at some point: --bayes report score from Bayesian classifier options used during score generation process --learn=N learn N% of messages as spam or ham --reuse reuse network checks if X-Spam-Status: is present in messages (note: both clients and servers in c/s mode need this) non-option arguments are used as target names (mail files and folders), the target format is: :: is "spam" or "ham" is "dir", "file", "mbx", "mbox", or "detect" is a file or directory name. globbing of ~ and * is supported (see http://wiki.apache.org/spamassassin/MassCheck for more details.) EOF exit($status); } ########################################################################### use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits $opt_mid $opt_net $opt_nosort $opt_progress $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart $opt_loguris $opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap $opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy $opt_cf $total_messages $statusevery $opt_cachedir $opt_scanprob $opt_client $opt_cs_conn_retries $opt_cs_max $opt_cs_timeout $opt_cs_paths_only $opt_server %postdata %real $svn_revision $opt_cs_ssl $opt_run_post_scan $opt_cs_verbose %client_caches %server_caches @cache_tmp_files %min_other_caches %unique_cache_completed $opt_cs_schedule_cache $opt_cs_cache $opt_cs_cachedir $opt_cs_max_tries $opt_pre $tmpfd); use FindBin; # use "blib" so that we can use e.g. @@LOCAL_STATE_DIR@@; "lib" doesn't # have that stuff substituted :( use lib too, though, as a backup, # since some users might be running mass-check without "make" first use lib "$FindBin::Bin/../lib"; use lib "$FindBin::Bin/../blib/lib"; eval "use bytes"; use IO::Select; use IO::Socket; use Socket qw(); use Mail::SpamAssassin::ArchiveIterator; use Mail::SpamAssassin; use Mail::SpamAssassin::Logger; use File::Copy; use File::Spec; use Getopt::Long; use POSIX qw(strftime); use Fcntl qw(O_RDWR O_CREAT);; use Config; use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; }; use constant HAS_IO_ZLIB => eval { require IO::Zlib; }; use constant HAS_IO_SOCKET_SSL => eval { require IO::Socket::SSL; }; use constant HAS_TIME_HI_RES => eval { require Time::HiRes; }; use constant HAS_SDBM_FILE => eval { require SDBM_File; }; # default settings $opt_c = "$FindBin::Bin/../rules"; $opt_p = "$FindBin::Bin/spamassassin"; $opt_j = 1; $opt_head = 0; $opt_tail = 0; $opt_net = 0; $opt_hamlog = "ham.log"; $opt_spamlog = "spam.log"; $opt_learn = 0; $opt_cf = []; $opt_pre = []; my $rcvd_bytes = 0; my $sent_bytes = 0; my $t_first_msg = 0; my $t_last_msg = 0; my $msgs_processed = 0; my $failed_msgs = 0; my $cache_hits = 0; my $client_id = 0; my @ORIG_ARGV = @ARGV; GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug:s", "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net", "progress!", "rewrite:s", "showdots", "spamlog=s", "tail=i", "rules=s", "restart=i", "loguris", "run_post_scan=s", "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache", "cachedir=s", "noisy", "scanprob=f", "server=s", "cs_max=i", "cs_timeout=i", "cs_conn_retries=i", "cs_paths_only", "client=s", "cs_ssl", "cs_verbose", "cs_schedule_cache", "cs_cache", "cs_cachedir=s", "cs_max_tries=i", "before=s" => \&deal_with_before_after, "after=s" => \&deal_with_before_after, "cf=s" => \@{$opt_cf}, "pre=s" => \@{$opt_pre}, "dir" => sub { $opt_format = "dir"; }, "file" => sub { $opt_format = "file"; }, "mbox" => sub { $opt_format = "mbox"; }, "mbx" => sub { $opt_format = "mbx"; }, "help" => sub { usage(0); }, '<>' => \&target) or usage(1); # We need IO::Zlib for client-server mode! if ( ($opt_client || $opt_server) && ! HAS_IO_ZLIB ) { die "IO::Zlib required for client/server mode!\n"; } if ($opt_cs_ssl && ! HAS_IO_SOCKET_SSL ) { die "IO::Socket::SSL required for --cs_ssl!\n"; } if ($opt_noisy) { $opt_progress = 1; # implies --progress } $opt_debug ||= 'all' if defined $opt_debug; if ($opt_cs_schedule_cache) { $opt_cs_cache = 1; # implies --cs_cache } if ($opt_client && $opt_cs_cache && !$opt_cs_cachedir) { warn "You must specify a local message cache directory with --cs_cachedir\n". "when using the --cs_cache option.\n"; exit; } if ($opt_server && $opt_cs_schedule_cache && !$opt_n) { warn '*'x74 ."\n"; warn '*'. ' 'x72 ."*\n"; warn "* Corpus will be run un-sorted but with date stamp loggging which is *\n"; warn "* needed for score generation log selection but is not available when *\n"; warn "* using the -n option. This may affect the results of any bayes and *\n"; warn "* AWL tests run during this mass-check. *\n"; warn '*'. ' 'x72 ."*\n"; warn '*'x74 ."\n"; } if ($opt_cs_schedule_cache && !HAS_SDBM_FILE) { warn "--cs_schedule_cache requires the Perl module SDBM_File.\n"; exit; } # --lint if ($opt_lint) { # In theory we could probably use the same spamtest object as below, # but since it's probably not expecting that, and we don't want # strange things happening, create a local object. my $spamlint = create_spamtest(); $spamlint->debug_diagnostics(); my $res = $spamlint->lint_rules(); $spamlint->finish(); if ($res) { warn "lint: $res issues detected, ". "please rerun with debug enabled for more information\n"; exit 1; } } # test messages for the mass-check my @targets; if (!$opt_client) { if ($opt_f) { open(F, $opt_f) || die "cannot read target $opt_f: $!"; push(@targets, map { chomp; $_ } ); close(F); } usage(1) if !@targets; } if ($opt_reuse) { # if we have --reuse, don't bother testing DNS; we shouldn't be hitting # the wire at all, and in fact we may be running without a net connection push @{$opt_cf}, "dns_available yes\n"; # need to load M:SA:Plugin:Reuse push @{$opt_pre}, "loadplugin Mail::SpamAssassin::Plugin::Reuse\n"; } my $user_prefs = "$opt_p/user_prefs"; sub create_spamtest { return new Mail::SpamAssassin ({ 'debug' => $opt_debug, 'rules_filename' => $opt_c, 'site_rules_filename' => "$opt_p/local.cf", 'userprefs_filename' => $user_prefs, 'userstate_dir' => $opt_p, 'save_pattern_hits' => $opt_loghits, 'dont_copy_prefs' => 1, 'local_tests_only' => $opt_net ? 0 : 1, 'only_these_rules' => $opt_rules, 'ignore_safety_expire_timeout' => 1, 'pre_config_text' => join("\n", @{$opt_pre})."\n", 'post_config_text' => join("\n", @{$opt_cf})."\n", PREFIX => '', DEF_RULES_DIR => $opt_c, LOCAL_RULES_DIR => '', }); } my $spamtest = create_spamtest(); $spamtest->compile_now(0); # 0 since we will be reading more configs $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); $spamtest->read_scoreonly_config($user_prefs); my $who = `id -un 2>/dev/null`; my $where = `uname -n 2>/dev/null`; my $when = `date -u`; my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || $where || `hostname` || 'localhost'; chomp $who; chomp $where; chomp $when; chomp $host; $svn_revision = get_current_svn_revision(); # when displaying the commandline, quote any arguments which have # "questionable" characters such as spaces, pipes, etc. my $cmdline = join(' ',map { m@[^A-Za-z0-9_/\\.-]@ ? qq/"$_"/ : $_ } @ORIG_ARGV); $cmdline =~ s/\s+/ /gs; my $isowhen = strftime("%Y%m%dT%H%M%SZ", gmtime(time)); # better my $log_header = "# mass-check results from $who\@$where, on $when\n" . "# M:SA version ".$spamtest->Version()."\n" . "# SVN revision: $svn_revision\n" . "# Date: $isowhen\n" . "# Perl version: $] on $Config{archname}\n" . "# Switches: '$cmdline'\n"; my $updates = ($opt_noisy ? 100 : 10); my $total_count = 0; my $spam_count = 0; my $ham_count = 0; my $init_results = 0; my $showdots_active = ($opt_showdots || $opt_noisy); my $showdots_counter = 0; my $showdots_every = ($opt_showdots ? 1 : 20); my $AIopts = { 'opt_all' => $opt_all, 'opt_skip_empty_messages' => 1, }; if (!$opt_client) { # Deal with --rewrite if (defined $opt_rewrite) { my $rewrite = ($opt_rewrite ? $opt_rewrite : "/tmp/out"); open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!"; } # ArchiveIterator options for non-client mode $AIopts->{'opt_scanprob'} = $opt_scanprob; $AIopts->{'opt_cache'} = $opt_cache; $AIopts->{'opt_cachedir'} = $opt_cachedir; $AIopts->{'opt_after'} = $opt_after; $AIopts->{'opt_before'} = $opt_before; $AIopts->{'scan_progress_sub'} = \&showdots_blip; $AIopts->{'opt_want_date'} = ! $opt_n; # ensure that scanprob stuff is predictable and reproducable if (defined $opt_scanprob && $opt_scanprob < 1.0) { srand(1); } } else { # ArchiveIterator options for client mode -- tends to be simple $opt_n = 1; $AIopts->{'opt_want_date'} = 0; } ########################################################################### ## SCAN MODE my $iter = new Mail::SpamAssassin::ArchiveIterator($AIopts); # setup the AI functions if ($opt_client) { $iter->set_functions(\&wanted, \&result_client); } elsif ($opt_server) { $iter->set_functions(\&wanted_server, \&result); } else { $iter->set_functions(\&wanted, \&result); } #AFTER SETTING ALL OPTS, RUN _set_default_message_selection_opts() TO GET SANE DEFAULTS $iter->_set_default_message_selection_opts(); my $messages; # normal mode as well as a server do scan mode and get a temp file if (!$opt_client) { status('starting scan stage') if ($opt_progress); # Make a temp file and delete it my $tmpf; ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die 'mass-check: failed to create temp file' unless $tmpf; unlink $tmpf or die "mass-check: unlink '$tmpf': $!"; # having opt_j or server mode means do scan in a separate process if ($opt_server || $opt_j) { if ($tmpf = fork()) { # parent waitpid($tmpf, 0); } elsif (defined $tmpf) { # child -- process using message_array generate_queue(\@targets, $tmpfd); exit; } else { die "mass-check: cannot fork: $!"; } } else { # we get here if opt_j == 0, so scan in this process generate_queue(\@targets, $tmpfd); } # we now have a temporary file with the messages to process seek($tmpfd, 0, 0); # the first line is the number of messages $total_messages = read_line($tmpfd); if (!$total_messages) { die "mass-check: no messages to process\n"; } if ($opt_cs_schedule_cache) { # create a tied hash database for the server to use for matching against # client hashes # while we're at it, count the total number of messages for ourself # I suspect that AI gets it wrong sometimes, somehow $total_messages = 0; # get a temp file for the server database on messages to process my ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile(); close $dbfd; unlink $dbpath; push @cache_tmp_files, $dbpath.'.pag'; push @cache_tmp_files, $dbpath.'.dir'; tie %{$server_caches{'to_process'}}, "SDBM_File", $dbpath, O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!"; # get a temp file for the server DB of messages that are not cached anywhere ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile(); close $dbfd; unlink $dbpath; push @cache_tmp_files, $dbpath.'.pag'; push @cache_tmp_files, $dbpath.'.dir'; tie %{$server_caches{'not_cached'}}, "SDBM_File", $dbpath, O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!"; # get a temp file for the server database of cache hit counts per message ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile(); close $dbfd; unlink $dbpath; push @cache_tmp_files, $dbpath.'.pag'; push @cache_tmp_files, $dbpath.'.dir'; tie %{$server_caches{'cache_count'}}, "SDBM_File", $dbpath, O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!"; # dump the server's list of messages to process to the DB_HASHes while (my $msg = read_line($tmpfd)) { my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msg); $server_caches{'to_process'}{$d[3]} = $msg; $server_caches{'not_cached'}{$d[3]} = undef; $total_messages++; } # and back to the first message (although we currently never use it again) seek($tmpfd, 0, 0); read_line($tmpfd); } showdots_finish(); status("completed scan stage, $total_messages messages") if ($opt_progress); } ########################################################################### ## RUN MODE run_post_scan(); if ($opt_client) { client_mode(); } else { status('starting run stage') if ($opt_progress); if ($opt_server) { server_mode(); } else { $t_first_msg = time; run_through_messages(); $t_last_msg = time; } status('completed run stage') if ($opt_progress); } # Even though we're about to exit, let's clean up after ourselves close($tmpfd) if ($tmpfd); showdots_finish(); if (defined $opt_rewrite) { close(REWRITE); } $spamtest->finish(); if ($opt_cs_verbose) { if ($opt_client || $opt_server) { warn "network I/O: sent=$sent_bytes, received=$rcvd_bytes\n"; } $t_last_msg++ if $t_last_msg == $t_first_msg; warn "processed $msgs_processed messages (failed to scan $failed_msgs) in ".($t_last_msg - $t_first_msg). ' seconds ('.(sprintf("%.1f", ($msgs_processed / ($t_last_msg - $t_first_msg) * 3.6))). " kmsgs/hr)\n"; $msgs_processed ||= 1; warn "cache hits: $cache_hits (" .(sprintf("%.1f", ($cache_hits / $msgs_processed * 100)))."%)\n" if $opt_cs_cache; } # exit status: did we check at least one message correctly? exit(!($ham_count || $spam_count)); ########################################################################### sub target { my ($target) = @_; # message-selection options; these can now be specified separately # for each target my %selopts = ( opt_scanprob => $opt_scanprob, opt_after => $opt_after, opt_before => $opt_before ); if (!defined($opt_format)) { push(@targets, { %selopts, target => $target }); } else { $opt_o = 1; push(@targets, { %selopts, target => "spam:$opt_format:$target" }); } } ########################################################################### sub init_results { $init_results = 1; showdots_finish(); # now, showdots only happens if --showdots was used $showdots_active = $opt_showdots; if ($opt_progress) { # round up since 100% will be caught at end already $statusevery = int($total_messages / $updates + 1); # if $messages < $updates, just give a status line per msg. $statusevery ||= 1; } return if $opt_client; if ($opt_o) { autoflush STDOUT 1; print STDOUT $log_header; } else { open(HAM, "> $opt_hamlog") || die "open of $opt_hamlog failed: $!"; open(SPAM, "> $opt_spamlog") || die "open of $opt_spamlog failed: $!"; autoflush HAM 1; autoflush SPAM 1; print HAM $log_header; print SPAM $log_header; } } sub result { my ($class, $result, $time) = @_; # don't open results files until we get here to avoid overwriting files init_results() if !$init_results; if ($class eq "s") { if ($opt_o) { print STDOUT $result; } else { print SPAM $result; } $spam_count++; } elsif ($class eq "h") { if ($opt_o) { print STDOUT $result; } else { print HAM $result; } $ham_count++; } $total_count++; if ($opt_progress) { progress($time); } } sub wanted { my ($class, $id, $time, $dataref, $format) = @_; my $out = ''; # if origid is defined, it'll be the message number from server mode my $origid; # client mode is a little crazy because we need to kluge around the fact # that the information needed to do the run is different than the # information that goes into the results. if ($opt_client) { # change the format and id to the real version, make sure to remember # the server's message number # note: pop/push makes it work regardless of any changes to M::SA::AI $origid = pop @{$real{$id}}; $format = $real{$id}->[2]; $id = $real{$id}->[3]; } memory_track_start() if ($opt_logmem); $spamtest->timer_reset; # parse the message, and force it to complete my $ma = $spamtest->parse($dataref, 1); # get X-Spam-Status: header for rule hit resue my $x_spam_status; my $reusing; if ($opt_reuse) { $x_spam_status = $ma->get_header("X-Spam-Status"); $x_spam_status and $x_spam_status =~ s/,\s+/,/gs; } # remove SpamAssassin markup, if present and the mail was spam my $header = $ma->get_header("Received"); if ($header && $header =~ /\bwith SpamAssassin\b/) { if (!$opt_deencap || message_should_be_deencapped($ma)) { my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1); $ma->finish(); $ma = $new_ma; } } if ($opt_reuse) { if ($x_spam_status && $x_spam_status =~ m/tests=(\S*)/ && $x_spam_status !~ /\bshortcircuit=(?:ham|spam|default)\b/) { my @previous = split(/,/, $1); $ma->{metadata}->{reuse_tests_hit} = { map {$_ => 1} @previous }; $reusing = 1; } } # plugin hook to cause us to skip messages my $skip = $spamtest->call_plugins("mass_check_skip_message", { class => $class, 'time' => $time, 'id' => $id, msg => $ma }); if ($skip) { $ma->finish(); return; } # log-uris support my $status; my @uris; my $before; my $after; if ($opt_loguris) { my $pms = Mail::SpamAssassin::PerMsgStatus->new($spamtest, $ma); @uris = $pms->get_uri_list(); $pms->finish(); } else { $before = time; $status = $spamtest->check($ma); $after = time; } my @extra; # sample-based learning if ($opt_learn > 0) { my $spam; # spam learned as ham = 0.05% if ($class eq 's' && rand(100) < 0.05) { $spam = 0; } # ham learned as spam = 0.01% elsif ($class eq 'h' && rand(100) < 0.01) { $spam = 1; } # spam/ham learned correctly elsif (rand(100) < $opt_learn) { if ($class eq 's') { $spam = 1; } elsif ($class eq 'h') { $spam = 0; } else { die "unknown class, learning failed"; } } if (defined $spam) { my $result = ($spam ? "spam" : "ham"); my $status = $spamtest->learn($ma, undef, $spam, 0); my $learned = $status->did_learn(); $result = "undef" if !defined $learned; push(@extra, "learn=".$result); } } if (defined($time)) { push(@extra, "time=".$time); } if ($status && defined $status->{bayes_score}) { push(@extra, "bayes=".sprintf("%06f", $status->{bayes_score})); } if ($opt_mid) { my $mid = $ma->get_header("Message-Id"); if ($mid) { # message contains a Message-Id: while($mid =~ s/\([^\(\)]*\)//s) {}; # remove comments and $mid =~ s/^\s+|\s+$//sg; # leading and trailing spaces $mid =~ s/\s.*$//s; # keep only the first token } else { # it doesn't have a Message-Id: $mid = $id; # so build one from the id $mid =~ s,^.*/,,; # remove the path $mid = "<$mid\@$host.masses.spamassassin.org>"; # and put it together } $mid =~ tr/-A-Za-z0-9_!#%&=~<@>/./c; # replace dangerous chars with . (so regexp search just works) push(@extra, "mid=$mid"); } push(@extra, "scantime=" . ($after - $before)); push(@extra, "format=$format"); if ($opt_logmem) { my $mem = memory_track_finish(); if ($mem) { push(@extra, $mem); } } push(@extra, "reuse=" . ($reusing ? "yes" : "no")); # log the scoreset we're in { my $set = 0; if ($opt_net) { $set |= 1; } if ($status && defined $status->{bayes_score}) { $set |= 2; } push(@extra, "set=".$set); } if ($opt_client) { push(@extra, "host=$where"); } my $yorn; my $score; my $tests; my $extra; if ($opt_loguris) { $yorn = '.'; $score = 0; $tests = join(" ", sort @uris); $extra = ''; } else { $yorn = $status->is_spam() ? 'Y' : '.'; # don't bother adjusting scores for reuse $score = $status->get_score(); # list of tests hit my @tests; push @tests, split(/,/, $status->get_names_of_tests_hit()); push @tests, split(/,/, $status->get_names_of_subtests_hit()); $tests = join(",", sort(@tests)); $extra = join(",", @extra); } if (defined $opt_rewrite) { print REWRITE $status->rewrite_mail(); } $id =~ s/\s/_/g; # if we have an origid set, it'll be the server mode's message number, so # attach it to our result appropriately. if (defined $origid) { $out = "$origid "; } $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra); if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) { $out .= logkilled($ma, $id, "possible virus"); } if ($opt_loghits) { my $log = ''; foreach my $t (sort keys %{$status->{pattern_hits}}) { $_ = $status->{pattern_hits}->{$t}; $_ ||= ''; s/\r/\\r/gs; # fix unprintables s/\n/\\n/gs; $log .= "$t=\"$_\" "; } if ($log) { chomp $log; $out .= "# $log\n"; } } if (defined $status) { $status->finish(); } $ma->finish(); undef $ma; # clean 'em up undef $status; showdots_blip(); # print ">>>> out = $out\n"; return $out; } sub showdots_blip { return unless ($showdots_active); $showdots_counter++; if ($showdots_counter % $showdots_every == 0) { print STDERR '.'; if ($showdots_counter % (60 * $showdots_every) == 0) { print STDERR "\n"; } } } sub showdots_finish { print STDERR "\n" if ($showdots_active); $showdots_counter = 0; } # ick. We have to go grovelling through the body parts to see if a message # is a report_safe-marked-up message, because a local scanner will overwrite # any remote scanner's X-Spam-Checker-Version header. # sub message_should_be_deencapped { my ($ma) = @_; # not sure why this is undefined, but it is sometimes if (defined $ma->{body_parts} && scalar @{$ma->{body_parts}} > 0) { my $firstpart = $ma->{body_parts}->[0]; if (!$firstpart->{headers}->{'content-type'} || $firstpart->{headers}->{'content-type'} ne 'text/plain') { return 0; # not a 'report_safe' encapsulation } if (scalar @{$firstpart->{raw}} < 3) { return 0; } # too short to be a report # grab first 2 lines my $text = $firstpart->{raw}->[0] . $firstpart->{raw}->[1]; $text =~ s/\s+/ /gs; if ($text =~ /^Spam detection software, running on the system \"(\S+)\"/) { my $hname = $1; if ($hname =~ /$opt_deencap/io) { return 1; } } } return 0; # a different host marked it up. pass it through! } sub logkilled { my ($ma, $id, $reason) = @_; my $from = $ma->get_header("From") || 'undef'; my $to = $ma->get_header("To") || 'undef'; my $subj = $ma->get_header("Subject") || 'undef'; my $mid = $ma->get_header("Message-Id") || 'undef'; chomp ($from); chomp ($to); chomp ($subj); chomp ($mid); return "# skipped killfiled message ($reason): from=$from to=$to subj=$subj mid=$mid id=$id\n"; } sub progress { my ($time) = @_; $time ||= 0; # Are we at the end or otherwise at a point we should print status? Then do it. if ($total_messages == $total_count || $total_count % $statusevery == 0) { my $time = strftime("%Y-%m-%d", localtime($time)); status(sprintf("%3d%% ham: %-6d spam: %-6d date: %s", int(($total_count / $total_messages) * 100), $ham_count, $spam_count, $time)); } } sub status { my($str) = @_; my $now = strftime("%Y-%m-%d %X", localtime(time)); printf STDERR "status: %-48s now: %s\n", $str, $now; } ########################################################################### our ($mem_size, $mem_rss, $mem_shared); sub memory_track_start { if ($^O =~ /linux/i) { if (open (IN, "; close IN; if ($statm =~ /^(\d+) (\d+) (\d+) /) { $mem_size = $1; $mem_rss = $2; $mem_shared = $3; } } } } sub memory_track_finish { my $str = ''; if ($^O =~ /linux/i) { if (open (IN, "; close IN; if ($statm =~ /^(\d+) (\d+) (\d+) /) { my $size = $1; my $rss = $2; my $shared = $3; $str = sprintf ("memsz=%d,memrss=%d,memshr=%d", ($size - $mem_size), ($rss - $mem_rss), ($shared - $mem_shared)); } } } return $str; } sub get_current_svn_revision { my $revision; # this is usually "${TOPDIR}/masses" my $dir = $FindBin::Bin || "."; if (-d "$dir/.svn" || -f "$dir/svninfo.tmp") { if (-f "$dir/svninfo.tmp") { # created by build/automc/buildbot_ready for chrooted mass-checks open (SVNINFO, "< $dir/svninfo.tmp"); } else { # note, ".." since we want to pick up changes outside 'masses' # too! open (SVNINFO, "( svn info --non-interactive $dir/.. || svn info $dir/.. ) 2>&1 |"); } while () { # Revision: 383822 next unless /^Revision: (\d+)/; $revision = $1; last; } close SVNINFO; return $revision if $revision; } # this probably will never work due to Rules Project changes TODO if (open(TESTING, "$opt_c/70_testing.cf")) { chomp($revision = ); $revision =~ s/.*\$Rev:\s*(\S+).*/$1/; close(TESTING); return $revision if $revision; } return $revision || "unknown"; } ############################################################################ ## children processors, start and process, used when opt_j > 1 sub start_children { my ($count, $child, $pid, $socket) = @_; my $io = IO::Socket->new(); my $parent; # create children for (my $i = 0; $i < $count; $i++) { ($child->[$i],$parent) = $io->socketpair(AF_UNIX,SOCK_STREAM,PF_UNSPEC) or die "mass-check: socketpair failed: $!"; if ($pid->[$i] = fork) { close $parent; # disable caching for parent<->child relations my ($old) = select($child->[$i]); $|++; select($old); $socket->add($child->[$i]); aidbg "mass-check: starting new child $i (pid ".$pid->[$i].")\n"; next; } elsif (defined $pid->[$i]) { my $result; my $line; close $tmpfd if defined $tmpfd; close $child->[$i]; select($parent); $| = 1; # print to parent by default, turn off buffering send_line($parent,"START"); while ($line = read_line($parent)) { if ($line eq "exit") { close $parent; exit; } my($class, $format, $date, $where, $result) = $iter->_run_message($line); $result ||= ''; # If determine_receive_date is not set, the original input date # wasn't calculated, but run_message would have done so, so reset # the packed version if possible ... use defined for date since # it could == 0. if (!$iter->{determine_receive_date} && $class && $format && defined $date && $where) { $line = Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $where); } send_line($parent,"$result\0RESULT $line"); } exit; } else { die "mass-check: cannot fork: $!"; } } } ## handling killing off the children sub reap_children { my ($count, $socket, $pid) = @_; # If the child died, sending it the exit will generate a SIGPIPE, but we # don't really care since the readline will go undef (which is fine), # then we do the waitpid which will finish it off. So we end up in the # right state, in theory. local $SIG{'PIPE'} = 'IGNORE'; for (my $i = 0; $i < $count; $i++) { aidbg "mass-check: killing child $i (pid ",$pid->[$i],")\n"; send_line($socket->[$i],"exit"); # tell the child to die. close $socket->[$i]; waitpid($pid->[$i], 0); # wait for the signal ... } } # in server mode, this gets called to read in the HTTP request from a given # socket, then return the information the client sent to us. sub handle_http_request { my $socket = shift; my $headers = {}; my $postdata = {}; # read in the request (POST / HTTP/1.0) my $line = $socket->getline(); $line ||= ''; $rcvd_bytes += length($line); $line =~ s/\r\n$//; my ($type, $URI, $VERS) = $line =~ /^([a-zA-Z]+)\s+(\S+)(?:\s*(\S+))/; unless ($type && $URI && $VERS) { $type ||= ''; $URI ||= ''; return ($type, $URI, $headers, $postdata); } $type = uc $type; # read in headers, "key: value" up to a blank line do { $line = $socket->getline(); last unless defined $line; $rcvd_bytes += length($line); $line =~ s/\r\n$//; if ($line) { my ($k,$v) = split(/:\s*/, $line, 2); $headers->{lc $k} = $v; } } while ($line !~ /^$/); # if this is a POST request w/ content-length, there'll be a payload, deal # with it. we only support compressed payloads. my $postheader; if ($type eq 'POST' && $headers->{'content-length'}) { $rcvd_bytes += $headers->{'content-length'}; my $pd = ''; if ($headers->{'content-encoding'} eq 'x-gzip') { # assign an id to the client if it doesn't already have one $headers->{'client-id'} ||= ++$client_id; my ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die "Can't make tempfile, exiting" unless $gzpath; # TODO: don't read in the entire thing at once to avoid memory bloat my $rd; $socket->read($rd, $headers->{'content-length'}) || die "mass-check: error reading in data from client\n"; print $gzfd $rd; $gzfd->close; my $fd = IO::Zlib->new($gzpath, "rb"); die "Can't open temp result file: $!" unless $fd; if ($headers->{'action'}) { # different types of POST contents are packed different ways if ($headers->{'action'} eq 'sending cache') { if ($opt_cs_schedule_cache) { # save the client cache to a tied hash # get a temp file for the client cache database my ($dbpath, $dbfd) = Mail::SpamAssassin::Util::secure_tmpfile(); close $dbfd; unlink $dbpath; push @cache_tmp_files, $dbpath.'.pag'; push @cache_tmp_files, $dbpath.'.dir'; tie %{$client_caches{$headers->{'client-id'}}}, "SDBM_File", $dbpath, O_RDWR|O_CREAT, 0600 || die "Cannot tie hash to file $dbpath: $!\n"; # dump the client's list of cached message paths to the DB_HASH my $client_cache = 0; my $client_cache_hits = 0; while (my $msg = read_line($fd)) { my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msg); $client_cache++; if (exists $server_caches{'to_process'}{$d[3]}) { $client_caches{$headers->{'client-id'}}{$d[3]} = undef; $server_caches{'cache_count'}{$d[3]}++; delete $server_caches{'not_cached'}{$d[3]}; # log this client's cache hit rate $client_cache_hits++; } } status("client $headers->{'client-id'} has $client_cache msgs cached ($client_cache_hits usable)"); status("client $headers->{'client-id'} has ".(sprintf("%.1f", ($client_cache_hits / $total_messages * 100))).'% of required messages'); } } elsif ($headers->{'action'} eq 'sending results') { # process the results $pd = read_line($fd); # key1=value1&key2=value2... %{$postdata} = map { my($k,$v) = split(/=/, $_, 2); # we need to decode the key and value $k =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg; $v =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg; $k => $v; } split(/\&/, $pd); } } $fd->close; undef $fd; $gzfd->close; unlink $gzpath; } } return($type, $URI, $headers, $postdata); } # in server mode, generate a gzip compressed data stream with the messages and # return the path to the compressed file which the server will read and pass # to the client. # # Input: # - Number of messages to generate (scalar) # - Hash of Arrays of outstanding requests (reference to hash of array refs) # timestamp# -> [ num1, num2, ... ] # Used to quickly find outstanding/timed out messages to send to client. # - Hash of outstanding messages and associated data (ref to hash of hash refs) # num1 -> { data => 'binary data from scan mode', timestamp => timestamp# } # Used later on to specify the timestamp entry to remove the entry from. # - Paths only? If true, just include the original message data in the gzip # file. Otherwise, include the message data. Useful if the client has the # corpus available via the same paths as originally specified. # # Returns: scalar path to gzip file # sub generate_messages { my($msgs, $timestamps, $msgsout, $paths_only, $client_id) = @_; # Hold the message numbers we'll be sending out my @tosend = (); my @sent = (); # Find out if any of the messages we sent out before need to be sent out # again because we haven't seen a response within the timeout. my $tooold = time - $opt_cs_timeout; foreach (sort { $a <=> $b } keys %{$timestamps}) { # since we're going in numeric order, if the current entry is newer than # the timeout value, the rest will be too, so stop looking. last if ($_ > $tooold); # messages that might be eligible for retry my @toretry = (); # how many messages do we still need to fulfill the request? my $wanted = $msgs - @tosend; if (@{$timestamps->{$_}} > $wanted) { # there are more entries in the timestamp list than we want, so just # grab that many off the list. push(@toretry, splice @{$timestamps->{$_}}, 0, $wanted); } else { # there are just enough, or not enough entries on the timestamp list to # satisfy our request, so take them all and we'll loop around. push(@toretry, @{$timestamps->{$_}}); delete $timestamps->{$_}; } # limit retries foreach my $num (@toretry) { if ($msgsout->{$num}->{'count'} < $opt_cs_max_tries) { $msgsout->{$num}->{'count'}++; push @tosend, $num; } else { warn "skipping message num $num after $opt_cs_max_tries attempts, index: ". (Mail::SpamAssassin::ArchiveIterator::_index_unpack($msgsout->{$num}->{'data'}))[3]."\n"; delete $msgsout->{$num}; $failed_msgs++; } } # Ok, we have enough messages so we can stop now. last if (@tosend == $msgs); } if (!$opt_cs_schedule_cache) { # if we still have the temp file with the input messages open, we'll fillup # out message output queue with messages from there. if ($tmpfd) { while (@tosend < $msgs) { my $msg = read_line($tmpfd); # no more messages from the temp file, close it out unless ($msg) { delete $msgsout->{'curnum'}; close $tmpfd; undef $tmpfd; last; } # we got a result, so assign it a number (curnum) and store the data # appropriately, then add the new number to the queue. my $num = ++$msgsout->{'curnum'}; $msgsout->{$num}->{'data'} = $msg; $msgsout->{$num}->{'count'}++; push(@tosend, $num); } } } else { if (($msgs_processed + $failed_msgs) != $total_messages) { # select messages based on what the client*s* have cached # if the client hasn't sent cache data, fake it unless (exists $client_caches{$client_id}) { %{$client_caches{$client_id}} = (); $unique_cache_completed{$client_id} = 1; } # first: select messages that only the current client has cached MESSAGE: while (!$unique_cache_completed{$client_id} && @tosend < $msgs && (my($path, undef) = each %{$client_caches{$client_id}})) { # check that no other clients have it cached next MESSAGE if $server_caches{'cache_count'}{$path} > 1; # we got a result, so assign it a number (curnum) and store the data # appropriately, then add the new number to the queue. my $num = ++$msgsout->{'curnum'}; $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path}; $msgsout->{$num}->{'count'}++; push(@tosend, [ $num, 1 ]); delete $server_caches{'to_process'}{$path}; delete $server_caches{'cache_count'}{$path}; delete $client_caches{$client_id}{$path}; $cache_hits++; } if (@tosend < $msgs) { $unique_cache_completed{$client_id} = 1; } # second: hand out messages that no clients have cached while (@tosend < $msgs && (my($path, undef) = each %{$server_caches{'not_cached'}})) { # we got a result, so assign it a number (curnum) and store the data # appropriately, then add the new number to the queue. my $num = ++$msgsout->{'curnum'}; $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path}; $msgsout->{$num}->{'count'}++; push(@tosend, [ $num, 0 ]); delete $server_caches{'to_process'}{$path}; delete $server_caches{'cache_count'}{$path}; delete $server_caches{'not_cached'}{$path}; } # third: hand out messages in the client's cache regardless of how many # other clients have them cached; smart scheduling takes too long while (@tosend < $msgs && (my($path, undef) = each %{$client_caches{$client_id}})) { # we got a result, so assign it a number (curnum) and store the data # appropriately, then add the new number to the queue. my $num = ++$msgsout->{'curnum'}; $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path}; $msgsout->{$num}->{'count'}++; push(@tosend, [ $num, 1 ]); delete $server_caches{'to_process'}{$path}; delete $server_caches{'cache_count'}{$path}; $cache_hits++; # testing with 5 clients, this seems to be faster (in msgs/hr) than # deleting from each other clients' cache when we go to get their # messages; of course it's more I/O on the server this way my $cache_count = $server_caches{'cache_count'}{$path}; foreach my $cc (keys %client_caches) { delete $client_caches{$cc}{$path}; last unless --$cache_count; } } # fourth: hand out messages that other clients have cached without # regard for how many clients have cached them; smart # scheduling takes too long (at least when using DBM) while (@tosend < $msgs && (my($path, $msg) = each %{$server_caches{'to_process'}})) { # we got a result, so assign it a number (curnum) and store the data # appropriately, then add the new number to the queue. my $num = ++$msgsout->{'curnum'}; $msgsout->{$num}->{'data'} = $server_caches{'to_process'}{$path}; $msgsout->{$num}->{'count'}++; push(@tosend, [ $num, 0 ]); delete $server_caches{'to_process'}{$path}; delete $server_caches{'cache_count'}{$path}; my $cache_count = $server_caches{'cache_count'}{$path}; foreach my $cc (keys %client_caches) { delete $client_caches{$cc}{$path}; last unless --$cache_count; } } } else { # close the tmpfd, etc so that the main loop knows we're done delete $msgsout->{'curnum'}; close $tmpfd; undef $tmpfd; } } # ok, at this point, @tosend ought to have a list of numbers, pointers into # %{$msgsout}. turn that into a tar file. return '' unless @tosend; my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die "Can't make tempfile, exiting" unless $gzpath; close($gzfd); $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!"; # first line is the number of messages included in the file send_line($gzfd, scalar @tosend) || die "mass-check: error when writing to gz temp file\n"; # Generate an archive in the temp file foreach my $num (@tosend) { my $in_cache = 0; if (ref($num) eq 'ARRAY') { $in_cache = $num->[1]; $num = $num->[0]; } # Archive format, gzip compressed file w/ 4 parts per message: # 1- server message number in text format # 2- server index string, binary packed format # 3- a 1 if the message is included, 0 otherwise -- unless paths_only # 4- message content -- unless paths_only or #3 is 0 my $data = $msgsout->{$num}->{'data'}; if (!$paths_only) { unless ($in_cache) { my $msg = ($iter->_run_message($data))[4]; if ($msg) { send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n"; send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n"; send_line($gzfd, '1') || die "mass-check: error when writing to gz temp file\n"; send_line($gzfd, (defined $msg ? join('', @{$msg}) : '')) || die "mass-check: error when writing to gz temp file\n"; push @sent, $num; } else { # if the message has an error (probably due to it being removed sometime during the # run) we send a message number of 0 to indicate to the client that the server had an # error retrieving the message; we need to do this since we have already added the # line telling the client how many messages to expect my $filename = (Mail::SpamAssassin::ArchiveIterator::_index_unpack($data))[3]; warn qq/mass-check: ArchiveIterator returned error for "$filename"\n/; send_line($gzfd, 0) || die "mass-check: error when writing to gz temp file\n"; delete $msgsout->{$num}; $failed_msgs++; } } else { send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n"; send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n"; send_line($gzfd, '0') || die "mass-check: error when writing to gz temp file\n"; push @sent, $num; } } else { send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n"; send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n"; # the client deals with missing messages on its own (sort of) push @sent, $num; } } $gzfd->close; # update timestamp entries my $ts = time; # make sure the timestamp is unique! without Time::HiRes it is trivial and # common to have two reissueings of timedout messages in the same second # with Time::HiRes we'll check anyway since we'll waste less time checking # for uniqueness than one of us will waste debugging a report of mass-check # not completing due to someone's wacky clock (some (non-para-)virtualized # servers will have their clocks go forward and backward) while (exists $timestamps->{$ts}) { if (HAS_TIME_HI_RES) { $ts = Time::HiRes::time(); } else { $ts += 0.01; } } foreach (@sent) { $msgsout->{$_}->{'timestamp'} = $ts; } # conveniently, this list should be the only thing sent out w/ this # timestamp, so just set the reference appropriately. :) $timestamps->{$ts} = \@sent; if ($opt_noisy) { status('sent '.scalar(@sent).' of '.scalar(@tosend).' intended messages'); } return $gzpath; } # we've gotten results posted, so clean up msgsout and timestamp hashes and # process result... sub handle_post_results { my($postdata, $timestamps, $msgsout) = @_; # local version to batch the removals my %timestamps = (); # $msgsout->{num}->{data|timestamp} # $timestamp{num} = [ msgout_nums ... ] # $postdata{num} = result_string while( my($k,$v) = each %{$postdata} ) { # message run results will be \d+ => log entry next if ($k !~ /^\d+$/); # if we've been waiting for this result, process it, otherwise throw it on # the ground. multiple clients could have been given the same messages to # process, and we take whatever the first responder sends us. if (exists $msgsout->{$k}) { # the result_sub will need parts of the message data, so get it ready my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msgsout->{$k}->{'data'}); # go ahead and do the result &{$iter->{result_sub}}($d[1], $v, $d[0]); # prep to get rid of the cached entries $timestamps{$msgsout->{$k}->{'timestamp'}}->{$k} = 1; delete $msgsout->{$k}; $msgs_processed++; } } # if we got any results, clean out the results from the timestamp arrays while ( my($k,$v) = each %timestamps ) { # trim out the result list from the timestamp sent list my @temp = grep(!exists $v->{$_}, @{$timestamps->{$k}}); # if there are results left for a specific timestamp, update the array # pointer. otherwise, delete the timestamp entry since it's empty. if (@temp) { $timestamps->{$k} = \@temp; } else { delete $timestamps->{$k}; } } } # This function reads from $tmpfd and processes the message as appropriate wrt # $opt_j, $opt_restart, etc. # sub run_through_messages { # do everything in one process if ($opt_j <= 1 && !defined $opt_restart) { my $message; my $messages; my $total_count = 0; while (($total_messages > $total_count) && ($message = read_line($tmpfd))) { my($class, undef, $date, undef, $result) = $iter->_run_message($message); if ($result) { &{$iter->{result_sub}}($class, $result, $date); } $total_count++; } $msgs_processed += $total_count; } # more than one process or one process with restarts else { my $select = IO::Select->new(); my $total_count = 0; my $needs_restart = 0; my @child = (); my @pid = (); my $messages; # start children processes start_children($opt_j, \@child, \@pid, $select); # feed childen, make them work for it, repeat while ($select->count()) { foreach my $socket ($select->can_read()) { my $line = read_line($socket); # some error happened during the read! if (!defined $line) { $needs_restart = 1; warn "mass-check: readline failed, attempting to recover\n"; $select->remove($socket); } elsif ($line =~ /^([^\0]*)\0RESULT (.+)$/s) { my $result = $1; my ($date,$class,$type) = Mail::SpamAssassin::ArchiveIterator::_index_unpack($2); aidbg "mass-check: $class, $type, $date\n"; if (defined $opt_restart && ($total_count % $opt_restart) == 0) { $needs_restart = 1; } # if messages remain, and we don't need to restart, send message if (($total_messages > $total_count) && !$needs_restart) { my $line = read_line($tmpfd); unless ($line) { warn "mass-check: found short message list ($total_messages, $total_count)\n"; $select->remove($socket); next; } send_line($socket, $line); $total_count++; aidbg "mass-check: $total_messages $total_count\n"; } else { # stop listening on this child since we're done with it aidbg "mass-check: $needs_restart $total_messages $total_count\n"; $select->remove($socket); } # deal with the result we received if ($result) { &{$iter->{result_sub}}($class, $result, $date); } } elsif ($line eq "START") { if ($total_messages > $total_count) { # we still have messages, send one to child send_line($socket, read_line($tmpfd)); $total_count++; aidbg "mass-check: $total_messages $total_count\n"; } else { # no more messages, so stop listening on this child aidbg "mass-check: $needs_restart $total_messages $total_count\n"; $select->remove($socket); } } else { $needs_restart = 1; warn "mass-check: bad line from readline: $line\n"; $select->remove($socket); } } aidbg "mass-check: out of loop, $total_messages $total_count $needs_restart ".$select->count()."\n"; # If there are still messages to process, and we need to restart # the children, and all of the children are idle, let's go ahead. if ($needs_restart && $select->count == 0 && $total_messages > $total_count) { $needs_restart = 0; aidbg "mass-check: needs restart, $total_messages total, $total_count done\n"; reap_children($opt_j, \@child, \@pid); @child=(); @pid=(); start_children($opt_j, \@child, \@pid, $select); } } $msgs_processed += $total_count; # reap children reap_children($opt_j, \@child, \@pid); } } # send an HTTP response to a socket based on the input result, headers, and # data values. sub http_response { my($socket, $result, $headers, $data) = @_; $headers->{'Content-Length'} = length $data; $headers->{'Cache-Usage'} = $opt_cs_cache ? 'allowed' : 'disallowed'; print $socket "HTTP/1.0 $result\r\n", "Pragma: no-cache\r\n", "Server: mass-check/$svn_revision\r\n", (map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}), "\r\n"; print $socket $data; if ($opt_cs_verbose) { $sent_bytes += length( "HTTP/1.0 $result\r\n". "Pragma: no-cache\r\n". "Server: mass-check/$svn_revision\r\n"); # length of a map output isn't what the other end claims to see while (my($f,$v) = each %{$headers}) { $sent_bytes += length($f) + length($v) + 4; } $sent_bytes += 2 + length($data); } } # the client needs to make a request to the server on a given socket. sub http_make_request { my($socket, $type, $uri, $headers, $data) = @_; $headers->{'Content-Length'} = length $data; print $socket "$type $uri HTTP/1.0\r\n", "User-Agent: mass-check/$svn_revision\r\n", (map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}), "\r\n"; print $socket $data; if ($opt_cs_verbose) { $sent_bytes += length( "$type $uri HTTP/1.0\r\n". "User-Agent: mass-check/$svn_revision\r\n"); # length of a map output isn't what the other end claims to see while (my($f,$v) = each %{$headers}) { $sent_bytes += length($f) + length($v) + 4; } $sent_bytes += 2 + length($data); } # parse the response that the server sends us my $line = $socket->getline() || ''; $rcvd_bytes += length($line); my(undef, $code, $string) = split(/\s+/, $line, 3); return unless $code == 200; my %headers = (); do { $line = $socket->getline(); last unless defined $line; $rcvd_bytes += length($line); $line =~ s/\r\n$//; if ($line) { my ($k,$v) = split(/:\s*/, $line, 2); $headers{lc $k} = $v; } } while ($line !~ /^$/); $rcvd_bytes += $headers{'content-length'} if $headers{'content-length'}; $client_id ||= $headers{'client-id'} if $headers{'client-id'}; # disable cache-usage if the server disallows cache usage $opt_cs_cache = 0 if $headers{'cache-usage'} eq 'disallowed'; # the server has sent us notification that it's going to exit, so let's # follow suit. return 'finished' if ($headers{'finished'}); my $gzpath = ''; if ($headers{'content-type'} eq 'application/x-gzip') { my $gzfd; ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die "Can't make tempfile, exiting" unless $gzpath; my $rd; $socket->read($rd, $headers{'content-length'}) || die "mass-check: error reading in data from server\n"; print $gzfd $rd; close $gzfd; } $socket->close(); return $gzpath; } # Be conservative -- encode most things. # we could encode spaces to plusses, then decode that later, but... sub post_encode { my $string = shift; $string =~ s/([^a-zA-Z0-9_,.\/\\-])/sprintf "%%%02x",unpack("C",$1)/egx; return $string; } # remove all of the files in a given directory, non-recursive sub clean_dir { my $dir = shift; unless (opendir(DIR, $dir)) { warn "error: can't opendir $dir: $!\n"; return; } while(my $file = readdir(DIR)) { $file =~ /^(.+)$/; # untaint $file = $1; my $path = File::Spec->catfile($dir, $file); next unless (-f $path); if (!unlink $path) { warn "error: can't remove file $path: $!\n"; closedir(DIR); return; } } closedir(DIR); return 1; } ############################################################################ # four bytes in network/vax format (little endian) as length of message # the rest is the actual message sub read_line { my $fd = shift; my($length,$msg); # read in the 4 byte length and unpack $fd->read($length, 4) || return; $length = unpack("V", $length); return unless $length; # read in the rest of the single message $fd->read($msg, $length) || return; return $msg; } sub send_line { my $fd = shift; foreach ( @_ ) { my $length = pack("V", length $_); $fd->print($length.$_) || return 0; } return 1; } ############################################################################ # this is the function that implemented server mode. basically, sit and wait # for connections to come in. when a client sends in a request, deal with any # results that the client sent, then generate a response and send it back, # and then go back to waiting. lather, rinse, repeat. sub server_mode { $opt_cs_max ||= 1000; $opt_cs_timeout ||= 60 * 5; $opt_cs_max_tries ||= 3; # IO::Socket::SSL isn't that smart $opt_server =~ s/:(\d+)$//; my $port = $1; # ::SSL needs resolved hostnames, at least on Solaris. note: not IPv6-aware my $localaddr = (gethostbyname($opt_server))[4] || $opt_server; $localaddr = Socket::inet_ntoa($localaddr); my $serv_socket; if ($opt_cs_ssl) { $serv_socket = IO::Socket::SSL->new( LocalAddr => $localaddr, LocalPort => $port, ReuseAddr => 1, Listen => 5, SSL_verify_mode => 0x02, SSL_use_cert => 1, SSL_version => "TLSv1", SSL_key_file => "spamassassin/server-key.pem", SSL_cert_file => "spamassassin/server-cert.pem", ); } else { $serv_socket = IO::Socket::INET->new( LocalAddr => $localaddr, LocalPort => $port, ReuseAddr => 1, Listen => 5, ); } die "Could not create socket: $!\n" unless $serv_socket; if ($opt_progress) { status('server ready for connections'); } # Setup out "what messages have been sent out" hashes my $timestamps = {}; my $msgsout = { 'curnum' => 0 }; # Generate an IO::Select object and put the server socket on the queue my $select = IO::Select->new( $serv_socket ); # We'll keep looping while there's something to pay attention to while ($select->count()) { # Sit and block until there's something for us to read from foreach my $socket ($select->can_read()) { if ($socket == $serv_socket) { # it's the server socket, go ahead and accept the connection and add # it to the queue. $select->add($serv_socket->accept); } else { # it's some client, so deal with the request my($type, $URI, $headers, $postdata) = handle_http_request($socket); # we don't do GET, so just send something back if ($type eq 'GET') { if ($opt_noisy) { status('GET request from '.$socket->peerhost); } http_response($socket, "200 OK", { 'Content-type' => 'text/plain', }, "Your GET request came from IP Address: ".$socket->peerhost."\n"); } elsif ($type eq 'POST') { # ooh, POST. deal with any results that the client sent handle_post_results($postdata, $timestamps, $msgsout); if ($opt_noisy) { status('POST request from '.$socket->peerhost); } # based on the number of messages that the client requested, # generate a gzip file with the appropriate data in it my $messages = ''; if ($postdata->{'max_messages'}) { my $msgnum = $postdata->{'max_messages'}; if ($msgnum > $opt_cs_max || $msgnum < 1) { $msgnum = $opt_cs_max; } if ($opt_noisy) { status('client requested '.$postdata->{'max_messages'}.' messages'); } $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'}, $headers->{'client-id'}); } # $messages will contain the path to the gzip file if there are # messages to send out. if ($messages && open(MSG, $messages)) { $t_first_msg = time unless $t_first_msg; binmode(MSG); local $/ = undef; # go go slurp mode # send the response http_response($socket, "200 OK", { 'Content-Type' => 'application/x-gzip', 'Content-Encoding' => 'x-gzip', 'Client-ID' => $headers->{'client-id'}, }, scalar ); close(MSG); # we don't need the file anymore, so get rid of it unlink $messages; } elsif (!keys %{$msgsout} && !defined $tmpfd) { # we have no more outstanding messages and our original queue of # messages to process is empty, so tell the client to exit. http_response($socket, "200 OK", { "Content-type" => "text/plain", 'Client-ID' => $headers->{'client-id'}, "Finished" => 1, }, 'We are all done'); $t_last_msg = time; } else { # when in doubt, treat this like a GET http_response($socket, "200 OK", { "Content-type" => "text/plain", 'Client-ID' => $headers->{'client-id'}, }, "Your POST request (sans max_messages) came from IP Address: ".$socket->peerhost."\n"); } } else { # for error, "501 Not Implemented" http_response($socket, '501 Not Implemented', {}, ''); } # ok, we don't do keepalive, so get rid of the socket $select->remove($socket); $socket->close; } } if ($opt_noisy) { status((exists $msgsout->{'curnum'} ? (scalar(keys %{$msgsout})-1) : scalar(keys %{$msgsout})).' messages outstanding'); } #print "msgs waiting: ".join(" ", keys %{$msgsout})."\n"; #print "tmpfd defined? ".(defined $tmpfd ? "yes" : "no")."\n"; # we're not awaiting responses and we've exhausted the input file, so # drop the server socket. :) $select->remove($serv_socket) if (!keys %{$msgsout} && !defined $tmpfd); } # remove the server and client message cache temp files foreach my $tied_hash (keys %server_caches) { undef $tied_hash; } foreach my $tied_hash (keys %client_caches) { undef $tied_hash; } foreach my $tmp_file (@cache_tmp_files) { unlink $tmp_file; } } # this is the function that implements client mode. generally, in a loop: # make a request of the server for some max number of messages, and send our # results back at the same time. based on the results of that request, put # messages into a temp dir and process them. prep the results and loop. # lather, rinse, repeat. sub client_mode { $opt_cs_max ||= 1000; $opt_cs_timeout ||= 60 * 2; $opt_cs_conn_retries ||= 60; # 1 hour my($host, $port, $uri); if ($opt_client =~ /^http:\/\/([^\/]+)(\/.*)?/) { ($host, $uri) = ($1,$2); } else { $host = $opt_client; if ($host =~ /^:/) { $host = 'localhost'.$host; } } ($host, $port) = split(/:/, $host); die "No host found in opt_client" unless $host; $uri ||= "/"; # ::SSL needs resolved hostnames, at least on Solaris. note: not IPv6-aware $host = Socket::inet_ntoa(Socket::inet_aton($host)); # use this to track how many messages we ought to be requesting # start at 100 to get warmed up my $msgnum = $opt_cs_max > 100 ? 100 : $opt_cs_max; my $tmpdir; # if we're not doing paths_only, create a temp dir where we'll put the # incoming messages to process. if (!$opt_cs_paths_only) { $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir(); die "Can't create tempdir" unless $tmpdir; } my $made_conn_once = 0; # keep going until something stops us. while (1) { # send cache data if this is the first connect # do this before creating the connection so we don't waste all the other # clients' time, once we've connected, by monopolizing the single server my ($gzpath, $gzfd, $action); if (!$made_conn_once && $opt_cs_schedule_cache) { # we need to find out what messages we have in our client cache so that we # can tell the server what we've got so that it can optimize our cache hit # rate by requesting us to scan messages that are already in our cache $action = 'sending cache'; $gzpath = scan_client_cache(); } else { $action = 'sending results'; # if the number of messages to request is too much, bring it down $msgnum = $opt_cs_max if ($msgnum > $opt_cs_max); # prep the POST request $postdata{'max_messages'} = $msgnum; $postdata{'paths_only'} = 1 if ($opt_cs_paths_only); # compress the results ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die "Can't make tempfile, exiting" unless $gzpath; $gzfd->close; $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!"; # the actual POST data string send_line($gzfd, join('&', map { post_encode($_) . '=' . post_encode($postdata{$_}) } keys %postdata)) || die "mass-check: error when writing to gz temp file\n"; $gzfd->close; %postdata = (); } # connect to server my $socket; if ($opt_cs_ssl) { $socket = IO::Socket::SSL->new( PeerAddr => $host, PeerPort => $port, SSL_version => "TLSv1", SSL_use_cert => 1, SSL_key_file => "spamassassin/client-key.pem", SSL_cert_file => "spamassassin/client-cert.pem", ); } else { $socket = IO::Socket::INET->new( PeerAddr => $host, PeerPort => $port ); } if (!$socket) { unlink $gzpath; undef $gzfd; # if we haven't yet made a connection, keep retrying; # this is probably the server still in "scan stage" if (!$made_conn_once) { if ($opt_cs_conn_retries-- > 0) { status('failed to connect, sleeping for retry') if ($opt_noisy); sleep 60; next; } else { status('failed to connect, giving up') if ($opt_noisy); last; } } # last if connection fails after scanning something last; } $made_conn_once = 1; status("requesting $msgnum messages from server") if ($opt_noisy); # make request, include and then drop results if there are any my $POSTDATA = ''; if ($gzpath) { if (open(RESULTS, $gzpath)) { binmode(RESULTS); { # slurp here, rather than into an anonymous variable in the # http_make_request call so that we don't end up slurping the # response from the server too local $/ = undef; # slurp $POSTDATA = scalar ; } close(RESULTS); unlink $gzpath; undef $gzfd; } else { die "Can't open tempfile, exiting" unless $gzpath; } } my $result = http_make_request($socket, 'POST', $uri, { 'Host' => $host, 'Content-Type' => 'application/x-www-form-urlencoded', 'Content-Encoding' => 'x-gzip', 'Action' => $action, 'Client-ID' => ($client_id || '') }, $POSTDATA ); undef $POSTDATA; $t_last_msg = time; # If we received messages to run through, go ahead and do it. # otherwise, just sleep for the timeout length and try again if (!defined $result) { # we got an error?!? abort! last; } elsif ($result eq 'finished') { # the server said that we're done status('server has no more work, exiting.') if ($opt_noisy); last; } elsif ($result eq '') { # no messages means the server may give us more work down the road. # sleep for client_timeout seconds and try the request again status("Received no messages, waiting $opt_cs_timeout seconds") if ($opt_noisy); sleep $opt_cs_timeout; } else { # we got messages, so deal with them. my $time_start = time; $t_first_msg = $time_start unless $t_first_msg; # postdata will hold our results, real will hold the original message # data from the server's scan mode. %postdata = (); %real = (); $init_results = $total_count = $spam_count = $ham_count = 0; # we got a result, so do things with it! my $gzfd = IO::Zlib->new($result, "rb"); die "Can't open temp result file: $!" unless $gzfd; # used for the temp queue file my $tmppath; ($tmppath, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die "Can't make tempfile, exiting" unless $tmppath; unlink $tmppath; # if we have a temp directory, clean it out for this run clean_dir($tmpdir) if ($tmpdir); # Archive format, gzip compressed file w/ 4 parts per message: # 1- server message number in text format # 2- server index string, binary packed format # 3- a 1 if the message is included, 0 otherwise -- unless paths_only # 4- message content -- unless paths_only or #3 is 0 # number of messages $msgnum = $total_messages = read_line($gzfd) || die "mass-check: error reading from gzip message file\n"; status("server says it gave us $total_messages messages") if ($opt_progress); my $actual_messages = 0; # loop through and prep all of the messages the server sent for(my $i = 0 ; $i < $total_messages; $i++ ) { my $num = read_line($gzfd); last unless defined $num; next unless $num; # a message number of 0 indicates a msg-error on the server my $index = read_line($gzfd); last unless defined $index; my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($index); # if we're doing paths_only, there'll be no message content if (!$opt_cs_paths_only) { my $msg_included = read_line($gzfd); last unless defined $msg_included; my $msg; if ($msg_included == 1) { $msg = read_line($gzfd); last unless defined $msg; } # permanently cache the message on the client my $cache_success = 0; if ($opt_cs_cache) { unless (-f "$opt_cs_cachedir/$d[3]") { umask 0077; # prevent others on the system from reading corpus messages # TODO: the entire mass-check script should probably use umask 0077; mkdir $opt_cs_cachedir; # mkdir won't bitch if it already exists, that's ok if (-d $opt_cs_cachedir && $d[3] =~ m{^/?(.*(?![^\\]\\)/)?(.+)$}) { my ($path, $filename) = ($1, $2); my $dir = ''; $path ||= ''; while ($path =~ m#((?![^\\]\\)[^/]+/)#gc) { $dir .= $1; mkdir "$opt_cs_cachedir/$dir"; } if (-d "$opt_cs_cachedir/$dir") { if (open(OUT, ">$opt_cs_cachedir/$dir$filename")) { print OUT $msg; close $msg; $cache_success = 1; } else { warn "Can't create/write message cache file $opt_cs_cachedir/$dir$filename: $!"; } } } } else { $cache_success = 1; $cache_hits++; } if ($cache_success) { # this is a little tricky -- we need to process the files in the # path and format we've created, but the original data is needed # to create a proper result later, so deal with that here. $real{"$opt_cs_cachedir/$d[3]"} = \@d; push @{$real{"$opt_cs_cachedir/$d[3]"}}, $num; send_line($tmpfd, Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$opt_cs_cachedir/$d[3]")) || die "mass-check: error writing out temp file in client mode\n"; $actual_messages++; } } # if the cache failed try a temp file instead if (!$opt_cs_cache || !$cache_success) { next unless defined $msg; # it's going to be a dir of file formatted messages if (open(OUT, ">$tmpdir/$num")) { print OUT $msg; close(OUT); # this is a little tricky -- we need to process the files in the # path and format we've created, but the original data is needed # to create a proper result later, so deal with that here. $real{"$tmpdir/$num"} = \@d; push @{$real{"$tmpdir/$num"}}, $num; send_line($tmpfd, Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$tmpdir/$num")) || die "mass-check: error writing out temp file in client mode\n"; $actual_messages++; } else { warn "Can't create/write message temp file $tmpdir/$num: $!"; } } } # paths_only_mode else { # permanently cache the message on the client my $cache_success = 0; if ($opt_cs_cache) { unless (-f "$opt_cs_cachedir/$d[3]") { umask 0077; # prevent others on the system from reading corpus messages # TODO: the entire mass-check script should probably use umask 0077; mkdir $opt_cs_cachedir; # mkdir won't bitch if it already exists, that's ok if (-d $opt_cs_cachedir && $d[3] =~ m{^/?(.*(?![^\\]\\)/)?(.+)$}) { my ($path, $filename) = ($1, $2); my $dir = ''; $path ||= ''; while ($path =~ m#((?![^\\]\\)[^/]+/)#gc) { $dir .= $1; mkdir "$opt_cs_cachedir/$dir"; } if (-d "$opt_cs_cachedir/$dir") { # copy the file to the local cache for use next (and this) time if (copy($d[3], "$opt_cs_cachedir/$d[3]")) { $cache_success = 1; } } } } else { $cache_success = 1; $cache_hits++; } if ($cache_success) { # this is a little tricky -- we need to process the files in the # path and format we've created, but the original data is needed # to create a proper result later, so deal with that here. $real{"$opt_cs_cachedir/$d[3]"} = \@d; push @{$real{"$opt_cs_cachedir/$d[3]"}}, $num; send_line($tmpfd, Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$opt_cs_cachedir/$d[3]")) || die "mass-check: error writing out temp file in client mode\n"; $actual_messages++; } } # if the messages isn't in our cache use the supplied path instead if (!$opt_cs_cache || !$cache_success) { # in paths_only mode, there's no kluging between formats since we're # reading the same corpus, however we do still need to track server # message number to message data so our results will be useable. $real{$d[3]} = \@d; push @{$real{$d[3]}}, $num; send_line($tmpfd, $index) || die "mass-check: error writing out temp file in client mode\n"; $actual_messages++; } } } $gzfd->close; unlink $result; # if the server tries to give us messages, but some are errored, # we need to know that so we don't try to process them all. if ($total_messages != $actual_messages) { status("server actually gave us $actual_messages messages") if ($opt_progress); $total_messages = $actual_messages; } if ($opt_progress) { status('starting run stage'); } # we're about to start running, so go back to the start of the file seek $tmpfd, 0, 0; run_through_messages(); # we're done with the temp file -- bye bye close($tmpfd); # figure out new max messages, try keeping ~cs_timeout between runs my $time_end = time; # if we only requested a small number of messages, it may take <1s to # run through them, so fake it and say it took 1s. if ($time_end == $time_start) { $time_end++; } if ($opt_progress) { status('completed run stage'); } status('completed run in '.($time_end-$time_start).' seconds') if ($opt_noisy); $msgnum = int($msgnum * $opt_cs_timeout / ($time_end-$time_start)) || 1; } } # if we were using a temp dir, clean it out and then remove it if ($tmpdir) { clean_dir($tmpdir); rmdir $tmpdir; } } # scan the client's cache and return a path to a gzip archive of AI output sub scan_client_cache { status('starting cache scan stage') if ($opt_progress); # make sure the cachedir exists so that AI doesn't bail out mkdir $opt_cs_cachedir unless -e $opt_cs_cachedir; # Make a temp file and delete it my $tmpf; ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die 'mass-check: failed to create temp file' unless $tmpf; unlink $tmpf or die "mass-check: unlink '$tmpf': $!"; my @targets = ("h:dir:$opt_cs_cachedir"); generate_queue(\@targets, $tmpfd); # we now have a temporary file with the messages to process seek($tmpfd, 0, 0); # the first line is the number of messages $total_messages = read_line($tmpfd); showdots_finish(); status("completed cache scan stage, $total_messages messages") if ($opt_progress); unless ($tmpfd) { status('cache scan failed'); return; } status('compressing cache data') if ($opt_progress); # create a temp file for compression my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); die "Can't make tempfile, exiting" unless $gzpath; $gzfd->close; $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!"; while (my $msg = read_line($tmpfd)) { $msg =~ s/\r?\n$//; my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msg); $d[3] =~ s/^$opt_cs_cachedir//; send_line($gzfd, Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], $d[2], $d[3])) || die "mass-check: error writing out temp file in client mode\n"; } close $tmpfd; undef $tmpfd; $gzfd->close; status('cache data compressed to '.(-s $gzpath).' bytes') if ($opt_progress); return $gzpath; } ############################################################################ # in server mode, just return the ref to the message data sub wanted_server { my ($class, $id, $time, $dataref, $format) = @_; return $dataref; } # very similar to result() except the result has the message number at the # front, so strip it off and then set the POST data appropriately. sub result_client { my ($class, $result, $time) = @_; # don't open results files until we get here to avoid overwriting files init_results() if !$init_results; if ($class eq "s") { $spam_count++; } elsif ($class eq "h") { $ham_count++; } $total_count++; if ($opt_progress) { progress($time); } if ($result =~ s/^(\d+)\s+//m) { $postdata{$1} = $result; } else { warn ">> WTH!? result is not in the correct format: $result\n"; # 20071114: bit of a hack # prevent malformed cs_client message result lines from preventing the # cs_server running in cs_schedule cache mode from completing # TODO: find out how result lines get malformed (malformed result lines # will still hang any cs_server not running with cs_schedule_cache) $failed_msgs++; } } sub aidbg { if (would_log("dbg", "mass-check") == 2) { dbg (@_); } } sub deal_with_before_after { my($which, $time) = @_; if ($time && $time =~ /^-\d+$/) { $time = time + $time; } elsif ($time && $time !~ /^-?\d+$/) { if (HAS_TIME_PARSEDATE) { $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1); } else { die "You need Time::ParseDate if you use either the --before or --after option."; } } if ($which eq 'before') { $opt_before = $time; } else { $opt_after = $time; } if ($opt_before && $opt_after && $opt_after >= $opt_before) { die "--before ($opt_before) <= --after ($opt_after) -- conflict!"; } } sub generate_queue { my ($targets, $tmpfd) = @_; # scan the targets and get the number and list of messages $iter->_scan_targets($targets, sub { my($self, $date, $class, $format, $mail) = @_; push(@{$self->{$class}}, Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $mail)); } ); # deal with opt_head and opt_tail top_and_tail_messages($iter->{h}); top_and_tail_messages($iter->{s}); my $messages; if ($opt_n) { # OPT_N == 1 means don't bother sorting on message receive date # for ease of memory, we'll play with pointers $messages = $iter->{s}; undef $iter->{s}; if ($iter->{h}) { push(@{$messages}, @{$iter->{h}}); undef $iter->{h}; } } else { # OPT_N == 0 means sort on message receive date # Sort the spam and ham groups by date my @s = @{$iter->{s}}; undef $iter->{s}; my @h = @{$iter->{h}}; undef $iter->{h}; # interleave ordered spam and ham if (@s && @h) { my $ratio = @s / @h; while (@s && @h) { push @{$messages}, (@s / @h > $ratio) ? (shift @s) : (shift @h); } } # push the rest onto the end push @{$messages}, @s, @h; } # head or tail < 0 means crop the total list, negate the value appropriately if ($opt_tail < 0) { splice(@{$messages}, 0, $opt_tail); } if ($opt_head < 0) { splice(@{$messages}, -$opt_head); } my $num = $Mail::SpamAssassin::ArchiveIterator::MESSAGES = scalar(@{$messages}); # Dump out the number of messages and the message index info to # the temp file send_line($tmpfd, $num, @{$messages}); } sub top_and_tail_messages { my ($ary) = @_; if ($opt_n) { # OPT_N == 1 means don't bother sorting on message receive date # head or tail > 0 means crop each list if ($opt_tail > 0) { splice(@{$ary}, 0, -$opt_tail); } if ($opt_head > 0) { splice(@{$ary}, min ($opt_head, scalar @{$ary})); } } else { # OPT_N == 0 means sort on message receive date # Sort the spam and ham groups by date my @s = sort { $a cmp $b } @{$ary}; # head or tail > 0 means crop each list if ($opt_tail > 0) { splice(@s, 0, -$opt_tail); } if ($opt_head > 0) { splice(@s, min ($opt_head, scalar @s)); } @{$ary} = @s; } } sub min { return ($_[0] < $_[1] ? $_[0] : $_[1]); } ############################################################################ sub run_post_scan { return unless $opt_run_post_scan; system($opt_run_post_scan); if ($? >> 8 != 0) { warn "$opt_run_post_scan failed"; } }