#!/usr/bin/perl -w # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # sub usage { my $status = shift; my $out = $status ? STDERR : STDOUT; print $out < -j=jobs specify the number of processes to run simultaneously --net turn on network checks! --mid report Message-ID from each message --debug=LIST report debugging information (default is all facilities, LIST is a comma-separated list of facilities) --progress show progress updates during check --noisy show noisier progress updates during check --showdots print a dot for each scanned message --rewrite=OUT save rewritten message to OUT (default is /tmp/out) --rules=RE Only test rules matching the given regexp RE --restart=N restart all of the children after processing N messages --deencap=RE Extract SpamAssassin-encapsulated spam mails only if they were encapsulated by servers matching the regexp RE (default = extract all SpamAssassin-encapsulated mails) --lint check rules for syntax before running log options -o write all logs to stdout --loghits log the text hit for patterns (useful for debugging) --loguris log the URIs found --logmem log the memory delta (only on Linux) --hamlog=log use as ham log ('ham.log' is default) --spamlog=log use as spam log ('spam.log' is default) message selection options -n no date sorting or spam/ham interleaving --cache use cache information when selecting messages --cachedir=dir write cache info for --cache in this directory tree --after=N only test mails received after time_t N (negative values are an offset from current time, e.g. -86400 = last day) or after date as parsed by Time::ParseDate (e.g. '-6 months') --before=N same as --after, except received times are before time_t N --all don't skip big messages --head=N only check first N ham and N spam (N messages if -n used) --tail=N only check last N ham and N spam (N messages if -n used) simple target options (implies -o and no ham/spam classification) --dir subsequent targets are directories --file subsequent targets are files in RFC 822 format --mbox subsequent targets are mbox files --mbx subsequent targets are mbx files Just left over functions we should remove at some point: --bayes report score from Bayesian classifier options used during score generation process --learn=N learn N% of messages as spam or ham --reuse reuse network checks if X-Spam-Status: is present in messages non-option arguments are used as target names (mail files and folders), the target format is: :: is "spam" or "ham" is "dir", "file", "mbx", "mbox", or "detect" (see 'perldoc Mail::SpamAssassin::ArchiveIterator) is a file or directory name. globbing of ~ and * is supported EOF exit($status); } ########################################################################### use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits $opt_mid $opt_net $opt_nosort $opt_progress $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart $opt_loguris $opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap $opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy $total_messages $statusevery $opt_cachedir %reuse %orig_conf %reuse_conf $reuse_rules_loaded_p); use FindBin; use lib "$FindBin::Bin/../lib"; use lib "$FindBin::Bin/tmp"; eval "use bytes"; use Mail::SpamAssassin::ArchiveIterator; use Mail::SpamAssassin; use Mail::SpamAssassin::Logger; use File::Copy; use File::Spec; use Getopt::Long; use POSIX qw(strftime); use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; }; use Config; # default settings $opt_c = "$FindBin::Bin/../rules"; $opt_p = "$FindBin::Bin/spamassassin"; $opt_j = 1; $opt_net = 0; $opt_hamlog = "ham.log"; $opt_spamlog = "spam.log"; $opt_learn = 0; $reuse_rules_loaded_p = 0; my @ORIG_ARGV = @ARGV; GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug:s", "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net", "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i", "rules=s", "restart=i", "after=s", "before=s", "loguris", "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache", "cachedir=s", "noisy", "dir" => sub { $opt_format = "dir"; }, "file" => sub { $opt_format = "file"; }, "mbox" => sub { $opt_format = "mbox"; }, "mbx" => sub { $opt_format = "mbx"; }, "help" => sub { usage(0); }, '<>' => \&target) or usage(1); # rules.pl is for the --reuse option, score set doesn't matter if ($opt_reuse && ! -f "$FindBin::Bin/tmp/rules.pl") { # some people specify paths relatively, whereas this needs an absolute path, # so "do the right thing"(tm). my $abs_opt_c = File::Spec->rel2abs($opt_c); system("cd $FindBin::Bin; perl parse-rules-for-masses -d $abs_opt_c"); } require "rules.pl" if $opt_reuse; if ($opt_noisy) { $opt_progress = 1; # implies --progress } # test messages for the mass-check my @targets; if ($opt_f) { open(F, $opt_f) || die "cannot read target $opt_f: $!"; push(@targets, map { chomp; $_ } ); close(F); } usage(1) if !@targets; $opt_debug ||= 'all' if defined $opt_debug; my $user_prefs = "$opt_p/user_prefs"; # --lint # In theory we could probably use the same spamtest object as below, # but since it's probably not expecting that, and we don't want # strange things happening, create a local object. if ($opt_lint) { my $spamlint = new Mail::SpamAssassin ({ 'debug' => $opt_debug, 'rules_filename' => $opt_c, 'userprefs_filename' => $user_prefs, 'site_rules_filename' => "$opt_p/local.cf", 'userstate_dir' => "$opt_p", 'save_pattern_hits' => $opt_loghits, 'dont_copy_prefs' => 1, 'local_tests_only' => $opt_net ? 0 : 1, 'only_these_rules' => $opt_rules, 'ignore_safety_expire_timeout' => 1, PREFIX => '', DEF_RULES_DIR => $opt_c, LOCAL_RULES_DIR => '', }); $spamlint->debug_diagnostics(); my $res = $spamlint->lint_rules(); $spamlint->finish(); warn "lint: $res issues detected, please rerun with debug enabled for more information\n" if ($res); exit 1 if $res; } $spamtest = new Mail::SpamAssassin ({ 'debug' => $opt_debug, 'rules_filename' => $opt_c, 'userprefs_filename' => $user_prefs, 'site_rules_filename' => "$opt_p/local.cf", 'userstate_dir' => "$opt_p", 'save_pattern_hits' => $opt_loghits, 'dont_copy_prefs' => 1, 'local_tests_only' => $opt_net ? 0 : 1, 'only_these_rules' => $opt_rules, 'ignore_safety_expire_timeout' => 1, PREFIX => '', DEF_RULES_DIR => $opt_c, LOCAL_RULES_DIR => '', }); $spamtest->compile_now(1); $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); # generated user_prefs if ($opt_reuse) { # copy current prefs if it exists $spamtest->copy_config(undef, \%orig_conf); # zeroed scores to mass_prefs my @zero = sort grep { defined $reuse{$_}->{skip} } keys %reuse; open(PREFS, ">> $opt_p/mass_prefs") || die "Unable to open $opt_p/mass_prefs: $!\nNeeded for --reuse to work properly"; for my $zero (@zero) { print PREFS "score $zero 0\n"; } close(PREFS); $spamtest->read_scoreonly_config("$opt_p/mass_prefs"); $spamtest->copy_config(undef, \%reuse_conf); $reuse_rules_loaded_p = 1; } my $who = `id -un 2>/dev/null`; my $where = `uname -n 2>/dev/null`; my $when = `date -u`; my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost'; chomp $who; chomp $where; chomp $when; chomp $host; my $revision = get_current_svn_revision(); my $cmdline = join(' ',@ORIG_ARGV); $cmdline =~ s/\s+/ /gs; my $isowhen = strftime("%Y%m%dT%H%M%SZ", gmtime(time)); # better my $log_header = "# mass-check results from $who\@$where, on $when\n" . "# M:SA version ".$spamtest->Version()."\n" . "# SVN revision: $revision\n" . "# Date: $isowhen\n" . "# Perl version: $] on $Config{archname}\n" . "# Switches: '$cmdline'\n"; my $updates = ($opt_noisy ? 100 : 10); my $total_count = 0; my $spam_count = 0; my $ham_count = 0; my $init_results = 0; my $showdots_active = ($opt_showdots || $opt_noisy); my $showdots_counter = 0; my $showdots_every = ($opt_showdots ? 1 : 20); # Deal with --rewrite if (defined $opt_rewrite) { my $rewrite = ($opt_rewrite ? $opt_rewrite : "/tmp/out"); open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!"; } # Deal with --before and --after foreach my $time ($opt_before, $opt_after) { if ($time && $time =~ /^-\d+$/) { $time = time + $time; } elsif ($time && $time !~ /^-?\d+$/) { if (HAS_TIME_PARSEDATE) { $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1); } else { die "You need Time::ParseDate if you use either the --before or --after option."; } } } if ($opt_before && $opt_after && $opt_after >= $opt_before) { die "--before ($opt_before) <= --after ($opt_after) -- conflict!"; } my $iter = new Mail::SpamAssassin::ArchiveIterator({ 'opt_j' => $opt_j, 'opt_n' => $opt_n, 'opt_all' => $opt_all, 'opt_head' => $opt_head, 'opt_tail' => $opt_tail, 'opt_cache' => $opt_cache, 'opt_cachedir' => $opt_cachedir, 'opt_after' => $opt_after, 'opt_before' => $opt_before, 'opt_restart' => $opt_restart, 'scan_progress_sub' => \&scan_progress_cb }); if ($opt_progress) { status('starting scan stage'); } sub scan_progress_cb { showdots_blip(); } $iter->set_functions(\&wanted, \&result); $iter->run(@targets); if ($opt_progress) { status('completed run stage'); } showdots_finish(); if (defined $opt_rewrite) { close(REWRITE); } $spamtest->finish(); # exit status: did we check at least one message correctly? exit(!($ham_count || $spam_count)); ########################################################################### sub target { my ($target) = @_; if (!defined($opt_format)) { push(@targets, $target); } else { $opt_o = 1; push(@targets, "spam:$opt_format:$target"); } } ########################################################################### sub init_results { showdots_finish(); # now, showdots only happens if --showdots was used $showdots_active = $opt_showdots; if ($opt_progress) { # make it a local variable for now $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES; # round up since 100% will be caught at end already $statusevery = int($total_messages / $updates + 1); # if $messages < $updates, just give a status line per msg. $statusevery ||= 1; status("completed scan stage, $total_messages messages"); status('starting run stage'); } if ($opt_o) { autoflush STDOUT 1; print STDOUT $log_header; } else { open(HAM, "> $opt_hamlog") || die "open of $opt_hamlog failed: $!"; open(SPAM, "> $opt_spamlog") || die "open of $opt_spamlog failed: $!"; autoflush HAM 1; autoflush SPAM 1; print HAM $log_header; print SPAM $log_header; } $init_results = 1; } sub result { my ($class, $result, $time) = @_; # don't open results files until we get here to avoid overwriting files &init_results if !$init_results; if ($class eq "s") { if ($opt_o) { print STDOUT $result; } else { print SPAM $result; } $spam_count++; } elsif ($class eq "h") { if ($opt_o) { print STDOUT $result; } else { print HAM $result; } $ham_count++; } $total_count++; #warn ">> result: $total_count $class $time\n"; if ($opt_progress) { progress($time); } } sub wanted { my ($class, $id, $time, $dataref, $format) = @_; my $out; memory_track_start() if ($opt_logmem); my $ma = $spamtest->parse($dataref, 1); # remove SpamAssassin markup, if present and the mail was spam my $header = $ma->get_header("Received"); my $x_spam_status; if ($opt_reuse) { # get X-Spam-Status: header for rule hit resue $x_spam_status = $ma->get_header("X-Spam-Status"); } # previous hits my @previous; if ($x_spam_status) { $x_spam_status =~ s/,\s+/,/gs; if ($x_spam_status =~ m/tests=(\S*)/ && $x_spam_status !~ /\bshortcircuit=(?:ham|spam|default)\b/) { push @previous, split(/,/, $1); # we found previous tests, so move the reuse config into place unless ($reuse_rules_loaded_p) { $spamtest->copy_config(\%reuse_conf, undef); $reuse_rules_loaded_p = 1; } } } elsif ($opt_reuse) { if ($reuse_rules_loaded_p) { $spamtest->copy_config(\%orig_conf, undef); $reuse_rules_loaded_p = 0; } } if ($header && $header =~ /\bwith SpamAssassin\b/) { if (!$opt_deencap || message_should_be_deencapped($ma)) { my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1); $ma->finish(); $ma = $new_ma; } } # log-uris support my $status; my @uris; my $before; my $after; if ($opt_loguris) { my $pms = Mail::SpamAssassin::PerMsgStatus->new($spamtest, $ma); @uris = $pms->get_uri_list(); $pms->finish(); } else { $before = time; $status = $spamtest->check($ma); $after = time; } my @extra; # sample-based learning if ($opt_learn > 0) { my $spam; # spam learned as ham = 0.05% if ($class eq 's' && rand(100) < 0.05) { $spam = 0; } # ham learned as spam = 0.01% elsif ($class eq 'h' && rand(100) < 0.01) { $spam = 1; } # spam/ham learned correctly elsif (rand(100) < $opt_learn) { if ($class eq 's') { $spam = 1; } elsif ($class eq 'h') { $spam = 0; } else { die "unknown class, learning failed"; } } if (defined $spam) { my $result = ($spam ? "spam" : "ham"); my $status = $spamtest->learn($ma, undef, $spam, 0); $learned = $status->did_learn(); $result = "undef" if !defined $learned; push(@extra, "learn=".$result); } } if (defined($time)) { push(@extra, "time=".$time); } if ($status && defined $status->{bayes_score}) { push(@extra, "bayes=".$status->{bayes_score}); } if ($opt_mid) { my $mid = $ma->get_header("Message-Id"); if ($mid) { # message contains a Message-Id: while($mid =~ s/\([^\(\)]*\)//s) {}; # remove comments and $mid =~ s/^\s+|\s+$//sg; # leading and trailing spaces $mid =~ s/\s.*$//s; # keep only the first token } else { # it doesn't have a Message-Id: $mid = $id; # so build one from the id $mid =~ s,^.*/,,; # remove the path $mid = "<$mid\@$host.masses.spamassassin.org>"; # and put it together } $mid =~ tr/-A-Za-z0-9_!#%&=~<@>/./c; # replace dangerous chars with . (so regexp search just works) push(@extra, "mid=$mid"); } push(@extra, "scantime=" . ($after - $before)); push(@extra, "format=$format"); if ($opt_logmem) { my $mem = memory_track_finish(); if ($mem) { push(@extra, $mem); } } if ($reuse_rules_loaded_p) { push(@extra, "reuse=yes"); } else { push(@extra, "reuse=no"); } my $yorn; my $score; my $tests; my $extra; if ($opt_loguris) { $yorn = '.'; $score = 0; $tests = join(" ", sort @uris); $extra = ''; } else { $yorn = $status->is_spam() ? 'Y' : '.'; # don't bother adjusting scores for reuse $score = $status->get_score(); # list of tests hit my @tests; push @tests, split(/,/, $status->get_names_of_tests_hit()); push @tests, split(/,/, $status->get_names_of_subtests_hit()); # hit reuse if ($x_spam_status) { # generate mapping of hits to remove hits that are marked as skip @tests = grep { !$reuse{$_}->{skip} } @tests; # add hits from previous for (@previous) { push(@tests, $reuse{$_}->{reuse}) if $reuse{$_}->{reuse}; } } $tests = join(",", sort(@tests)); $extra = join(",", @extra); } if (defined $opt_rewrite) { print REWRITE $status->rewrite_mail(); } $id =~ s/\s/_/g; $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra); if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) { $out .= logkilled($ma, $id, "possible virus"); } if ($opt_loghits) { my $log = ''; foreach my $t (sort keys %{$status->{pattern_hits}}) { $_ = $status->{pattern_hits}->{$t}; $_ ||= ''; s/\r/\\r/gs; # fix unprintables s/\n/\\n/gs; $log .= "$t=\"$_\" "; } if ($log) { chomp $log; $out .= "# $log\n"; } } if (defined $status) { $status->finish(); } $ma->finish(); undef $ma; # clean 'em up undef $status; showdots_blip(); return $out; } sub showdots_blip { return unless ($showdots_active); $showdots_counter++; if ($showdots_counter % $showdots_every == 0) { print STDERR '.'; if ($showdots_counter % (60 * $showdots_every) == 0) { print STDERR "\n"; } } } sub showdots_finish { print STDERR "\n" if ($showdots_active); $showdots_counter = 0; } # ick. We have to go grovelling through the body parts to see if a message # is a report_safe-marked-up message, because a local scanner will overwrite # any remote scanner's X-Spam-Checker-Version header. # sub message_should_be_deencapped { my ($ma) = @_; # not sure why this is undefined, but it is sometimes if (defined $ma->{body_parts} && scalar @{$ma->{body_parts}} > 0) { my $firstpart = $ma->{body_parts}->[0]; if (!$firstpart->{headers}->{'content-type'} || $firstpart->{headers}->{'content-type'} ne 'text/plain') { return 0; # not a 'report_safe' encapsulation } if (scalar @{$firstpart->{raw}} < 3) { return 0; } # too short to be a report # grab first 2 lines my $text = $firstpart->{raw}->[0] . $firstpart->{raw}->[1]; $text =~ s/\s+/ /gs; if ($text =~ /^Spam detection software, running on the system \"(\S+)\"/) { my $hname = $1; if ($hname =~ /$opt_deencap/io) { return 1; } } } return 0; # a different host marked it up. pass it through! } sub logkilled { my ($ma, $id, $reason) = @_; my $from = $ma->get_header("From") || 'undef'; my $to = $ma->get_header("To") || 'undef'; my $subj = $ma->get_header("Subject") || 'undef'; my $mid = $ma->get_header("Message-Id") || 'undef'; chomp ($from); chomp ($to); chomp ($subj); chomp ($mid); return "# skipped killfiled message ($reason): from=$from to=$to subj=$subj mid=$mid id=$id\n"; } sub progress { my ($time) = @_; $time ||= 0; # Are we at the end or otherwise at a point we should print status? Then do it. if ($total_messages == $total_count || $total_count % $statusevery == 0) { my $time = strftime("%Y-%m-%d", localtime($time)); status(sprintf("%3d%% ham: %-6d spam: %-6d date: %s", int(($total_count / $total_messages) * 100), $ham_count, $spam_count, $time)); } } sub status { my($str) = @_; my $now = strftime("%Y-%m-%d %X", localtime(time)); printf STDERR "status: %-48s now: %s\n", $str, $now; } ########################################################################### our ($mem_size, $mem_rss, $mem_shared); sub memory_track_start { if ($^O =~ /linux/i) { if (open (IN, "; close IN; if ($statm =~ /^(\d+) (\d+) (\d+) /) { $mem_size = $1; $mem_rss = $2; $mem_shared = $3; } } } } sub memory_track_finish { my $str = ''; if ($^O =~ /linux/i) { if (open (IN, "; close IN; if ($statm =~ /^(\d+) (\d+) (\d+) /) { my $size = $1; my $rss = $2; my $shared = $3; $str = sprintf ("memsz=%d,memrss=%d,memshr=%d", ($size - $mem_size), ($rss - $mem_rss), ($shared - $mem_shared)); } } } $str; } sub get_current_svn_revision { my $revision; # this is usually "${TOPDIR}/masses" my $dir = $FindBin::Bin || "."; if (-d "$dir/.svn" || -f "$dir/svninfo.tmp") { if (-f "$dir/svninfo.tmp") { # created by build/automc/buildbot_ready for chrooted mass-checks open (SVNINFO, "< $dir/svninfo.tmp"); } else { # note, ".." since we want to pick up changes outside 'masses' # too! open (SVNINFO, "( svn info --non-interactive $dir/.. || svn info $dir/.. ) 2>&1 |"); } while () { # Revision: 383822 next unless /^Revision: (\d+)/; $revision = $1; last; } close SVNINFO; return $revision if $revision; } # this probably will never work due to Rules Project changes TODO if (open(TESTING, "$opt_c/70_testing.cf")) { chomp($revision = ); $revision =~ s/.*\$Rev:\s*(\S+).*/$1/; close(TESTING); return $revision if $revision; } return $revision || "unknown"; }