#!/usr/bin/perl -w

# options
#
# -c=file       set configuration/rules directory
# -f=file       read list of targets from <file>
# -j=jobs       specify the number of check jobs to run simultaneously
# --mh          find_missed('ham.log')
# --ms          find_missed('spam.log')
# --net         turn on network checks!
# --mid         report Message-ID from each message
# --debug       report debugging information
# --progress    show progress updates during check
# --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
# --showdots    print a dot for each scanned message
# --rules=RE    Only test rules matching the given regexp RE
# --restart=N   restart all of the children after processing N messages
#
# log options
# -o            write all logs to stdout
# --loghits     log the text hit for patterns (useful for debugging)
# --hamlog=log  use <log> as ham log ('ham.log' is default)
# --spamlog=log use <log> as spam log ('spam.log' is default)
#
# message selection options
# -n            no date sorting or spam/ham interleaving
# --all         don't skip big messages
# --head=N      only check first N ham and N spam (N messages if -n used)
# --tail=N      only check last N ham and N spam (N messages if -n used)
#
# simple target options (implies -o and no ham/spam classification)
# --dir         subsequent targets are directories
# --file        subsequent targets are files in RFC 822 format
# --mbox        subsequent targets are mbox files
#
#
# Just left over functions we should remove at some point:
# --bayes       report score from Bayesian classifier
#
# non-option arguments are used as target names (mail files and folders),
# the target format is: <class>:<format>:<location>
# <class>       is "spam" or "ham"
# <format>      is "dir", "file", or "mbox"
# <location>    is a file or directory name.  globbing of ~ and * is supported

###########################################################################

use vars qw($opt_c $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes
	    $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits
	    $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress
	    $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart
	    $opt_rewrite);

use FindBin;
use lib "$FindBin::Bin/../lib";
eval "use bytes";
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin;
use Mail::SpamAssassin::NoMailAudit;
use Getopt::Long;
use POSIX qw(strftime);

# default settings
$opt_c = "$FindBin::Bin/../rules";
$opt_j = 1;
$opt_net = 0;
$opt_hamlog = "ham.log";
$opt_spamlog = "spam.log";

GetOptions("c=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
	   "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
	   "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
	   "rules=s", "restart=i",
	   "dir" => sub { $opt_format = "dir"; },
	   "file" => sub { $opt_format = "file"; },
	   "mbox" => sub { $opt_format = "mbox"; },
	   '<>' => \&target);

if ($opt_f) {
  open (F, $opt_f) || die $!;
  push (@targets, map { chomp; $_ } <F>);
  close (F);
}

if ($opt_ms) {
  find_missed($opt_spamlog);
}
elsif ($opt_mh) {
  find_missed($opt_hamlog);
}

$spamtest = new Mail::SpamAssassin ({
  'debug'              			=> $opt_debug,
  'rules_filename'     			=> $opt_c,
  'userprefs_filename' 			=> "$FindBin::Bin/spamassassin/user_prefs",
  'userstate_dir'     			=> "$FindBin::Bin/spamassassin",
  'save_pattern_hits'  			=> $opt_loghits,
  'dont_copy_prefs'   			=> 1,
  'local_tests_only'   			=> $opt_net ? 0 : 1,
  'only_these_rules'   			=> $opt_rules,
  'ignore_safety_expire_timeout'	=> 1,
  PREFIX				=> '',
  DEF_RULES_DIR        			=> $opt_c,
  LOCAL_RULES_DIR      			=> '',
});

$spamtest->compile_now(1);
$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");

my $who   = `id -un 2>/dev/null`;   chomp $who;
my $where = `uname -n 2>/dev/null`; chomp $where;
my $when  = `date -u`;              chomp $when;
my $revision = "unknown";
if (open(TESTING, "$opt_c/70_cvs_rules_under_test.cf")) {
  chomp($revision = <TESTING>);
  $revision =~ s/.*Revision:\s*(\S+).*/$1/;
  close(TESTING);
}
my $log_header = "# mass-check results from $who\@$where, on $when\n" .
		 "# M:SA version ".$spamtest->Version()."\n" .
		 '# CVS tag: $Name:  $' . "\n" .
		 "# CVS revision: $revision\n";
my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
chomp $host;

my $updates = 10;
my $total_count = 0;
my $spam_count = 0;
my $ham_count = 0;

if ($opt_o) {
  autoflush STDOUT 1;
  print STDOUT $log_header;
}
else {
  open(HAM, "> $opt_hamlog");
  open(SPAM, "> $opt_spamlog");
  autoflush HAM 1;
  autoflush SPAM 1;
  print HAM $log_header;
  print SPAM $log_header;
}

my $iter = new Mail::SpamAssassin::ArchiveIterator({
	'opt_j' => $opt_j,
	'opt_n' => $opt_n,
	'opt_all' => $opt_all,
	'opt_head' => $opt_head,
	'opt_tail' => $opt_tail,
	'opt_restart' => $opt_restart,
});

if ($opt_progress) {
  my $now = strftime("%Y-%m-%d %X", localtime(time));
  printf STDERR "status: pre-scanning and sorting. now: %s\n", $now;
}
$iter->set_functions(\&wanted, \&result);
$iter->run(@targets);
print STDERR "\n" if ($opt_showdots);
exit;

###########################################################################

sub target  {
  my ($target) = @_;
  if (!defined($opt_format)) {
    push (@targets, $target);
  }
  else {
    $opt_o = 1;
    push (@targets, "spam:$opt_format:$target");
  }
}

###########################################################################

sub result {
  my ($class, $result, $time) = @_;

  if ($class eq "s") {
    if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }
    $spam_count++;
  }
  elsif ($class eq "h") {
    if ($opt_o) { print STDOUT $result; } else { print HAM $result; }
    $ham_count++;
  }

  $total_count++;
#warn ">> result: $total_count $class $time\n";

  if ($opt_progress) {
    progress($time);
  }
}

sub wanted {
  my ($id, $time, $dataref) = @_;
  my $out;

  my $ma = Mail::SpamAssassin::NoMailAudit->new('data' => $dataref);
  $ma->{noexit} = 1;

  # remove SpamAssassin markup, if present and the mail was spam
  $_ = $ma->get ("X-Spam-Status");
  if (defined($_) && /^Yes, hits=/) {
    my $newtext = $spamtest->remove_spamassassin_markup($ma);
    my @newtext = split (/^/m, $newtext);
    $dataref = \@newtext;
    $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref);
  }

  my $status = $spamtest->check($ma);

  my @extra;
  if (defined($time)) {
    push(@extra, "time=$time");
  }
  if (defined $status->{bayes_score}) {
    push(@extra, "bayes=" . $status->{bayes_score});
  }
  if ($opt_mid) {
    my $mid = $ma->get_header("Message-Id");
    if ($mid) {     # message contains a Message-Id:
      while($mid =~ s/\([^\(\)]*\)//s) {};   # remove comments and
      $mid =~ s/^\s+|\s+$//sg;               # leading and trailing spaces
      $mid =~ s/\s.*$//s;                    # keep only the first token
    }
    else {          # it doesn't have a Message-Id:
      $mid = $id;             # so build one from the id
      $mid =~ s,^.*/,,;       # remove the path
      $mid = "<$mid\@$host.masses.spamassassin.org>";  # and put it together
    }
    $mid =~ tr/-A-Za-z0-9_!#%&=~<@>/./c;     # replace dangerous chars with . (so regexp search just works)
    push(@extra, "mid=$mid");
  }

  my $yorn = $status->is_spam() ? 'Y' : '.';
  my $hits = $status->get_hits();
  my $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
  my $extra = join(",", @extra);

  if (defined $opt_rewrite) {
    $status->rewrite_mail();
    open(REWRITE, "> " . ($opt_rewrite ? $opt_rewrite : "/tmp/out"));
    print REWRITE $status->get_full_message_as_text();
    close(REWRITE);
  }

  $id =~ s/\s/_/g;

  $out .= sprintf("%s %2d %s %s %s\n", $yorn, $hits, $id, $tests, $extra);

  if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
    $out .= logkilled($ma, $id, "possible virus");
  }

  if ($opt_loghits) {
    my $log = '';
    foreach my $t (sort keys %{$status->{pattern_hits}}) {
      $_ = $status->{pattern_hits}->{$t};
      $_ ||= '';
      s/\r/\\r/gs;      # fix unprintables
      s/\n/\\n/gs;
      $log .= "$t=\"$_\" ";
    }
    if ($log) {
      chomp $log;
      $out .= "# $log\n";
    }
  }

  $status->finish();
  undef $ma;		# clean 'em up
  undef $status;

  if ($opt_showdots) {
    print STDERR '.';
  }

  return $out;
}

sub logkilled {
  my ($ma, $id, $reason) = @_;

  my $from = $ma->get_header("From")       || 'undef';
  my $to   = $ma->get_header("To")         || 'undef';
  my $subj = $ma->get_header("Subject")    || 'undef';
  my $mid  = $ma->get_header("Message-Id") || 'undef';
  chomp ($from);
  chomp ($to);
  chomp ($subj);
  chomp ($mid);
  return "# skipped killfiled message ($reason): from=$from to=$to subj=$subj mid=$mid id=$id\n";
}

sub progress {
  my ($time) = @_;
  $time ||= 0;

  my $messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;
  my $statusevery = int($messages / $updates);
  $statusevery ||= 1; # if $messages < $updates, just give a status line per msg.

  # Are we at the end or otherwise at a point we should print status?  Then do it.
  if ($messages == $total_count || $total_count % $statusevery == 0) {
    my $time = strftime("%Y-%m-%d", localtime($time));
    my $now = strftime("%Y-%m-%d %X", localtime(time));
    printf STDERR "status: %3.0f%% ham: %-6d spam: %-6d date: %s now: %s\n",
	($total_count / $messages) * 100, $ham_count, $spam_count, $time, $now;
  }
}

###########################################################################

sub find_missed {
  my $file = shift;

  my $threshold = 5;

  my $shouldbespam = 1;
  if ($file =~ /ham/) { $shouldbespam = 0; }

  my $scores = readscores();

  open (IN, "<$file");
  while (<IN>) {
    next if /^#/;
    /^.\s+\d+\s+(\S+)\s*/ or next;
    my $id = $1;
    my $score = 0.0;

    $_ = $'; s/bayes=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
    foreach my $tst (split (/,/, $_)) {
      next if ($tst eq '');
      if (!defined $scores->{$tst}) {
	warn "unknown test in $file, ignored: $tst\n";
	next;
      }
      $score += $scores->{$tst};
    }

    if ($shouldbespam && $score < $threshold) {
      found_missed($score, $id, $_);
    }
    elsif (!$shouldbespam && $score > $threshold) {
      found_missed($score, $id, $_);
    }
  }
  close IN;
}

sub readscores {
  warn "Reading scores from \"$opt_c\"...\n";
  system("./parse-rules-for-masses -d \"$opt_c\"") and die;
  use vars qw(%scores);
  require "./tmp/rules.pl";
  return \%scores;
}

sub found_missed {
  my $score = shift;
  my $id = shift;
  my $tests = shift;

  print "$score $id $tests\n";
}