#!/bin/sh
#
# bayes-10pcv-driver - run 10-fold cross-validation test on SpamAssassin Bayes
#
# Since Bayesish probability analysis requires training on a corpus, the
# traditional SpamAssassin 10-pass cross-validation suite can't be used.  Also,
# Bayes requires its own ten-pass testing, separately, to judge the effects of
# tweaks.  So that's what this is.
#
# Before running, you need to create a test corpus, as "cor/spam" and
# "cor/ham".  Here's how to do this:

#   cd TEST
#   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/spam/bucket spf1 spf2 spf3 ...
#   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/ham/bucket ham1 ham2 ham3 ...
#
# SADIR = top-level directory of SpamAssassin distro
# TEST  = the directory where the corpus and results are to be written
# spfN   = mail folders full of spam
# hamN   = mail folders full of ham

# The script writes its output to a directory called "results".  The most
# important files there are "hist_all", a histogram of scores and frequencies,
# and "thresholds_all", the output of the bayes-thresholds script's analysis
# of all scores and frequencies.

# CHANGE ME: the path to the version of SpamAssassin you are testing.
SADIR=/home/jm/ftp/spamassassin

###########################################################################

# The corpus ("cor/...") is read from, and "results/" is written to, the
# directory this script is started from.
testdir=$(pwd)

# Any command-line arguments are handed on to the sa-learn-* commands.  They
# are stored as one space-joined string and expanded unquoted later on, so an
# individual argument must not itself contain whitespace.
learnargs=$*

# Everything below uses paths relative to masses/ ("../rules", "./mass-check"),
# so a failed cd has to stop the run instead of continuing somewhere else.
cd "$SADIR/masses" || {
  echo "$0: cannot cd to $SADIR/masses; check the SADIR setting" >&2
  exit 1
}
PATH=$SADIR:$SADIR/masses:$PATH

results=$testdir/results
tmpdir=$results/config

# Start from a clean slate.  $tmpdir lives inside $results, so removing
# $results removes both.  The expansion is quoted so that a working directory
# containing whitespace cannot make rm delete unrelated paths.
rm -rf -- "$results"

# now, just copy in the Bayes ruleset
mkdir -p "$results" "$tmpdir/rules"
cp ../rules/23_bayes.cf "$tmpdir/rules"
cp ../rules/50*.cf "$tmpdir/rules"

# tell SpamAssassin to use this path for DBs
# TODO: for tests of these settings, read from a test-specific file
#
# A here-document is used instead of a multi-line echo so that the path is
# written verbatim (no echo backslash-escape processing).  The empty line
# before EOF keeps the generated file's trailing blank line.
cat > "$tmpdir/rules/30bayes_path.cf" <<EOF
bayes_path $tmpdir/dbs/bayes
bayes_use_chi2_combining 1
bayes_expiry_use_scan_count 0
bayes_expiry_scan_count 500

EOF
mkdir "$tmpdir/dbs"

# When 1, each test bucket is split into 10 sub-buckets and the ham and spam
# mass-check runs alternate; any other value checks each bucket in one run.
INTERLEAVE_TESTS=1

# learn the lot, then forget the ones we're testing on each time.
# faster than learning from scratch for each fold

# note: we use randseed=1 so that every run will always pick the
# same messages if --learnprob is used.

# Train on every ham and spam bucket, then rebuild the DBs once.  All output
# (stdout and stderr) is shown and also saved to learn.log.
(
# printf instead of "echo -n": the -n flag is not portable under /bin/sh
printf '%s' "Learning from all ham buckets..." ; date
# $learnargs is deliberately unquoted: it holds the script's arguments as a
# single string and has to split back into separate words here.
time sa-learn-nonspam --randseed=1 --no-rebuild $learnargs \
    --config-file="$tmpdir/rules" "$testdir"/cor/ham/*

printf '%s' "Learning from all spam buckets..." ; date
time sa-learn-spam --randseed=1 --no-rebuild $learnargs \
    --config-file="$tmpdir/rules" "$testdir"/cor/spam/*

time sa-learn-rebuild $learnargs --config-file="$tmpdir/rules"

printf '%s' "Done learning. " ; date
) 2>&1 | tee "$results/learn.log"

# Archive the "dbs" directory under $tmpdir into $tmpdir/learned-all.tar.
# Globals:  tmpdir (read)
# Outputs:  a progress message and tar's verbose file list on stdout
# Returns:  the status of tar, or of cd if $tmpdir cannot be entered
backup_dbs () {
  echo "Backing up full learned DBs..."
  # "&&" (not ";") so tar never runs in the caller's directory when the cd
  # fails; the subshell keeps the caller's working directory unchanged.
  ( cd "$tmpdir" && tar cvf learned-all.tar dbs )
}
# Unpack $tmpdir/learned-all.tar inside $tmpdir, putting back the "dbs"
# directory as it was when backup_dbs archived it.
# Globals:  tmpdir (read)
# Outputs:  a progress message on stdout
# Returns:  the status of tar, or of cd if $tmpdir cannot be entered
restore_dbs () {
  echo "Restoring full learned DBs..."
  # "&&" (not ";") so nothing is extracted into the caller's directory when
  # the cd fails; the subshell keeps the caller's working directory unchanged.
  ( cd "$tmpdir" && tar xf learned-all.tar )
}

# Save a dump of the fully trained Bayes DB next to the other results.
echo "Dumping bayes DB..."
# check_bayes_db is run from the parent directory, via its tools/ path; the
# subshell keeps this script's own working directory unchanged.
( cd .. && tools/check_bayes_db --dbpath="$tmpdir/dbs/bayes" ) \
      > "$results/bayes_db.dump"

# Archive the fully trained DBs so later buckets can restore them.
backup_dbs

# Main cross-validation loop.  For each of the ten buckets: forget that
# bucket's ham and spam, rebuild, run mass-check on the bucket, and write a
# histogram and a thresholds report into results/bucketN.  All output
# (stdout and stderr) is shown and also saved to test.log.
(

# printf instead of "echo -n": the -n flag is not portable under /bin/sh
printf '%s' "Starting test..." ; date
for bucket in 1 2 3 4 5 6 7 8 9 10 ; do
  printf '%s' "Bucket $bucket..." ; date

  # The first bucket starts from the freshly learned DBs; every later bucket
  # first restores the archive that was made right after learning.
  if [ "$bucket" != 1 ] ; then restore_dbs ; fi

  rdir=$results/bucket$bucket
  mkdir "$rdir"

  echo "Forgetting contents of test ham bucket..."
  time sa-forget --config-file="$tmpdir/rules" --showdots \
      "$testdir/cor/ham/bucket.$bucket"

  echo "Forgetting contents of test spam bucket..."
  time sa-forget --config-file="$tmpdir/rules" --showdots \
      "$testdir/cor/spam/bucket.$bucket"

  time sa-learn-rebuild --config-file="$tmpdir/rules"

  if [ "$INTERLEAVE_TESTS" = 1 ] ; then
    # now split the ham and spam test bucket into 10 sub-buckets,
    # so we interleave ham and spam while testing. important for
    # judging expiry effects
    : > "$rdir/nonspam.log"
    : > "$rdir/spam.log"

    mkdir "$rdir/testbuckets"
    # split_corpora is run from the parent directory, via its tools/ path
    (
      cd .. || exit 1
      tools/split_corpora -n 10 -p "$rdir/testbuckets/ham" \
          "$testdir/cor/ham/bucket.$bucket"
      tools/split_corpora -n 10 -p "$rdir/testbuckets/spam" \
          "$testdir/cor/spam/bucket.$bucket"
    )

    # The logs were truncated above, so each sub-bucket run appends.
    for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
      echo "Running mass-check on ham test-bucket $subbucket..."
      time ./mass-check -c="$tmpdir/rules" --showdots \
          --bayes "$rdir/testbuckets/ham.$subbucket" \
          >> "$rdir/nonspam.log"

      echo "Running mass-check on spam test-bucket $subbucket..."
      time ./mass-check -c="$tmpdir/rules" --showdots \
          --bayes "$rdir/testbuckets/spam.$subbucket" \
          >> "$rdir/spam.log"
    done

  else
    echo "Running mass-check on ham bucket..."
    time ./mass-check -c="$tmpdir/rules" --showdots \
        --bayes "$testdir/cor/ham/bucket.$bucket" \
        > "$rdir/nonspam.log"

    echo "Running mass-check on spam bucket..."
    time ./mass-check -c="$tmpdir/rules" --showdots \
        --bayes "$testdir/cor/spam/bucket.$bucket" \
        > "$rdir/spam.log"
  fi

  echo "Reporting..."
  ./bayes-testing/draw-bayes-histogram \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/hist"

  ./bayes-testing/bayes-thresholds \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/thresholds"

done
printf '%s' "Done test..." ; date

) 2>&1 | tee "$results/test.log"

# Concatenate the per-bucket mass-check logs, then produce the histogram and
# thresholds report over all buckets.  The glob stays outside the quotes so
# it still expands while the directory part is protected from word splitting.
cat "$results"/bucket*/spam.log > "$results/spam_all.log"
cat "$results"/bucket*/nonspam.log > "$results/nonspam_all.log"

./bayes-testing/draw-bayes-histogram \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/hist_all"
./bayes-testing/bayes-thresholds \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/thresholds_all"

echo "Done."
ls -l "$results"