#!/bin/sh
#
# bayes-10pcv-driver - run 10-fold cross-validation test on SpamAssassin Bayes
#
# Since Bayesish probability analysis requires training on a corpus, the
# traditional SpamAssassin 10-pass cross-validation suite can't be used.  Also,
# Bayes requires its own ten-pass testing, separately, to judge the effects of
# tweaks.  So that's what this is.
#
# Before running, you need to create a test corpus, as "cor/spam" and
# "cor/ham".  Here's how to do this:
#
#   cd TEST
#   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/spam/bucket spf1 spf2 spf3 ...
#   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/ham/bucket ham1 ham2 ham3 ...
#
#   SADIR = top-level directory of SpamAssassin distro
#   TEST  = the directory where the corpus and results are to be written
#   spfN  = mail folders full of spam
#   hamN  = mail folders full of ham
#
# Run this script from TEST (the current directory is used as TEST) with the
# SADIR environment variable set to the SpamAssassin top-level directory.
# Any command-line arguments are passed through to sa-learn when learning.
#
# It will produce a directory of results called "results".  The most important
# are "hist_all": a histogram of scores and frequencies, and "thresholds_all":
# the output of analysis of all scores and frequencies from the
# bayes-thresholds script.
#
# NOTE: by default you will need *AT LEAST* 2000 of either type to use
# this, since bayes will not be activated without 200 messages in the db,
# and each fold is run using 10% of the corpus -- and 2000/10 = 200.
###########################################################################

# SADIR is read from the environment; without it every path below would
# resolve relative to "/", so refuse to run rather than continue blindly.
: "${SADIR:?SADIR must be set to the top-level directory of the SpamAssassin distro}"

# The directory the script is started from holds the corpus ("cor/ham",
# "cor/spam") and receives the "results" directory.
testdir=$(pwd)

# Any command-line arguments are handed to sa-learn when learning.  They are
# kept as one flat string (POSIX sh has no arrays) and deliberately expanded
# unquoted below so that they split back into separate words.
learnargs=
if [ "$#" -gt 0 ] ; then
  learnargs="$*"
fi

cd "$SADIR/masses" || exit 1
PATH=$SADIR:$SADIR/masses:$PATH

results=$testdir/results
tmpdir=$results/config
rm -rf "$results" "$tmpdir"

# now, just copy in the Bayes ruleset
mkdir -p "$results" "$tmpdir/rules"
cp ../rules/23_*.cf "$tmpdir/rules"
cp ../rules/50*.cf "$tmpdir/rules"
cp ../rules/*.pre "$tmpdir/rules"      # ensure we have plugins

# tell SpamAssassin to use this path for DBs
# TODO: for tests of these settings, read from a test-specific file
cat > "$tmpdir/rules/30bayes_path.cf" <<EOF
bayes_path $tmpdir/dbs/bayes
bayes_auto_learn 0
bayes_min_ham_num 10
bayes_min_spam_num 10
bayes_store_module Mail::SpamAssassin::BayesStore::SDBM
EOF

mkdir "$tmpdir/dbs"

# Test-mode switches (1 = enabled):
#   INTERLEAVE_TESTS               - split each test set into 10 sub-buckets
#                                    and alternate ham/spam mass-check runs
#   TEST_AGAINST_10PC              - test on the current bucket only and learn
#                                    from the other nine; when 0, learn from
#                                    the current bucket and test on the rest
#   LEARN_ALL_THEN_FORGET_TEST_SET - learn the whole corpus once up front,
#                                    then forget each fold's test set
INTERLEAVE_TESTS=0
TEST_AGAINST_10PC=0
LEARN_ALL_THEN_FORGET_TEST_SET=0

# Archive the current contents of $tmpdir/dbs as $tmpdir/learned-all.tar.
backup_dbs () {
  echo "Backing up full learned DBs..."
  ( cd "$tmpdir" && tar cvf learned-all.tar dbs )
}

# Replace $tmpdir/dbs with the copy archived by backup_dbs.  The steps are
# chained with && so that a failed cd can never run "rm -rf dbs" elsewhere.
restore_dbs () {
  echo "Restoring full learned DBs..."
  ( cd "$tmpdir" && rm -rf dbs && tar xf learned-all.tar )
}

# Print a command line, then run it under "time".
# Arguments: the command and its arguments.
# The command line is printed on stderr, not stdout: several callers redirect
# the command's stdout into a mass-check log, and the trace must not end up
# in that log.  Every caller runs inside a "2>&1 | tee" subshell, so the trace
# still reaches the learn/test logs.
runcmd () {
  printf '%s\n' "$*" >&2
  time "$@"
}

if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then
  # learn the lot, then forget the ones we're testing on each time.
  # faster than learning from scratch for each fold

  # note: we use randseed=1 so that every run will always pick the
  # same messages if --learnprob is used.

  (
    printf '%s' "Learning from all ham buckets..." ; date
    # shellcheck disable=SC2086  # learnargs is split into words on purpose
    runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
        --showdots --mbox --config-file="$tmpdir/rules" "$testdir"/cor/ham/*

    printf '%s' "Learning from all spam buckets..." ; date
    # shellcheck disable=SC2086
    runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
        --showdots --mbox --config-file="$tmpdir/rules" "$testdir"/cor/spam/*

    # shellcheck disable=SC2086
    runcmd sa-learn --sync $learnargs --config-file="$tmpdir/rules"

    printf '%s' "Done learning. " ; date
  ) 2>&1 | tee "$results/learn.log"

  echo "Dumping bayes DB..."
  # NOTE(review): the per-fold path below dumps with "sa-learn --dump";
  # confirm tools/check_bayes_db still exists in this tree.
  ( cd .. && tools/check_bayes_db --dbpath="$tmpdir/dbs/bayes" ) \
      > "$results/bayes_db.dump"
fi

backup_dbs

(
  printf '%s' "Starting test..." ; date

  for bucket in 1 2 3 4 5 6 7 8 9 10 ; do
    printf '%s' "Bucket $bucket..." ; date

    # Bucket 1 starts from the DBs that were just backed up; every later
    # fold first rewinds to that same state.
    if [ "$bucket" != 1 ] ; then restore_dbs ; fi

    rdir=$results/bucket$bucket
    mkdir "$rdir"

    : > "$rdir/hbucketlearn"
    : > "$rdir/sbucketlearn"
    : > "$rdir/hbuckettest"
    : > "$rdir/sbuckettest"

    # Sort each corpus bucket into this fold's learn set or test set.
    for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
      type=l
      if [ "$TEST_AGAINST_10PC" = 1 ] && [ "$subbucket" = "$bucket" ] ; then
        type=t
      fi
      if [ "$TEST_AGAINST_10PC" = 0 ] && [ "$subbucket" != "$bucket" ] ; then
        type=t
      fi

      if [ "$type" = l ] ; then
        echo "Using bucket for learn: $subbucket ..."
        cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbucketlearn"
        cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbucketlearn"
      else
        echo "Using bucket for test: $subbucket ..."
        cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbuckettest"
        cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbuckettest"
      fi
    done

    if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then
      echo "Forgetting contents of test ham bucket..."
      runcmd sa-learn --forget --config-file="$tmpdir/rules" --showdots \
          --mbox "$rdir/hbuckettest"
      echo "Forgetting contents of test spam bucket..."
      runcmd sa-learn --forget --config-file="$tmpdir/rules" --showdots \
          --mbox "$rdir/sbuckettest"

    else
      echo "Learning contents of learn ham bucket..."
      # shellcheck disable=SC2086  # learnargs is split into words on purpose
      runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
          --showdots --mbox --config-file="$tmpdir/rules" "$rdir/hbucketlearn"
      echo "Learning contents of learn spam bucket..."
      # shellcheck disable=SC2086
      runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
          --showdots --mbox --config-file="$tmpdir/rules" "$rdir/sbucketlearn"

      # shellcheck disable=SC2086
      runcmd sa-learn --sync $learnargs --config-file="$tmpdir/rules"

      echo "Dumping bayes DB..."
      ( cd .. && sa-learn --dump --dbpath="$tmpdir/dbs/bayes" ) \
          > "$rdir/bayes_db.dump"
    fi

    runcmd sa-learn --sync --config-file="$tmpdir/rules"

    # take a copy of the trained Bayes DBs, gzipped
    ( cd "$tmpdir" && tar cf - dbs | gzip -c > "$rdir/dbs.tgz" )

    if [ "$INTERLEAVE_TESTS" = 1 ] ; then
      # now split the ham and spam test bucket into 10 sub-buckets,
      # so we interleave ham and spam while testing.  important for
      # judging expiry effects
      : > "$rdir/nonspam.log"
      : > "$rdir/spam.log"
      mkdir "$rdir/testbuckets"

      (
        cd .. || exit 1
        tools/split_corpora -n 10 -p "$rdir/testbuckets/ham" \
            "$rdir/hbuckettest"
        tools/split_corpora -n 10 -p "$rdir/testbuckets/spam" \
            "$rdir/sbuckettest"
      )

      for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
        echo "Running mass-check on ham test-bucket $subbucket..."
        time ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
            --bayes --mbox "$rdir/testbuckets/ham.$subbucket" \
            >> "$rdir/nonspam.log"

        echo "Running mass-check on spam test-bucket $subbucket..."
        time ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
            --bayes --mbox "$rdir/testbuckets/spam.$subbucket" \
            >> "$rdir/spam.log"
      done

    else
      echo "Running mass-check on ham bucket..."
      runcmd ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
          --bayes --mbox "$rdir/hbuckettest" \
          > "$rdir/nonspam.log"

      echo "Running mass-check on spam bucket..."
      runcmd ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
          --bayes --mbox "$rdir/sbuckettest" \
          > "$rdir/spam.log"
    fi

    echo "Reporting..."
    ./bayes-testing/draw-bayes-histogram \
        "$rdir/spam.log" "$rdir/nonspam.log" \
        > "$rdir/hist"
    ./bayes-testing/bayes-thresholds \
        "$rdir/spam.log" "$rdir/nonspam.log" \
        > "$rdir/thresholds"
    ./bayes-testing/bayes-static-thresholds \
        "$rdir/spam.log" "$rdir/nonspam.log" \
        > "$rdir/thresholds.static"

    # remove these, they're too big.
    rm -f "$rdir/hbucketlearn" "$rdir/sbucketlearn"

    # but keep these to find FPs/FNs later
    gzip "$rdir/hbuckettest" "$rdir/sbuckettest"
  done

  printf '%s' "Done test..." ; date
) 2>&1 | tee "$results/test.log"

# Combine the per-bucket mass-check logs and report over the whole run.
cat "$results"/bucket*/spam.log > "$results/spam_all.log"
cat "$results"/bucket*/nonspam.log > "$results/nonspam_all.log"

./bayes-testing/draw-bayes-histogram \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/hist_all"
./bayes-testing/bayes-thresholds \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/thresholds_all"
./bayes-testing/bayes-static-thresholds \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/thresholds_all.static"

echo "Done."
ls -l "$results"