#!/bin/sh
#
# bayes-10pcv-driver - run 10-fold cross-validation test on SpamAssassin Bayes
#
# Since Bayesish probability analysis requires training on a corpus, the
# traditional SpamAssassin 10-pass cross-validation suite can't be used.  Also,
# Bayes requires its own ten-pass testing, separately, to judge the effects of
# tweaks.  So that's what this is.
#
# Before running, you need to create a test corpus, as "cor/spam" and
# "cor/ham".  Here's how to do this:
#
#   cd TEST
#   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/spam/bucket spf1 spf2 spf3 ...
#   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/ham/bucket ham1 ham2 ham3 ...
#
#   SADIR = top-level directory of SpamAssassin distro
#   TEST  = the directory where the corpus and results are to be written
#   spfN  = mail folders full of spam
#   hamN  = mail folders full of ham
#
# Usage: run this script from the TEST directory.  Any command-line arguments
# are passed through unchanged to the initial sa-learn-nonspam, sa-learn-spam
# and sa-learn-rebuild invocations (e.g. --learnprob).
#
# It will produce a directory of results called "results".  The most important
# are "hist_all": a histogram of scores and frequencies, and "thresholds_all":
# the output of analysis of all scores and frequencies from the
# bayes-thresholds script.

# CHANGE ME: the path to the version of SpamAssassin you are testing.
SADIR=/home/jm/ftp/spamassassin

###########################################################################

# The corpus is read from, and the results are written to, the directory the
# script was started in.
testdir=$(pwd) || exit 1

# Everything below uses paths relative to SADIR/masses (../rules, ./mass-check,
# ../tools), so carrying on after a failed cd would operate on the wrong tree.
cd "$SADIR/masses" || {
  echo "bayes-10pcv-driver: cannot cd to $SADIR/masses" >&2
  exit 1
}
PATH=$SADIR:$SADIR/masses:$PATH

results=$testdir/results
tmpdir=$results/config

# start from a clean slate; previous results are discarded
rm -rf "$results" "$tmpdir"

# now, just copy in the Bayes ruleset
mkdir -p "$results" "$tmpdir/rules" || exit 1
cp ../rules/23_bayes.cf "$tmpdir/rules"
cp ../rules/50*.cf "$tmpdir/rules"

# tell SpamAssassin to use this path for DBs
# TODO: for tests of these settings, read from a test-specific file
# (one setting per line; the trailing blank line is kept from the original)
cat > "$tmpdir/rules/30bayes_path.cf" <<EOF
bayes_path $tmpdir/dbs/bayes
bayes_use_chi2_combining 1
bayes_expiry_use_scan_count 0
bayes_expiry_scan_count 500

EOF
mkdir "$tmpdir/dbs" || exit 1

# Set to 1 to alternate ham and spam sub-buckets while testing each fold;
# any other value tests the whole ham bucket and then the whole spam bucket.
INTERLEAVE_TESTS=1

# learn the lot, then forget the ones we're testing on each time.
# faster than learning from scratch for each fold
# note: we use randseed=1 so that every run will always pick the
# same messages if --learnprob is used.
#
# "$@" (rather than a flattened string) forwards the caller's arguments
# exactly as given, without re-splitting or glob expansion.
(
  printf '%s' "Learning from all ham buckets..." ; date
  time sa-learn-nonspam --randseed=1 --no-rebuild "$@" \
      --config-file="$tmpdir/rules" "$testdir"/cor/ham/*

  printf '%s' "Learning from all spam buckets..." ; date
  time sa-learn-spam --randseed=1 --no-rebuild "$@" \
      --config-file="$tmpdir/rules" "$testdir"/cor/spam/*

  time sa-learn-rebuild "$@" --config-file="$tmpdir/rules"

  printf '%s' "Done learning. " ; date
) 2>&1 | tee "$results/learn.log"

# backup_dbs: archive the fully-trained DB directory ($tmpdir/dbs) into
# $tmpdir/learned-all.tar so each fold can start from the same state.
# tar only runs if the cd succeeded, so it never archives the wrong directory.
backup_dbs () {
  echo "Backing up full learned DBs..."
  ( cd "$tmpdir" && tar cvf learned-all.tar dbs )
}

# restore_dbs: unpack $tmpdir/learned-all.tar over $tmpdir/dbs, undoing the
# "forget" operations performed for the previous fold.
restore_dbs () {
  echo "Restoring full learned DBs..."
  ( cd "$tmpdir" && tar xf learned-all.tar )
}

echo "Dumping bayes DB..."
( cd .. && tools/check_bayes_db --dbpath="$tmpdir/dbs/bayes" ) \
    > "$results/bayes_db.dump"

backup_dbs

(
  printf '%s' "Starting test..." ; date

  for bucket in 1 2 3 4 5 6 7 8 9 10 ; do
    printf '%s' "Bucket $bucket..." ; date

    # the first fold runs directly on the freshly-learned DBs; later folds
    # must first undo the previous fold's forgetting
    if [ "$bucket" != 1 ] ; then
      restore_dbs
    fi

    rdir=$results/bucket$bucket
    mkdir "$rdir"

    echo "Forgetting contents of test ham bucket..."
    time sa-forget --config-file="$tmpdir/rules" --showdots \
        "$testdir/cor/ham/bucket.$bucket"

    echo "Forgetting contents of test spam bucket..."
    time sa-forget --config-file="$tmpdir/rules" --showdots \
        "$testdir/cor/spam/bucket.$bucket"

    time sa-learn-rebuild --config-file="$tmpdir/rules"

    if [ "$INTERLEAVE_TESTS" = 1 ] ; then
      # now split the ham and spam test bucket into 10 sub-buckets,
      # so we interleave ham and spam while testing.  important for
      # judging expiry effects
      : > "$rdir/nonspam.log"
      : > "$rdir/spam.log"
      mkdir "$rdir/testbuckets"

      (
        # split_corpora lives in SADIR/tools; do not run it from elsewhere
        cd .. || exit 1
        tools/split_corpora -n 10 -p "$rdir/testbuckets/ham" \
            "$testdir/cor/ham/bucket.$bucket"
        tools/split_corpora -n 10 -p "$rdir/testbuckets/spam" \
            "$testdir/cor/spam/bucket.$bucket"
      )

      for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
        echo "Running mass-check on ham test-bucket $subbucket..."
        time ./mass-check -c="$tmpdir/rules" --showdots \
            --bayes "$rdir/testbuckets/ham.$subbucket" \
            >> "$rdir/nonspam.log"

        echo "Running mass-check on spam test-bucket $subbucket..."
        time ./mass-check -c="$tmpdir/rules" --showdots \
            --bayes "$rdir/testbuckets/spam.$subbucket" \
            >> "$rdir/spam.log"
      done

    else
      echo "Running mass-check on ham bucket..."
      time ./mass-check -c="$tmpdir/rules" --showdots \
          --bayes "$testdir/cor/ham/bucket.$bucket" \
          > "$rdir/nonspam.log"

      echo "Running mass-check on spam bucket..."
      time ./mass-check -c="$tmpdir/rules" --showdots \
          --bayes "$testdir/cor/spam/bucket.$bucket" \
          > "$rdir/spam.log"
    fi

    # per-fold reports
    echo "Reporting..."
    ./bayes-testing/draw-bayes-histogram \
        "$rdir/spam.log" "$rdir/nonspam.log" \
        > "$rdir/hist"
    ./bayes-testing/bayes-thresholds \
        "$rdir/spam.log" "$rdir/nonspam.log" \
        > "$rdir/thresholds"
  done

  printf '%s' "Done test..." ; date
) 2>&1 | tee "$results/test.log"

# combine the per-fold logs and produce the overall reports
cat "$results"/bucket*/spam.log > "$results/spam_all.log"
cat "$results"/bucket*/nonspam.log > "$results/nonspam_all.log"

./bayes-testing/draw-bayes-histogram \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/hist_all"
./bayes-testing/bayes-thresholds \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/thresholds_all"

echo "Done."
ls -l "$results"