#!/bin/sh
#
# Cross-validated perceptron score-generation run.
#
# For every pass listed in $PASSES this script:
#   1. removes the previous pass's working files, runs `make clean` and
#      reverts ../rules/50_scores.cf with svn;
#   2. splits ORIG/ham-$NAME.log and ORIG/spam-$NAME.log into $RUNS buckets,
#      keeping bucket $PASS as the validation log and concatenating the
#      remaining buckets into the training log (or re-links a cached split
#      from $CACHEDIR/$PASS when one exists);
#   3. builds with make, runs ./perceptron and stores its scores in
#      $LOGDIR/scores.$PASS;
#   4. rewrites ../rules/50_scores.cf with the new scores and runs
#      ./fp-fn-statistics against the validation logs;
#   5. moves tmp, freqs, SPBASE and NSBASE into $CACHEDIR/$PASS if that
#      pass was not cached yet.
# Finally the per-pass validate.* files are summarised.
#
# Variables read (expected from ./config or the environment):
#   SCORESET, HAM_PREFERENCE, THRESHOLD, EPOCHS  - used in names and passed
#                                                  to the tools below
#   NOTE        - optional suffix appended to the log directory name
#   LEARN_RATE  - optional, defaults to 2.0
#
# Must be run from the directory that contains config, ORIG/, tenpass/ and
# the helper scripts, since every path below is relative.

# set SCORESET
# Use an explicit ./ prefix: without a slash the dot command looks the file
# up in PATH rather than in the current directory.
. ./config

LEARN_RATE="${LEARN_RATE:-2.0}"

# Number of buckets the logs are split into; PASSES must list 1..RUNS.
RUNS=10
PASSES="1 2 3 4 5 6 7 8 9 10"

NAME="set$SCORESET"
LOGDIR="vm-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS"
CACHEDIR="vm-cache/$NAME"

if [ "$NOTE" != "" ]; then
  LOGDIR="$LOGDIR-$NOTE"
fi

if [ ! -d "$CACHEDIR" ]; then
  mkdir -p "$CACHEDIR"
fi

if [ ! -f "ORIG/ham-$NAME.log" ] || [ ! -f "ORIG/spam-$NAME.log" ]; then
  echo "Couldn't find logs for $NAME" >&2
  exit 1
fi

# try to find number of processors
# Done once, before the loop, so passes served from the cache also get a
# valid job count for make.
numcpus=$(cpucount 2>/dev/null) \
  || numcpus=$(grep -Ec '^processor[[:space:]]*:' /proc/cpuinfo 2>/dev/null) \
  || numcpus=1
case "$numcpus" in
  '' | 0 | *[!0-9]*) numcpus=1 ;;
esac

# Scratch file that receives the rewritten scores before they replace
# ../rules/50_scores.cf; removed again on any exit path.
SCORES_TMP=$(mktemp "${TMPDIR:-/tmp}/runGA.XXXXXX") || {
  echo "Couldn't create a temporary file" >&2
  exit 1
}
trap 'rm -f "$SCORES_TMP"' EXIT
trap 'exit 1' HUP INT TERM

# split_corpus DIR KIND
#   Inside DIR, split ../ORIG/KIND-$NAME.log into $RUNS buckets. Bucket
#   $PASS becomes KIND-validate.log; every other bucket is appended to
#   KIND.log. Runs in a subshell so the caller's working directory is
#   left untouched. Returns non-zero if DIR cannot be entered or the
#   split script fails.
split_corpus() {
  (
    cd "$1" || exit 1
    perl ../tenpass/split-log-into-buckets "$RUNS" \
      < "../ORIG/$2-$NAME.log" > /dev/null || exit 1
    for p in $PASSES; do
      if [ "$p" != "$PASS" ]; then
        cat "split-$p.log" >> "$2.log"
      else
        mv "split-$p.log" "$2-validate.log"
      fi
    done
    rm -f split-*.log
  )
}

# link_logs
#   Expose the training and validation logs of the current split in the
#   working directory.
link_logs() {
  ln -s SPBASE/spam.log .
  ln -s NSBASE/ham.log .
  ln -s SPBASE/spam-validate.log .
  ln -s NSBASE/ham-validate.log .
}

echo "[Doing a scoreset $SCORESET score-generation run]"

# clear out the old logs
rm -rf "$LOGDIR"

# Create a directory to organize the logs with this group of settings
mkdir "$LOGDIR" || exit 1

(
  echo "[config]"
  cat config
) | tee -a "$LOGDIR/log"

for PASS in $PASSES; do
  # Clean out old runs
  echo "[Cleaning up for pass $PASS]"
  rm -rf spam-validate.log ham-validate.log spam.log ham.log \
    NSBASE SPBASE tmp freqs perceptron.scores
  make clean > /dev/null

  # revert to the previous scoring
  svn revert ../rules/50_scores.cf

  if [ ! -d "$CACHEDIR/$PASS" ]; then
    # Generate 90/10 split logs
    mkdir NSBASE SPBASE || exit 1

    echo "[Generating 90/10 split ham]"
    split_corpus NSBASE ham || {
      echo "Couldn't split ham log for pass $PASS" >&2
      exit 1
    }

    echo "[Generating 90/10 split spam]"
    split_corpus SPBASE spam || {
      echo "Couldn't split spam log for pass $PASS" >&2
      exit 1
    }

    echo "[Setting up for pass $PASS]"
    # Ok, setup for a run
    link_logs
  else
    echo "[Retrieving from $CACHEDIR/$PASS]"
    ln -s "$CACHEDIR/$PASS/SPBASE" .
    ln -s "$CACHEDIR/$PASS/NSBASE" .
    ln -s "$CACHEDIR/$PASS/tmp" .
    ln -s "$CACHEDIR/$PASS/freqs" .
    link_logs
  fi

  echo "[Generating perceptron]"
  # Generate perceptron with full logs
  if ! make -j "$numcpus" SCORESET="$SCORESET" > "$LOGDIR/make.output" 2>&1; then
    echo "make failed for pass $PASS; see $LOGDIR/make.output" >&2
    exit 1
  fi

  (
    echo "[pass $PASS start]"
    pwd
    date
    ./perceptron -p "$HAM_PREFERENCE" -t "$THRESHOLD" -e "$EPOCHS" -l "$LEARN_RATE"
    mv perceptron.scores "$LOGDIR/scores.$PASS"
    echo "[pass $PASS end]"
  ) | tee -a "$LOGDIR/log"

  # The pipeline above reports tee's status, so detect a failed perceptron
  # run by the absence of its scores file.
  if [ ! -f "$LOGDIR/scores.$PASS" ]; then
    echo "perceptron produced no scores for pass $PASS" >&2
    exit 1
  fi

  # Only replace the rules file when the rewrite succeeded and produced
  # output; copy the contents so the target keeps its own permissions.
  if perl ./rewrite-cf-with-new-scores "$SCORESET" ../rules/50_scores.cf \
      "$LOGDIR/scores.$PASS" > "$SCORES_TMP" && [ -s "$SCORES_TMP" ]; then
    cat "$SCORES_TMP" > ../rules/50_scores.cf
  else
    echo "Couldn't rewrite ../rules/50_scores.cf for pass $PASS" >&2
    exit 1
  fi

  echo "[evaluating performance]" | tee -a "$LOGDIR/log"
  # stderr of the statistics run is discarded, as before.
  perl ./fp-fn-statistics --ham ham-validate.log --spam spam-validate.log \
    --scoreset "$SCORESET" \
    --fnlog "$LOGDIR/false_negatives.$PASS" \
    --fplog "$LOGDIR/false_positives.$PASS" \
    > "$LOGDIR/validate.$PASS" 2> /dev/null

  if [ ! -d "$CACHEDIR/$PASS" ]; then
    echo "[Saving object files in $CACHEDIR/$PASS for faster runs]"
    mkdir -p "$CACHEDIR/$PASS"
    mv tmp freqs SPBASE NSBASE "$CACHEDIR/$PASS"
  fi
done

perl ./extract-results "$LOGDIR"/validate.* > "$LOGDIR/validate"
perl ./model-statistics "$LOGDIR/validate"

exit 0