#!/bin/sh
#
# Driver for a score-generation ("GA") run over one scoreset.
#
# Usage:
#   <this script>            initial rescoring: split the ORIG/ logs, build and
#                            run ./garescorer, rewrite the scores file and
#                            collect false-positive/false-negative statistics
#   <this script> <anything> statistics generation from the logs saved by an
#                            earlier rescoring run (any non-empty first
#                            argument selects this mode)
#
# Reads SCORESET, HAM_PREFERENCE, THRESHOLD, EPOCHS and the optional NOTE;
# these are expected to come from ./config (or the environment).
#
# Exit status: 0 on success; in rescoring mode the exit status of the
# rescoring steps is propagated instead of being lost behind the `tee`.

# TODO: add FPRATE instead of HAM_PREFERENCE

# set SCORESET
# must use a / in the arg to a 'source' command to avoid searching the PATH
. ./config

# NOTE(review): LEARN_RATE is not referenced again in this script and is not
# exported here -- presumably consumed by other tooling; confirm before removal.
LEARN_RATE="${LEARN_RATE:-2.0}"

NAME="set$SCORESET"
# TODO: add $FPRATE instead of HAM_PREFERENCE
LOGDIR="gen-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS-ga"

# ensure sandbox T_ rules aren't used in the GA and don't appear in output
KILL_SANDBOX_RULES=y

###########################################################################

[ -d gen-cache ] || mkdir gen-cache       # a cache, woo

if [ -n "$NOTE" ]; then
  LOGDIR="$LOGDIR-$NOTE"
fi

# Print the number of processors to use for `make -j`, always a positive
# integer.  Intended to be called via command substitution, so the scratch
# variable below never leaks into the caller.
detect_numcpus() {
  case "$(uname)" in
    FreeBSD)
      cpus=$(/sbin/sysctl -n kern.smp.cpus)
      ;;
    SunOS)
      cpus=$(/usr/sbin/psrinfo | wc -l)
      ;;
    *)
      # Take the output of the first probe that succeeds.  Chaining the
      # probes inside a single substitution would concatenate the "0" that
      # `grep -c` prints on no match with the fallback "1".
      cpus=$(cpucount 2>/dev/null) ||
        cpus=$(grep -Ec '^processor\b' /proc/cpuinfo 2>/dev/null) ||
        cpus=1
      ;;
  esac

  # Some `wc` implementations pad their output with blanks.
  cpus=$(printf '%s' "$cpus" | tr -d '[:space:]')

  # Anything that is not a positive integer falls back to a single job.
  case "$cpus" in
    '' | *[!0-9]*) cpus=1 ;;
  esac
  [ "$cpus" -gt 0 ] || cpus=1

  printf '%s\n' "$cpus"
}

# Initial rescoring.  Must be run inside a subshell: it turns on `set -x`
# and uses `exit` to abort on fatal errors.  Expects $LOGDIR, $LOGDIR/NSBASE
# and $LOGDIR/SPBASE to exist already.
run_ga() {
  # NOTE(review): the trace is written to stderr, which is not part of the
  # pipeline into `tee`, so it does not end up in $LOGDIR/log.
  set -x    # trace commands

  # This should be in here instead.  Prevents testing.
  # svn revert ../rules/50_scores.cf

  rm -rf tmprules
  cp -r ../rules tmprules
  cp tmprules/50_scores.cf orig_scores.cf

  # fix all scores to non-zero (avoid a possible bug, not quite sure)
  ./enable-all-evolved-rules < tmprules/50_scores.cf \
    > tmprules/50_scores.cf.new || exit 1
  mv tmprules/50_scores.cf.new tmprules/50_scores.cf

  if [ "$KILL_SANDBOX_RULES" = y ]; then
    rm tmprules/70_sandbox.cf
  fi

  echo "[Doing a scoreset $SCORESET score-generation run]"

  # Clean out old runs
  echo "[Cleaning up]"
  rm -rf spam-test.log ham-test.log spam.log ham.log \
    NSBASE SPBASE tmp make.output freqs perceptron.scores \
    garescorer.scores
  make clean

  # Generate 90/10 split logs
  # keep the *-split*.logs in cwd so it's cacheable
  echo "[Generating 90/10 split ham]"
  perl tenpass/split-log-into-buckets-cached \
    9:gen-cache/ham-split9.log 1:gen-cache/ham-split1.log \
    "ORIG/ham-$NAME.log"

  ln gen-cache/ham-split9.log "$LOGDIR/NSBASE/ham.log"
  ln gen-cache/ham-split1.log "$LOGDIR/NSBASE/ham-test.log"

  echo "[Generating 90/10 split spam]"
  perl tenpass/split-log-into-buckets-cached \
    9:gen-cache/spam-split9.log 1:gen-cache/spam-split1.log \
    "ORIG/spam-$NAME.log"

  ln gen-cache/spam-split9.log "$LOGDIR/SPBASE/spam.log"
  ln gen-cache/spam-split1.log "$LOGDIR/SPBASE/spam-test.log"

  echo "[Setting up for gen run]"
  # Ok, setup for a run
  ln -s "$LOGDIR/SPBASE/spam.log" .
  ln -s "$LOGDIR/NSBASE/ham.log" .
  ln -s "$LOGDIR/SPBASE/spam-test.log" .
  ln -s "$LOGDIR/NSBASE/ham-test.log" .

  # try to find number of processors
  numcpus=$(detect_numcpus)

  echo "[Generating GA]"
  # Generate GA with full logs
  make -j "$numcpus" SCORESET="$SCORESET" garescorer \
    > "$LOGDIR/make.output" 2>&1
  cp freqs "$LOGDIR/freqs"

  echo "[config]"
  cat config
  echo "[gen run start]"
  pwd
  date
  # TODO: use -f $FPRATE instead of -b $HAM_PREFERENCE
  time ./garescorer -b "$HAM_PREFERENCE" -e "$EPOCHS" -t "$THRESHOLD" || exit $?
  date

  # POST-GA COMMANDS:
  mv garescorer.scores "$LOGDIR/scores"
  echo "[gen run end]"

  cp orig_scores.cf tmprules/50_scores.cf

  perl ./rewrite-cf-with-new-scores --scoreset "$SCORESET" \
    --old-scores tmprules/50_scores.cf \
    --new-scores "$LOGDIR/scores" \
    --cffile tmprules \
    > tmprules/50_newscores.cf

  mv tmprules/50_newscores.cf tmprules/50_scores.cf
  cp tmprules/50_scores.cf "$LOGDIR/50_scores.cf"

  perl ./fp-fn-statistics --ham ham-test.log --spam spam-test.log \
    --scoreset "$SCORESET" --cffile=tmprules \
    --fnlog "$LOGDIR/false_negatives" --fplog "$LOGDIR/false_positives" \
    > "$LOGDIR/test"
  # END OF POST-GA COMMANDS
}

# Statistics generation, once everyone likes the scores.  Uses the logs
# saved under $LOGDIR by an earlier rescoring run; exits the script with
# status 1 if the test logs are missing.
generate_statistics() {
  # use the logs we saved
  fulllogh="$LOGDIR/NSBASE/ham.log"
  fulllogs="$LOGDIR/SPBASE/spam.log"
  testlogh="$LOGDIR/NSBASE/ham-test.log"
  testlogs="$LOGDIR/SPBASE/spam-test.log"

  if [ ! -f "$testlogh" ] || [ ! -f "$testlogs" ]; then
    echo "Couldn't find logs for $NAME: $testlogh $testlogs" >&2
    exit 1
  fi

  rm -f ham-test.log spam-test.log
  ln -s "$testlogh" ham-test.log
  ln -s "$testlogs" spam-test.log

  rm -f ham.log spam.log
  ln -s "$fulllogh" ham.log
  ln -s "$fulllogs" spam.log

  if [ "$KILL_SANDBOX_RULES" = y ]; then
    rm ../rules/70_sandbox.cf
  fi

  # This needs to have ../rules/50_scores.cf in place first ...
  echo "[gen test results for set $SCORESET]"
  perl ./fp-fn-statistics --ham "$testlogh" --spam "$testlogs" \
    --scoreset "$SCORESET" --cffile=../rules | tee "$LOGDIR/test"

  echo "[STATISTICS file generation for set $SCORESET]"
  bash ./mk-baseline-results "$SCORESET" | tee "$LOGDIR/statistics"
  cp "$LOGDIR/statistics" "../rules/STATISTICS-set${SCORESET}.txt"
  ls -l "../rules/STATISTICS-set${SCORESET}.txt"
}

if [ -z "$1" ]; then

  # -----------------------------------------------------------------------
  # Initial rescoring

  if [ ! -f "ORIG/ham-$NAME.log" ] || [ ! -f "ORIG/spam-$NAME.log" ]; then
    echo "Couldn't find logs for $NAME" >&2
    exit 1
  fi

  # Create a directory to organize the logs with this group of settings.
  # This has to happen before the pipeline below starts: `tee` opens
  # $LOGDIR/log as soon as it is launched, concurrently with the left-hand
  # side of the pipe.
  mkdir -p "$LOGDIR" "$LOGDIR/NSBASE" "$LOGDIR/SPBASE"
  if ! [ -d "$LOGDIR" ]; then
    echo "Failed to mkdir $LOGDIR, dying" 1>&2
    exit 1
  fi

  # Log the run through `tee` while still recovering the exit status of the
  # rescoring subshell (plain sh has no PIPESTATUS):
  #   fd 4 = the script's real stdout, which `tee` writes to;
  #   fd 3 = the command substitution's capture, which receives the status.
  # Both are closed for the rescoring subshell so its children don't
  # inherit them.
  exec 4>&1
  ga_status=$(
    {
      {
        (run_ga) 3>&- 4>&-
        echo "$?" >&3
      } | tee "$LOGDIR/log" >&4
    } 3>&1
  )
  exec 4>&-

  # A missing or garbled status means the pipeline itself broke.
  case "$ga_status" in
    '' | *[!0-9]*) ga_status=1 ;;
  esac
  [ "$ga_status" -eq 0 ] || exit "$ga_status"

else

  # -----------------------------------------------------------------------
  # Statistics generation, once everyone likes the scores

  generate_statistics

fi

exit 0