#!/usr/bin/perl -w # Remove tokens from the bayes_probs db. Don't remove tokens from the # bayes_toks db, since feeding the learner more messages might change # the state of the token so it wouldn't have been removed; just re-run # this tool after each learning sessions use strict; use Fcntl; BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); } use AnyDBM_File; # for the DB format... use constant FORMAT_FLAG => 0xc0; # 11000000 use constant ONE_BYTE_FORMAT => 0xc0; # 11000000 use constant TWO_LONGS_FORMAT => 0x00; # 00000000 use constant ONE_BYTE_SSS => 0x38; # 00111000 use constant ONE_BYTE_HHH => 0x07; # 00000111 use vars qw{ %prob_db %toks_db $opt_dbpath $opt_regexp $opt_min_hits $opt_min_prob_strength $opt_help }; sub usage { print " Usage: trim_bayes_db [--dbpath=path] [--regexp=regexp] [--min-hits=int] [--min-prob-strength==float]\n"; exit(1); } # usage() use Getopt::Long; GetOptions("dbpath=s", "regexp=s", "min-hits=i", "min-prob-strength=f", "help"); usage() if ($opt_help); # At least one of the filtering options must be set if (!$opt_regexp && !$opt_min_hits && !$opt_min_prob_strength) { print "At least one of the filtering options must be set\n"; usage(); } my ($MPS1, $MPS2); if ($opt_min_prob_strength) { $MPS1 = 0.5 - $opt_min_prob_strength; $MPS2 = 0.5 + $opt_min_prob_strength; } my $path = $opt_dbpath; $path ||= $ENV{HOME}."/.spamassassin/bayes"; my $toks_name = "${path}_toks"; tie %toks_db, "AnyDBM_File", $toks_name, O_RDONLY, 0600 or die "Cannot open file $toks_name: $!\n"; my $prob_name = "${path}_probs"; tie %prob_db, "AnyDBM_File", $prob_name, O_RDWR, 0666 or die "Cannot open file $prob_name: $!\n"; foreach my $key ( keys(%prob_db) ) { if ($opt_regexp && ($key =~ m/$opt_regexp/o)) { delete $prob_db{$key}; next; } if ($opt_min_hits) { my ($ts, $th) = tok_unpack ($toks_db{$key}); my $hits = ($ts || 0) + ($th || 0); if ($hits < $opt_min_hits) { delete $prob_db{$key}; next; } } # if ($opt_min_hits) if ($opt_min_prob_strength) { my $prob = unpack ('f', $prob_db{$key}); if (($MPS1 < $prob) && ($prob < $MPS2)) { delete $prob_db{$key}; next; } } } # foreach my $key ( keys(%prob_db) ) untie %prob_db; untie %toks_db; if ($AnyDBM_File::ISA[0] eq "GDBM_File") { # GDBM_File::reorganize() can't be perfomed on a AnyDBM_File # tied hash, even if the underlying implementation is GDBM, # so we have to tie it again; bleh. print "Re-tieing db as GDBM_File to reduce db size\n"; tie %prob_db, "GDBM_File", $prob_name, O_RDWR, 0666 or die "Cannot open file $prob_name: $!\n"; GDBM_File::reorganize(tied(%prob_db)); untie %prob_db; } ################################################## sub tok_unpack { my ($packed, $ts, $th) = unpack("CLL", $_[0] || 0); if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) { $ts = ($packed & ONE_BYTE_SSS) >> 3; $th = ($packed & ONE_BYTE_HHH); } # else use $ts and $th we just unpacked. return ($ts, $th); } # tok_unpack()