#!/usr/bin/perl use strict; use Fcntl ':DEFAULT',':flock'; BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); } use AnyDBM_File; # for the DB format... use constant FORMAT_FLAG => 0xc0; # 11000000 use constant ONE_BYTE_FORMAT => 0xc0; # 11000000 use constant TWO_LONGS_FORMAT => 0x00; # 00000000 use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000 use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111 use constant ATIME_EPOCH_START => 1038000000; # Fri Nov 22 21:20:00 2002 use constant ATIME_GRANULARITY => 21600; # 6 hours use vars qw{ %h $k $v @DBNAMES $NSPAM_MAGIC_TOKEN $NHAM_MAGIC_TOKEN $LAST_EXPIRE_MAGIC_TOKEN $NTOKENS_MAGIC_TOKEN $OLDEST_TOKEN_AGE_MAGIC_TOKEN $opt_dbpath $opt_min_hits $opt_max_hits $opt_regexp $robinson_x $robinson_s_dot_x }; use Getopt::Long; GetOptions("dbpath=s", "min-hits=i", "max-hits=i", "regexp=s"); @DBNAMES = qw(toks); $NSPAM_MAGIC_TOKEN = '**NSPAM'; $NHAM_MAGIC_TOKEN = '**NHAM'; $OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE'; $LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE'; $NTOKENS_MAGIC_TOKEN = '**NTOKENS'; use constant ROBINSON_S_CONSTANT => 0.53; $robinson_x = 0.43; $robinson_s_dot_x = ($robinson_x * ROBINSON_S_CONSTANT); my $path = $opt_dbpath; $path ||= $ENV{HOME}."/.spamassassin/bayes"; foreach my $dbname (@DBNAMES) { my $name = $path.'_'.$dbname; my $db_var = 'db_'.$dbname; if (!tie %{$h{$db_var}}, "AnyDBM_File",$name, O_RDONLY, 0600) { warn "Cannot open file $name: $!\n"; } } my $dbformat = 'on-the-fly probs, expiry'; my $ns = $h{db_toks}->{$NSPAM_MAGIC_TOKEN} || 0; my $nn = $h{db_toks}->{$NHAM_MAGIC_TOKEN} || 0; my $nt = $h{db_toks}->{$NTOKENS_MAGIC_TOKEN} || 0; my $le = $h{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} || 0; my $oa = $h{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} || 0; printf ("%3.3f %8d %8d %8d %s\n", 0.0, 0, 0, 0, 'non-token data: db format = '.$dbformat); printf ("%3.3f %8d %8d %8d %s\n", 0.0, 0, $ns, 0, 'non-token data: nspam'); printf ("%3.3f %8d %8d %8d %s\n", 0.0, 0, $nn, 0, 'non-token data: nham'); printf ("%3.3f %8d %8d %8d %s\n", 0.0, 0, $nt, 0, 'non-token data: ntokens'); printf ("%3.3f %8d %8d %8d %s\n", 0.0, 0, $oa, 0, 'non-token data: oldest age'); printf ("%3.3f %8d %8d %8d %s (%s)\n", 0.0, 0, 0, 0, 'non-token data: last expire', scalar(localtime($le))); my %seen = (); my $prob; my $now = time; for my $key ( keys(%{$h{db_toks}}) ) { next if ($key eq $NSPAM_MAGIC_TOKEN || $key eq $NHAM_MAGIC_TOKEN || $key eq $LAST_EXPIRE_MAGIC_TOKEN || $key eq $NTOKENS_MAGIC_TOKEN || $key eq $OLDEST_TOKEN_AGE_MAGIC_TOKEN); next if (exists $seen{$key}); $seen{$key} = 1; my ($ts, $th, $atime) = tok_unpack ($h{db_toks}->{$key}); $ts ||= 0; $th ||= 0; next if ($opt_min_hits && (($ts + $th) < $opt_min_hits)); next if ($opt_max_hits && (($ts + $th) > $opt_max_hits)); next if ($opt_regexp && ($key !~ /$opt_regexp/o)); $prob = compute_prob_for_token ($ts, $th, $ns, $nn); my $age = $atime; $prob ||= 0.5; printf ("%3.3f %8d %8d %8d %s\n", $prob, $ts, $th, $age, $key); } foreach my $dbname (@DBNAMES) { my $name = $path.'_'.$dbname; my $db_var = 'db_'.$dbname; untie %{$h{$db_var}}; } sub tok_unpack { my ($packed, $atime) = unpack("CS", $_[0] || 0); if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) { return (($packed & ONE_BYTE_SSS_BITS) >> 3, $packed & ONE_BYTE_HHH_BITS, $atime || 0); } elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) { my ($packed, $ts, $th, $atime) = unpack("CLLS", $_[0] || 0); return ($ts || 0, $th || 0, $atime || 0); } # other formats would go here... else { warn "unknown packing format for Bayes db, please re-learn: $packed"; return (0, 0, 0); } } sub compute_prob_for_token { my ($s, $n, $ns, $nn) = @_; return if ($s == 0 && $n == 0); return if ($ns== 0 || $nn== 0); my $ratios = ($s / $ns); my $ration = ($n / $nn); my $prob; if ($ratios == 0 && $ration == 0) { warn "oops? ratios == ration == 0"; return 0.5; } else { $prob = ($ratios) / ($ration + $ratios); } my $robn = $s+$n; return ($robinson_s_dot_x + ($robn * $prob)) / (ROBINSON_S_CONSTANT + $robn); } ########################################################################### # 2-byte time format: expiry is after the time_t epoch, so time_t calculations # will fail before this will. sub atime_to_time_t { my ($self, $atime) = @_; return ($atime * ATIME_GRANULARITY) + ATIME_EPOCH_START; } sub time_t_to_atime { my ($self, $tt) = @_; return int (($tt - ATIME_EPOCH_START) / ATIME_GRANULARITY); } ###########################################################################