#!/usr/bin/perl
#
# check_whitelist - examine and clean SpamAssassin's auto-whitelist (AWL) db.
#
# TODO: should this be made a top-level script, called "sa-awl"?

use strict;
use warnings;

use Fcntl;
use Getopt::Long;

# Candidate DBM back-ends for AnyDBM_File, in order of preference.  This has
# to be assigned before AnyDBM_File itself is compiled, hence the BEGIN block.
BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); }
use AnyDBM_File;

# Print the usage summary on STDERR and terminate with a failure status.
# The message ends in a newline so that die() does not append "at ... line N".
sub usage {
    die "usage: check_whitelist [--clean] [--min n] [dbfile]\n";
}

my ($opt_clean, $opt_min, $opt_help);
GetOptions(
    'clean' => \$opt_clean,
    'min:i' => \$opt_min,
    'help'  => \$opt_help,
) or usage();
usage() if $opt_help;

# A missing or zero --min falls back to the default threshold of 2.
$opt_min ||= 2;

# The database is the first non-option argument, or the per-user default.
my $db;
if (@ARGV) {
    $db = $ARGV[0];
}
else {
    # Avoid an "uninitialized value" warning when HOME is not set; the
    # resulting path is the same one the concatenation would have produced.
    my $home = defined $ENV{HOME} ? $ENV{HOME} : '';
    $db = $home . "/.spamassassin/auto-whitelist";
}

# Only --clean modifies the database, so open it read-only otherwise.
my %h;
if ($opt_clean) {
    tie %h, "AnyDBM_File", $db, O_RDWR, 0600
        or die "Cannot open r/w file $db: $!\n";
}
else {
    tie %h, "AnyDBM_File", $db, O_RDONLY, 0600
        or die "Cannot open file $db: $!\n";
}

# Each sender is stored under two keys: "$key" holds the message count and
# "$key|totscore" holds the summed score.  Snapshot the sender keys up front
# so that deleting entries below cannot disturb iteration over the tied hash.
my @k = grep { !/\|totscore\z/ } keys %h;

for my $key (@k) {
    my $totscore = $h{"$key|totscore"};
    next unless defined $totscore;

    # A record with a total but no count is treated as "seen zero times".
    my $count = $h{$key};
    $count = 0 unless defined $count;

    if ($opt_clean) {
        # Entries seen at least --min times are kept (and not reported).
        next if $count >= $opt_min;
        print "cleaning: ";
    }

    # Guard the division: a zero count would otherwise abort the whole run,
    # and in --clean mode would do so before the bad entry could be removed.
    my $average = $count == 0 ? 0 : $totscore / $count;

    printf "% 8.1f %15s -- %s\n",
        $average, sprintf("(%.1f/%d)", $totscore, $count), $key;

    if ($opt_clean) {
        delete $h{"$key|totscore"};
        delete $h{$key};
    }
}

untie %h;

=head1 NAME

check_whitelist - examine and manipulate SpamAssassin's auto-whitelist db

=head1 SYNOPSIS

B<check_whitelist> [--clean] [--min n] [dbfile]

=head1 DESCRIPTION

Check or clean a SpamAssassin auto-whitelist (AWL) database file.

The name of the file is specified after any options, as C<dbfile>.
The default is C<$HOME/.spamassassin/auto-whitelist>.

=head1 OPTIONS

=over 4

=item --clean

Clean out infrequently-used AWL entries.  The C<--min> switch can be
used to select the threshold at which entries are kept or deleted.

=item --min n

Select the threshold at which entries are kept or deleted when
C<--clean> is used.  The default is C<2>, so entries that have only
been seen once are deleted.

=item --help

Print a usage summary and exit.

=back

=head1 OUTPUT

The output looks like this:

         AVG (TOTSCORE/COUNT) -- EMAIL|ip=IPBASE

For example:

         0.0         (0.0/7) -- dawson@example.com|ip=208.192
        21.8        (43.7/2) -- mcdaniel_2s2000@example.com|ip=200.106

C<AVG> is the average score; C<TOTSCORE> is the total score of all mails seen
so far; C<COUNT> is the number of messages seen from that sender; C<EMAIL> is
the sender's email address, and C<IPBASE> is the B<AWL base IP address>.

The B<AWL base IP address> is a way to identify, approximately, the IP address
a sender frequently sends from, while remaining hard for spammers to spoof.
The algorithm is as follows:

  - take the last Received header that contains a public IP address -- namely
    one which is not in private, unrouted IP space.

  - chop off the last two octets, assuming that the user may be in an ISP's
    dynamic address pool.

=cut