#!/usr/bin/perl
#
# kill_bad_patterns - filter out a number of risky rule strings, or spoor from the
# spamtrapping process itself
#
# Reads lines from the files named on the command line (or from STDIN when
# none are given) and prints every line that survives all of the filters
# below.  Dropped lines are discarded silently.

use strict;
use warnings;

# Denylist: any line matching one of these alternatives is dropped.
# Alternatives that begin with a doubled backslash match text in which the
# following character is itself backslash-escaped in the input line, e.g.
# the nine characters "taint", backslash, ".", "org".
my $bad_pattern_re = qr/
    (?:
      # spamtrap strings
      taint\\.org|jmason\\.org|spamassassin\\.org|spamtrap|

      # long _________ or ---------- separators
      [-_=\s]{30}|
      \/[-_=\s]+\/|
      \*\*\*\*\*{20}|
      (?:\\\*){37}|

      # markup; every brace, question mark and asterisk is preceded by a
      # literal backslash in the text being matched
      DOCTYPE\sHTML\sPUBLIC|
      \\\{Spam\\\?\\\}|
      \\\*\\\*SPAM\\\*\\\*|

      # dates
      \d+\s+[A-Z][a-z]{2}\s\S+\s..:..:..\s|

      # phish text
      PayPal\sInc\.\sAll\srights\sreserved\.|
      about\seBay\\'s\scommunication\spolicies\\.|
      See\sour\sPrivacy\sPolicy\sand\sUser\sAgreement\sif\syou\shave|
      Learn\show\syou\scan\sprotect\syourself\sfrom\sspoof|
      access\sto\syour\saccount\swas\slimited\\.\sYour\saccount\saccess\swill\sremain|
      In\saccordance\swith\sPayPal\\'s\sUser\sAgreement|
      To\ssecure\syour\saccount\sand\squickly\srestore\sfull\saccess|
      To\slearn\smore\sabout\sprotecting\syourself\sfrom\sfraud,\svisit\sthe\sSecurity\sCenter|
      PayPal\swill\snever\sask\syou\sto\senter\syour\spassword\sin\san\semail|

      # asian spam; could be any text
      \s\.\.\s\.\.\s\.\.\s\.\.\s|

      # bug 6143: six literal x-brace-00 escapes, one arbitrary character
      # between each pair
      \\x\{00\}.\\x\{00\}.\\x\{00\}.\\x\{00\}.\\x\{00\}.\\x\{00\}

      # NB: no alternation bar after the last entry.  An empty final
      # alternative matches every line and would filter out all input.
    )
/ix;

# S/O/hit% comments, which are not wanted for publishing.  Example:
# #   1.000   0.967   0.000
my $stats_comment_re = qr/^#\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+/;

LINE:
while (my $line = <>) {
    next LINE if $line =~ $bad_pattern_re;
    next LINE if $line =~ $stats_comment_re;

    # skip high-bit strings, too
    next LINE if _is_mostly_high_bit($line);

    print $line;
}

# _is_mostly_high_bit($line)
#
# Returns true when the pattern part of $line holds more "high" characters
# than "low" ones and more than 6 "high" characters in total.
#
#   high: a literal "." or any character in the range \x80-\xff
#   low:  any character outside \x80-\xff (so a "." counts in both tallies)
#
# Whitespace is ignored.  A leading "body NAME /" and a trailing "/" are
# removed first so that only the pattern text itself is measured.
sub _is_mostly_high_bit {
    my ($line) = @_;

    my $pattern = $line;

    # The prefix must be stripped before whitespace is removed: its regex
    # relies on the whitespace separating "body", the rule name and the "/".
    $pattern =~ s/^body\s+\S+\s+\///;
    $pattern =~ s/\s//g;
    $pattern =~ s/\/$//;

    # tr/// with an empty replacement list only counts; $pattern is unchanged.
    my $high_count = ($pattern =~ tr/.\x80-\xff//);
    my $low_count  = ($pattern =~ tr/\x80-\xff//c);

    return $high_count > $low_count && $high_count > 6;
}