#!/usr/bin/perl =head1 NAME seek-phrases-in-log - extract good-looking rules from a text-dump mc log =cut # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # --------------------------------------------------------------------------- sub usage { die " usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n] [--rules] [--ruletype 'type'] [--ruleprefix FOO] [--maxtextread n] --ham hamlog --spam spamlog --reqhitrate: percentage hit-rate against spam required (default: 0.5) (multiple values can be specified, separated by spaces) --reqpatlength: required pattern length, in characters (default: 0) --maxtextread: bytes of message text examined (default: 32768) --rules: generate SpamAssassin rule output (default: 0) --ruleprefix: specify prefix string for rules (default: 'SEEK_') --ruletype 'type': generate rules of type: 'header', 'body' "; } # --------------------------------------------------------------------------- use warnings; use strict; use Getopt::Long qw(:config no_ignore_case); use Data::Dumper; sub logmsg; my %opt = (); $opt{reqhitrate} = 0.5; $opt{reqpatlength} = 0; $opt{maxtextread} = 32768; $opt{rules} = 0; $opt{ruleprefix} = 'SEEK_'; $opt{ruletype} = 'body'; my $fs; my $fh; my @files = (); GetOptions( "rules" => \$opt{rules}, "ruleprefix=s" => \$opt{ruleprefix}, "reqhitrate=s" => \$opt{reqhitrate}, "reqpatlength=s" => \$opt{reqpatlength}, "ruletype=s" => \$opt{ruletype}, "maxtextread=s" => \$opt{maxtextread}, "phase2=s" => \$opt{phase2}, "ham=s" => \$fh, "spam=s" => \$fs, 'help' => \&usage ) or usage(); usage() unless (($fs && $fh) || $opt{phase2}); my @hitratethresholds = (); if ($opt{reqhitrate} =~ /\S\s+\S/) { # multiple values @hitratethresholds = reverse sort {$a<=>$b} split (' ', $opt{reqhitrate}); $opt{reqhitrate} = pop @hitratethresholds; # the lowest # @hitratethresholds is now sorted, highest to lowest } # n-gram reading state my %word2sym = ('' => ''); my %sym2word = ('' => ''); my $sym_acc = 'a'; # symbols are represented using IDs from this counter my $msgcount = 0; # these are shared with the assembly stage (via $asmstate) my @text_string = (); my %ngram_count = (); my %msg_subset_hit = (); my $asmstate; if ($opt{phase2}) { $asmstate = load_state($opt{phase2}); } else { logmsg "reading $fs..."; open IN, "<$fs" or die "cannot open spam log $fs"; while () { /^text: (.*)$/ and proc_text_spam($1); } close IN; # only do this if we have read enough spam messages, otherwise we could # discard tokens for which reqhitrate has been achieved if ($msgcount > 2 * (100 / $opt{reqhitrate})) { discard_hapaxes(); } logmsg "n-grams active: ".(scalar keys %ngram_count); logmsg "reading $fh..."; open IN, "<$fh" or die "cannot open ham log $fh"; while () { /^text: (.*)$/ and proc_text_ham($1); } close IN; logmsg "n-grams active: ".(scalar keys %ngram_count); # move onto the next step; assembly. free stuff we no longer need here undef %word2sym; # free this, no longer needed # create the assembly state object $asmstate = { }; filter_into_message_subsets(); undef %sym2word; # no longer needed after that write_state($asmstate, "assemble.state"); } assemble_regexps(); exit; # --------------------------------------------------------------------------- # PHASE 1: PARSING, NGRAM PROBABILITIES sub proc_text_spam { my ($text) = @_; # we only need to chop off the end of spam samples if (length($text) > $opt{maxtextread}) { $text = substr $text, 0, $opt{maxtextread}; # chop! } $text =~ s/ +/ /gs; # single spaces, please # we only need to save spam samples in memory, ignore ham samples push @text_string, $text; my $cp = pack "l", $msgcount; $msgcount++; (($msgcount % 1000) == 999) and discard_hapaxes(); my %tokens = (); foreach my $line (split(/\[p\]/, $text)) { my $w1 = ''; my $w2 = ''; my $w3 = ''; foreach my $w (split(' ', $line)) { # if (length $w > 20) { $w = "sk:".substr($w, 0, 5); } $w3 = $w2; $w2 = $w1; $w1 = $word2sym{$w}; if (!$w1) { $word2sym{$w} = $w1 = $sym_acc; $sym2word{$sym_acc} = $w; $sym_acc++; } # simple bayesian N-grams to start $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1; } # deal with leftovers if ($w2 && $w1) { $tokens{"$w2.$w1"} = 1; } } foreach my $tok (keys %tokens) { # incr the counter for this token $ngram_count{$tok}++; $msg_subset_hit{$tok} .= $cp; # the message subset hit by this tok } } sub discard_hapaxes { my $before = (scalar keys %ngram_count); foreach my $tok (keys %ngram_count) { if ($ngram_count{$tok} <= 1) { delete $ngram_count{$tok}; delete $msg_subset_hit{$tok}; } } my $after = (scalar keys %ngram_count); my $killed = ($before - $after); logmsg "shrunk dbs: $killed hapaxes killed, kept $after entries"; } sub proc_text_ham { my ($text) = @_; my $w1 = ''; my $w2 = ''; my $w3 = ''; my %tokens = (); foreach my $w (split(' ', $text)) { $w3 = $w2; $w2 = $w1; $w1 = $word2sym{$w}; if (!$w1) { # since we're deleting, there's no need to add new words # to the dictionary; just use the final $sym_acc to mean # "unknown ham word", and don't increment it $w1 = $sym_acc; } $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1; } if ($w2 && $w1) { $tokens{"$w2.$w1"} = 1; } foreach my $tok (keys %tokens) { # we're not tracking hits; we're killing false positives. # as soon as a single FP appears, kill all record of that token, # it cannot be used delete $ngram_count{$tok}; delete $msg_subset_hit{$tok}; } } sub filter_into_message_subsets { logmsg "filtering into message subsets..."; $asmstate = { }; # hash all msg_subset_hit lists; we don't need the full data, so this # saves space foreach my $id (keys %msg_subset_hit) { $msg_subset_hit{$id} = unpack("%32C*", $msg_subset_hit{$id}); } # note: we don't care about stuff that hits *any* ham at all $asmstate->{msg_count_spam} = scalar @text_string; $asmstate->{msg_count_spam} ||= 0.000001; $asmstate->{all_patterns_for_set} = { }; foreach my $id (keys %ngram_count) { my $count = $ngram_count{$id}; my $bad; # must occur more than once! if ($count <= 1) { $bad++; } # require N% spam hits elsif (($count*100) / $asmstate->{msg_count_spam} < $opt{reqhitrate}) { $bad++; } if ($bad) { # we don't need to remember anything about this pattern after here delete $ngram_count{$id}; delete $msg_subset_hit{$id}; next; } my $set = $msg_subset_hit{$id}; push @{$asmstate->{all_patterns_for_set}->{$set}}, decode_sym2words($id); } logmsg "message subsets found: ".(scalar keys %{$asmstate->{all_patterns_for_set}}); $asmstate->{ngram_count} = \%ngram_count; $asmstate->{msg_subset_hit} = \%msg_subset_hit; $asmstate->{text_string} = \@text_string; } sub decode_sym2words { my $ids = shift; my $r; if ($ids =~ /^([^.]*)\.([^.]*)\.([^.]*)$/) { $r = "$sym2word{$1} $sym2word{$2} $sym2word{$3}"; } elsif ($ids =~ /^([^.]*)\.([^.]*)$/) { $r = "$sym2word{$1} $sym2word{$2}"; } else { warn "bad decode_sym2words format: '$ids'"; } $r =~ s/^\s+//; return $r; } sub write_state { my ($state, $statefile) = @_; my $dump = Data::Dumper->new ([ $state ]); $dump->Deepcopy(1); $dump->Purity(1); $dump->Indent(1); my $text = $dump->Dump.";1;"; open (OUT, ">$statefile") or die "cannot write $statefile"; print OUT $text; close OUT or die "cannot write $statefile"; } # --------------------------------------------------------------------------- # PHASE 2: REGEXP ASSEMBLY sub load_state { my $file = shift; if (open(IN, "<".$file)) { my $str = join("", ); close IN; my $VAR1; # Data::Dumper if (eval $str) { return $VAR1; # Data::Dumper's naming } } die "failed to load state from $file"; } sub assemble_regexps { my %done_set = (); my @done_pats = (); logmsg "deduping and assembling regexps..."; if (!$opt{rules}) { printf ("%6s %6s %6s %s\n", "RATIO", "SPAM%", "HAM%", "DATA"); } $| = 1; my $count = 0; my $count_out = 0; foreach my $id (sort { $asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a} } keys %{$asmstate->{ngram_count}}) { my $set = $asmstate->{msg_subset_hit}->{$id}; next if $done_set{$set}; $done_set{$set}++; if ($count++ % 200 == 0) { logmsg "working on message subset $count ($count_out)..."; } # we now have several patterns. see if we can expand them sideways # to make the pattern bigger, and collapse into a smaller number of # pats at the same time my @pats = collapse_pats($asmstate->{all_patterns_for_set}->{$set}); # my @pats = @{$asmstate->{all_patterns_for_set}->{$set}}; @pats = quote_all_metas(@pats); @pats = subsume_with_dotstars(@pats); @pats = expand_with_dots(@pats); @pats = ensure_reqpatlength(@pats); # at this point, patterns are all valid regexps # now check to see if any of these pats have been subsumed in an # already-output pattern (one with more hits!) my @pats_new = (); foreach my $pat (@pats) { my $subsumed = 0; foreach my $done (@done_pats, @pats_new) { # pattern == existing pattern, or existing pattern is contained by # pattern, or pattern is contained in existing pattern if ($pat eq $done || $pat =~ /\Q${done}\E/ || $done =~ /\Q${pat}\E/) { $subsumed=1; last; } # or one pattern contains the other (but interpreted as a regexp!) # this deals with /foo.{0,10} bar/ vs /foo ish bar/ if ($pat =~ /$done/) { $subsumed=1; last; } if ($done =~ /$pat/) { $subsumed=1; last; } } next if $subsumed; # add to the current list push @pats_new, $pat; } @pats = @pats_new; push @done_pats, @pats_new; # if we have no non-subsumed pats at this stage, skip this set next unless @pats; if (($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} < $opt{reqhitrate}) { # quit if we hit the final hitrate threshold return; } if (defined($hitratethresholds[0]) && ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} < $hitratethresholds[0]) { print "# passed hit-rate threshold: $hitratethresholds[0]\n"; shift @hitratethresholds; } if ($opt{rules}) { printf "# %6.3f %6.3f %6.3f\n", 1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0; # sort, to ensure ordering always remains the same foreach my $pat (sort @pats) { my $name = generate_rule_name($pat); if ($opt{ruletype} eq 'header') { # deal with header-specific munging. # "\[\\n\]" is the result of "[\n]", at this stage $pat =~ s/\Q\[\\n\]\E/\\n/gs; $pat =~ s/\Q\[\\t\]\E/\\t/gs; } print "$opt{ruletype} $opt{ruleprefix}${name} /$pat/\n"; $count_out++; } } else { my $pats = '/'.join ('/, /', map { s/\//\\\//gs; $_; } @pats).'/'; printf "%6.3f %6.3f %6.3f %s\n", 1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0, $pats; $count_out++; } } } sub collapse_pats { my $pataryref = $_[0]; my @ret = (); while (1) { my $pat = shift(@{$pataryref}); last unless defined($pat); # warn "JMD $pat"; $pat =~ s/^\s+//; # TODO: optimise, second-slowest line my @hits = grep /\Q$pat\E/, @{$asmstate->{text_string}}; if (scalar @hits == 0) { warn "supposed pattern /$pat/ is 0-hitter"; warn "JMD strings:\n ".join("\n ", @{$asmstate->{text_string}})."\n"; push @ret, "[BROKEN: 0 hitter]$pat"; next; } # we don't have all day! my $pat_maxlen = $opt{maxtextread}; my $s = $hits[0]; # Now, expand the pattern using a BLAST-style algorithm # expand towards start of string while (1) { my $l = length($pat); last if ($l > $pat_maxlen); # too long my $found; if ($s =~ /(.)\Q$pat\E/s && $s !~ /\[p\]\Q$pat\E/s) { $found = $1; } if (!defined $found) { # start of string. break last; } # give up if there are a differing number of hits for the new pat my $newpat = $found.$pat; # note: we can search just in @hits, instead of in # @{$asmstate->{text_string}}, because there's no way we can # *increase* the hitrate in the corpus; we can only reduce it. # this is double-checked after these 2 expansion loops anyway if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; } $pat = $newpat; # and carry on } # warn "JMD $pat"; # expand towards end of string while (1) { if (length($pat) > $pat_maxlen || $s =~ /\Q$pat\E\[p\]/s || $s !~ /\Q$pat\E(.)/s) { # end of string. break last; } my $newpat = $pat.$1; if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; } $pat = $newpat; # and carry on } # double-check to ensure we haven't somehow INCREASED our hitrate # beyond the initial pattern if (scalar (grep /\Q$pat\E/, @{$asmstate->{text_string}}) != scalar @hits) { die "oops! went too far"; } # now remove subsumed patterns @{$pataryref} = grep { $pat !~ /\Q$_\E/s } @{$pataryref}; # warn "JMD $pat"; # skip recording this if it's already inside one of the results next if grep { $_ =~ /\Q$pat\E/s } @ret; # also, remove cases where this pattern contains previous results @ret = grep { $pat !~ /\Q$_\E/s } @ret; # warn "JMD $pat"; push (@ret, $pat); } return @ret; } sub quote_all_metas { return map { s/([!-+\`\?\^\~\\\/\|\.\{\}\(\)\[\]\@])/\\$1/gs; $_; } @_; } sub subsume_with_dotstars { my @working = @_; # attempt to resolve . and .* my @ret = (); while (1) { my $p1 = shift(@working); last unless defined($p1); # TODO: optimise, 4th-slowest line my @hits = grep /$p1/, @{$asmstate->{text_string}}; if (scalar @hits == 0) { warn "supposed pattern /$p1/ is 0-hitter"; push @ret, "[BROKEN: 0 hitter]$p1"; next; } foreach my $p2 (@working) { next if ($p1 eq $p2); DOTLOOP: for my $dotstar ( ".", ".?", ".{0,3}", ".{0,5}", ".{0,20}", ".{0,40}" ) { my $newpatcapture = $p1."(".$dotstar.")".$p2; my $newpat = $p1.$dotstar.$p2; # drop the attempt if any of the existing hits becomes a miss, # or the .* includes a paragraph boundary in any of the hits foreach my $t (@hits) { next DOTLOOP if ($t !~ /$newpatcapture/); next DOTLOOP if ($1 =~ /\[p\]/); } next if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) != scalar @hits); # it works! yay # TODO: for /./, see if we can capture the options and # construct a /[oO0]/ char class instead $p1 = $newpat; } } # skip recording this if it's already inside one of the results next if grep { $_ =~ /\Q$p1\E/s } @ret; push (@ret, $p1); } return @ret; } sub expand_with_dots { my @working = @_; my @ret = (); my $pat_maxlen = $opt{maxtextread}; while (1) { my $p1 = shift(@working); last unless defined($p1); # TODO: optimise, 5th-slowest line my @hits = grep /$p1/, @{$asmstate->{text_string}}; if (scalar @hits == 0) { warn "supposed pattern /$p1/ is 0-hitter"; push @ret, "[BROKEN: 0 hitter]$p1"; next; } my $s = $hits[0]; # expand towards end of string (with .) while (1) { last if (length($p1) >= $pat_maxlen); if ($s !~ /$p1\[p\]/s && $s =~ /$p1(.)/s) { my $extn = $1; quote_all_metas($extn); my $newpat = $p1.$extn; # TODO: optimise, 3rd-slowest line if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) { $p1 = $newpat; next; } } if ($s !~ /$p1.?\[p\]/s && $s =~ /$p1[^\\](.)/s) { my $extn = $1; quote_all_metas($extn); my $newpat = $p1.".".$extn; if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) { $p1 = $newpat; next; } } if ($s !~ /$p1.{0,2}\[p\]/s && $s =~ /$p1[^\\][^\\](.)/s) { my $extn = $1; quote_all_metas($extn); my $newpat = $p1."..".$extn; if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) { $p1 = $newpat; next; } } last; } push (@ret, $p1); } @ret; } sub ensure_reqpatlength { my @ret = @_; if ($opt{reqpatlength}) { @ret = grep { length($_) >= $opt{reqpatlength} } @ret; return () unless @ret; } return @ret; } sub generate_rule_name { my $str = shift; use Digest::SHA1 qw(sha1_base64); $str = sha1_base64($str); $str =~ s/^(.{6}).*$/$1/gs; $str =~ tr/a-z/A-Z/; $str =~ s/[^A-Z0-9]/_/gs; return $str; } # --------------------------------------------------------------------------- # used by all phases sub logmsg { warn "".(scalar localtime time).": $_[0]\n"; }