#!/usr/bin/perl -w
#
# Derive a permitted score range for each rule from its hit-frequency
# statistics.
#
# usage:  <this-script> [rules-dir [scoreset]] [freqs-file ...]
#         <this-script> -test
#
#   rules-dir   handed to "./parse-rules-for-masses -d" (default: ../rules)
#   scoreset    handed to "./parse-rules-for-masses -s" (default: 0)
#   freqs-file  frequency table(s); read from STDIN when none are given
#
# Every usable input line carries five numeric columns (overall, spam,
# nonspam, S/O ratio, ranking), one column that is skipped, and then the
# rule name.  One line "<lo> <hi> <mutable> <rule>" per rule is written to
# tmp/ranges.data.
#
# With "-test" as the first argument the script only prints the
# ranking -> range mapping for a few sample ratios and exits.

use strict;
use warnings;

# Rule table, keyed by rule name.  Nothing in this script assigns it; it is
# expected to be populated by tmp/rules.pl, which is require'd below.
our %rules;

# (rough) graphic demo of this algorithm:
# 0.0  = -limit [......] 0 ........ limit
# 0.25 = -limit ..[..... 0 .]...... limit
# 0.5  = -limit ....[... 0 ...].... limit
# 0.75 = -limit ......[. 0 .....].. limit
# 1.0  = -limit ........ 0 [......] limit
my $sliding_window_limits = 4.8;  # limits = [-$range, +$range]
my $sliding_window_size   = 5.5;  # scores have this range within limits

# 0.0  = -limit [......] 0 ........ limit
# 0.25 = -limit ....[... 0 ]....... limit
# 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
# 0.75 = -limit .......[ 0 ...].... limit
# 1.0  = -limit ........ 0 [......] limit
my $shrinking_window_lower_base  = 0.00;
my $shrinking_window_lower_range = 1.00;  # *ratio, added to above
my $shrinking_window_size_base   = 1.00;
my $shrinking_window_size_range  = 1.00;  # *ratio, added to above

# Selects which of the two mappings above is applied.
my $use_sliding_window = 0;

my $argcffile = shift @ARGV;
my $scoreset  = shift @ARGV;
$scoreset = 0 if ( !defined $scoreset );

if (defined ($argcffile) && $argcffile eq '-test') {
  # use this to debug the ranking -> score-range mapping:
  for my $rat (0.0, 0.25, 0.5, 0.75, 1.0) {
    my ($lo, $hi);
    if ($use_sliding_window) {
      ($lo, $hi) = sliding_window_ratio_to_range($rat);
    } else {
      ($lo, $hi) = shrinking_window_ratio_to_range($rat);
    }
    warn "test: $rat => [ $lo $hi ]\n";
  }
  exit;
}

# Per-rule statistics collected from the frequency table.
my %freq = ();
my %freq_spam = ();
my %freq_nonspam = ();

# Totals taken from the "(all messages)" line.  They are recorded here but
# not read again anywhere in this script.
my $num_spam;
my $num_nonspam;
my $num_total;

my %mutable_tests = ();
my %ranking = ();
my %soratio = ();
my %is_nice = ();

if (!defined $argcffile) { $argcffile = "../rules"; }

# List-form system() bypasses the shell, so a rules path containing quotes,
# '$' or backticks is passed through verbatim instead of being re-parsed.
system ("./parse-rules-for-masses", "-d", $argcffile, "-s", $scoreset)
  and die "parse-rules-for-masses failed, wait status " . $?;

if (-e "tmp/rules.pl") {
  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
  require "./tmp/rules.pl";
}
else {
  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
}

# --- pass 1: read the frequency table --------------------------------------
while (<>) {
  /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s*$/ or next;

  my $overall = $1+0;
  my $spam = $2+0;
  my $nonspam = $3+0;
  my $soratio = $4+0;
  my $ranking = $5+0;
  my $test = $6;

  if ($test eq '(all messages)') {
    $num_spam = $spam;
    $num_nonspam = $nonspam;
    $num_total = $spam+$nonspam;
    next;
  }
  next if ($test eq '(all messages as %)');

  if (!defined ($rules{$test})) {
    warn "rule $test no longer exists; ignoring\n";
    next;
  }

  $freq{$test} = $overall;
  $freq_spam{$test} = $spam;
  $freq_nonspam{$test} = $nonspam;

  my $tflags = $rules{$test}->{tflags};
  $tflags ||= '';

  # "userconf" rules are always immutable; "net" rules are immutable when
  # the scoreset number is even.
  if ($tflags =~ /\buserconf\b/ ||
      ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ ))
  {
    $mutable_tests{$test} = 0;
  } else {
    $mutable_tests{$test} = 1;
  }

  if ($tflags =~ m/\bnice\b/i) {
    $is_nice{$test} = 1;
  } else {
    $is_nice{$test} = 0;
  }

  if ($overall < 0.01) {        # less than 0.01% of messages were hit
    $mutable_tests{$test} = 0;
    $soratio{$test} = 0.5;
    $ranking{$test} = 0.0;
    $rules{$test}->{score} = 0; # tvd - disable these rules automagically
  } else {
    $soratio{$test} = $soratio;
    $ranking{$test} = $ranking;
  }
}

# --- pass 2: write the ranges ----------------------------------------------

# tmp/ normally exists already (rules.pl was just loaded from it), so only
# try to create it -- and only complain -- when it is really missing.
if ( !-d "tmp" && !mkdir ("tmp", 0755) ) {
  warn "Couldn't create tmp directory!: $!\n";
}

open (my $out, '>', "tmp/ranges.data")
  or die "cannot open tmp/ranges.data for writing: $!";

# Highest ranking first; the rule name breaks ties so that the output order
# does not depend on hash ordering.
foreach my $test (sort { $ranking{$b} <=> $ranking{$a} || $a cmp $b } keys %freq) {
  if (!defined ($rules{$test})) {
    warn "no rule $test";
    printf {$out} "0 0 0 %s\n", $test;
    next;
  }

  my $spam = $freq_spam{$test};
  my $nonspam = $freq_nonspam{$test};
  my $soratio = $soratio{$test};
  my $ranking = $ranking{$test};
  my $mutable = $mutable_tests{$test};

  # The rule name is passed as a printf argument (never spliced into the
  # format) so a '%' in a name cannot be taken for a conversion.
  if (!$mutable || $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd
    printf {$out} "%3.3f %3.3f 0 %s\n",
        $rules{$test}->{score}, $rules{$test}->{score}, $test;
    next;
  }

  # 0.0 = best nice, 1.0 = best nonnice
  if ($is_nice{$test}) {
    $ranking = .5 - ($ranking / 2);
  } else {
    $ranking = .5 + ($ranking / 2);
  }

  my ($lo, $hi);
  if ($use_sliding_window) {
    ($lo, $hi) = sliding_window_ratio_to_range($ranking);
  } else {
    ($lo, $hi) = shrinking_window_ratio_to_range($ranking);
  }

  # tvd
  my $tflags = $rules{$test}->{tflags};
  $tflags ||= '';

  if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4
      $lo *=1.8;
    }
    elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5
      $lo *= 1.5;
    }

    # The closer the S/O ratio is to 0, the closer $hi is pinned to $lo.
    $hi = ($soratio == 0) ? $lo :
          ($soratio <= 0.005 ) ? $lo/1.1 :
          ($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 :
          ($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 :
          0;

    if ( $soratio >= 0.35 ) { # auto-disable bad rules
      ($lo,$hi) = (0,0);
    }
  }
  elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score
      $hi *=1.8;
    }
    elsif ( $soratio >= 0.99 && $spam > 1.0 ) {
      $hi *= 1.5; # let good rules be larger if they want to
    }

    # The closer the S/O ratio is to 1, the closer $lo is pinned to $hi.
    $lo = ($soratio == 1) ? $hi:
          ($soratio >= 0.995 ) ? $hi/4.0 :
          ($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 :
          ($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 :
          0;

    if ( $soratio <= 0.65 ) { # auto-disable bad rules
      ($lo,$hi) = (0,0);
    }
  }
  else { # rule that has bad nice setting
    ($lo,$hi) = (0,0);
  }

  # A zero-width range leaves nothing to vary, so flag the rule immutable.
  $mutable = 0 if ( $hi == $lo );

  printf {$out} "%3.1f %3.1f %s %s\n", $lo, $hi, $mutable, $test;
}

# Buffered write errors only surface at close time, so check it.
close ($out) or die "error closing tmp/ranges.data: $!";
exit;

# sliding_window_ratio_to_range($ratio)
#
# Maps $ratio to a ($lo, $hi) pair using the $sliding_window_* settings:
#   lo = -limits + size * ratio
#   hi = +limits - size * (1 - ratio)
# The pair is swapped if it comes out reversed.  Returns the list ($lo, $hi).
sub sliding_window_ratio_to_range {
  my $ratio = shift;
  my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
  my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
  if ($lo > $hi) { # ???
    ($lo,$hi) = ($hi,$lo);
  }
  return ($lo, $hi);
}

# shrinking_window_ratio_to_range($ratio)
#
# Maps $ratio to a ($lo, $hi) pair using the $shrinking_window_* settings.
# $ratio is rescaled around 0.5 to a magnitude; both the lower bound and
# the width of the range grow with that magnitude.  For ratios below 0.5
# the range is mirrored onto the negative side.  Returns the list
# ($lo, $hi).
sub shrinking_window_ratio_to_range {
  my $ratio = shift;
  my $is_nice = 0;
  my $adjusted = ($ratio -.5) * 2;      # adj [0,1] to [-1,1]
  if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }

  #$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd

  my $lower = $shrinking_window_lower_base + ($shrinking_window_lower_range * $adjusted);
  my $range = $shrinking_window_size_base + ($shrinking_window_size_range * $adjusted);
  my $lo = $lower;
  my $hi = $lower + $range;
  if ($is_nice) {
    # mirror [lo, hi] to [-hi, -lo]
    my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
  }
  if ($lo > $hi) { # ???
    ($lo,$hi) = ($hi,$lo);
  }
  return ($lo, $hi);
}