#!/usr/bin/perl -w
#
# Converts per-message rule-hit logs and the current rule scores into the
# data files and C declarations consumed by the GA.
#
# Usage: <this script> [rules-dir | rules-file]
#
# Inputs:
#   spam.log, nonspam.log  per-message hit logs, read from the current dir
#   <rules-dir>/[0-9]*.cf  rule and score definitions (default dir: ../rules);
#                          a single file may be given instead of a directory
#   tmp/ranges.data        per-test score ranges and mutability flags
#                          (built with "make tmp/ranges.data" if missing)
#
# Outputs:
#   tmp/scores.data, tmp/scores.h, tmp/tests.data, tmp/tests.h

use strict;
use warnings;

my $argcffile = shift @ARGV;

# Tests that read_ranges() always marks as non-mutatable, whatever
# tmp/ranges.data says about them.
my @unmutated_tests = qw(
  USER_IN_ALL_SPAM_TO USER_IN_MORE_SPAM_TO USER_IN_WHITELIST_TO
  A_FROM_IN_AUTO_WLIST USER_IN_WHITELIST USER_IN_BLACKLIST
  A_HREF_TO_IP ANOTHER_NET_AD BRAND_NEW_PAGER
  BUGGY_CGI_DE BUGGY_CGI_DE_2 BUGGY_CGI_DE_3 BUGGY_CGI_ES BUGGY_CGI_PT_2
  CLICKSFORMONEY_NET DCC_CHECK DIFF_C_PATCH EGP_HTML_BANNER EMAIL_HARVEST
  EU_200_32_CE E_WEBHOSTCENTRAL_URL
  EXCUSE_5 EXCUSE_9 EXCUSE_ES_02 EXCUSE_ES_03 EXCUSE_ES_04
  FREEMEGS_URL FREEWEBCO_NET_URL FREEWEBHOSTINGCENTRAL
  FROM_FORGED_HOTMAIL FROM_UGETMORE HUNZA_DIET_BREAD INTERNET_TERROR_RANT
  JUST_MAILED_PAGE LASER_PRINTER LONG_NUMERIC_HTTP_ADDR MAIL_IN_ORDER_FORM
  NIGERIAN_SCAM NIGERIAN_SCAM_3 NIGERIAN_SCAM_8 NO_MX_FOR_FROM
  OE_PI PORN_2 PRINT_OUT_AND_FAX Q_FOR_SELLER RAZOR_CHECK
  RCVD_IN_BL_SPAMCOP_NET RCVD_IN_DSBL RCVD_IN_DUL RCVD_IN_MULTIHOP_DSBL
  RCVD_IN_ORBS RCVD_IN_OSIRUSOFT_COM RCVD_IN_RBL RCVD_IN_RELAYS_ORDB_ORG
  RCVD_IN_RFCI RCVD_IN_RSS RCVD_IN_UNCONFIRMED_DSBL RCVD_IN_VISI
  REMOVE_ES_01 REMOVE_ES_02 REMOVE_ES_04
  SEXY_PICS SHOES_GUY SPAM_FORM_INPUT SPAM_PHRASES_100 STAINLESS_STEEL
  TO_INVESTORS UNDESIRED_LANGUAGE_BODY
  USER_IN_ALL_SPAM_TO USER_IN_BLACKLIST USER_IN_MORE_SPAM_TO
  USER_IN_WHITELIST_TO
  WEB4PORNO_URL X_OSIRU_SPAM_SRC X_OSIRU_SPAMWARE_SITE X_UIDL_SPAMSIGN
  YR_MEMBERSHIP_EXCH
);

# State shared by the subs below.
my %is_spam   = ();      # message index => 1 (spam.log) or 0 (nonspam.log)
my %tests_hit = ();      # message index => array ref of test names hit
my $scores;              # hash ref: test name => score

my %is_mutatable  = ();  # test name => 0 or 1
my %range_lo      = ();  # test name => lower score bound
my %range_hi      = ();  # test name => upper score bound
my %score_c_index = ();  # test name => index written to tmp/scores.data

readscores();

print "Reading per-message hit stat logs and scores...\n";
my $total;
my $totspam;
my $totnonspam;
readlogs();
read_ranges();

print "Writing logs and current scores as C code...\n";
writescores_c();
writetests_c();
exit 0;


# Reads spam.log and nonspam.log and fills %tests_hit and %is_spam, keyed by
# a running message index, plus the $total, $totspam and $totnonspam counts.
# Lines that do not match the expected layout are reported and skipped, as
# are test names with no entry in $scores.  Dies if a log cannot be opened.
sub readlogs {
  my $count = $totspam = $totnonspam = 0;

  foreach my $file ("spam.log", "nonspam.log") {
    open (my $in, '<', $file) or die "cannot read $file: $!\n";

    while (my $line = <$in>) {
      # Layout: one character, a (possibly negative) number, one
      # whitespace-free field, then the comma-separated test names.
      if ($line !~ /^.\s+[-\d]+\s+\S+\s*(.*)/) {
        warn "bad line: $line";
        next;
      }
      my $hitlist = $1;
      $hitlist =~ s/\s+$//;

      my @tests = ();
      foreach my $tst (split (/,/, $hitlist)) {
        next if ($tst eq '');        # tolerate doubled or stray commas
        if (!defined $scores->{$tst}) {
          warn "unknown test in $file, ignored: $tst\n";
          next;
        }
        push (@tests, $tst);
      }

      $tests_hit{$count} = \@tests;

      if ($file eq "spam.log") {
        $totspam++;
        $is_spam{$count} = 1;
      } else {
        $totnonspam++;
        $is_spam{$count} = 0;
      }
      $count++;
    }
    close $in;
  }

  $total = $count;
}


# Fills $scores from the rule files.  $argcffile (default "../rules") is
# either a directory, in which case every [0-9]*.cf file in it is read, or a
# single file.  A header/body/rawbody/full/uri definition gives its test a
# score of 1 unless it already has a true-valued score; a "score" line sets
# the score outright.  Files that cannot be opened are reported and skipped.
sub readscores {
  $scores = { };

  if (!defined $argcffile) { $argcffile = "../rules"; }

  my @files;
  if (-d $argcffile) {
    @files = glob ("$argcffile/[0-9]*.cf");
  } else {
    @files = ($argcffile);
  }

  foreach my $cffile (@files) {
    print "Reading scores from \"$cffile\"...\n";

    my $in;
    if (!open ($in, '<', $cffile)) {
      warn "cannot read $cffile: $!\n";
      next;
    }

    while (my $line = <$in>) {
      $line =~ s/#.*$//g;            # strip comments
      $line =~ s/^\s+//;
      $line =~ s/\s+$//;

      if ($line =~ /^(header|body|rawbody|full|uri)\s+(\S+)\s+/) {
        $scores->{$2} ||= 1;
      } elsif ($line =~ /^score\s+(\S+)\s+(.+)$/) {
        $scores->{$1} = $2;
      }
    }
    close $in;
  }
}


# Writes tmp/scores.data (one record per test, in sorted name order: index,
# name, score, mutability flag, range bounds) and tmp/scores.h (C array
# declarations sized to the number of tests).  Records each test's index in
# %score_c_index for writetests_c().  Dies if either file cannot be written.
sub writescores_c {
  # Do not pad $size: extra genes make the GA take a lot longer and mess up
  # the scores output - crh
  my $size = (scalar keys %{$scores});

  open (my $dat, '>', 'tmp/scores.data')
        or die "cannot write tmp/scores.data: $!\n";
  print {$dat} "N$size\n";

  my $count = 0;
  foreach my $name (sort keys %{$scores}) {
    if (!defined $is_mutatable{$name}) {
      $is_mutatable{$name} = 1;
    } else {
      # Explicitly non-mutatable: pin the range to the current score.
      $range_lo{$name} = $range_hi{$name} = $scores->{$name};
    }

    # NOTE(review): ||= also replaces a bound that is legitimately 0, not
    # only a missing one - confirm that this is intended.
    $range_lo{$name} ||= 0.1;
    $range_hi{$name} ||= 1.5;

    print {$dat} ".".$count."\n";
    print {$dat} "n".$name."\n";
    print {$dat} "b".$scores->{$name}."\n";
    print {$dat} "m".$is_mutatable{$name}."\n";
    print {$dat} "l".$range_lo{$name}."\n";
    print {$dat} "h".$range_hi{$name}."\n";

    $score_c_index{$name} = $count;
    $count++;
  }
  close $dat or die "cannot write tmp/scores.data: $!\n";

  open (my $out, '>', 'tmp/scores.h')
        or die "cannot write tmp/scores.h: $!\n";
  print {$out} <<"EOT";

int num_scores;
unsigned char is_mutatable[$size]; /* er, is_mutable I think ;) */
double range_lo[$size];
double range_hi[$size];
double bestscores[$size];
double scores[$size];
char *score_names[$size];

/* loadscores() is defined in tests.h */

EOT
  close $out or die "cannot write tmp/scores.h: $!\n";
}


# Writes tmp/tests.h (message counts, array declarations and the C loader
# functions copied from the __DATA__ section) and tmp/tests.data (per
# message: index, number of tests hit, spam flag, then one index per test
# hit).  Must run after writescores_c(), which fills %score_c_index.
sub writetests_c {
  # Figure out max hits per message; the arrays get one spare slot.
  my $max_hits_per_msg = 0;
  for (my $msg = 0; $msg < $total; $msg++) {
    my $hits = scalar (@{ $tests_hit{$msg} }) + 1;
    if ($hits > $max_hits_per_msg) { $max_hits_per_msg = $hits; }
  }

  open (my $top, '>', 'tmp/tests.h')
        or die "cannot write tmp/tests.h: $!\n";
  print {$top} <<"EOT";

int num_tests = $total;
int num_spam = $totspam;
int num_nonspam = $totnonspam;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$total];
unsigned char is_spam[$total];
unsigned short tests_hit[$total][$max_hits_per_msg];

EOT
  print {$top} join ('', <DATA>);
  close $top or die "cannot write tmp/tests.h: $!\n";

  open (my $dat, '>', 'tmp/tests.data')
        or die "cannot write tmp/tests.data: $!\n";

  for (my $msg = 0; $msg < $total; $msg++) {
    print {$dat} ".".$msg."\n";

    my $out = "s".$is_spam{$msg}."\n";
    my $num_tests_hit = 0;

    foreach my $test (@{ $tests_hit{$msg} }) {
      next if ($test eq '');

      if (!defined $score_c_index{$test}) {
        # Skip it: a "t" line with no number would be read as index 0.
        warn "test with no C index: $test\n";
        next;
      }

      $num_tests_hit++;
      $out .= "t".$score_c_index{$test}."\n";

      if ($num_tests_hit >= $max_hits_per_msg) {
        die "Need to increase \$max_hits_per_msg";
      }
    }

    # The "n" record must precede the "t" records: the C loader resets its
    # per-message slot counter when it sees "n".
    print {$dat} "n".$num_tests_hit."\n".$out;
  }
  close $dat or die "cannot write tmp/tests.data: $!\n";
}


# Resets %is_mutatable, marks every test in @unmutated_tests as
# non-mutatable, then reads "<lo> <hi> <mutatable> <test>" lines from
# tmp/ranges.data into %range_lo and %range_hi.  A zero mutatable flag also
# marks the test as non-mutatable.  Runs "make tmp/ranges.data" first if the
# file does not exist, and dies if it still cannot be opened.
sub read_ranges {
  %is_mutatable = ();
  foreach my $t (@unmutated_tests) {
    $is_mutatable{$t} = 0;
  }

  if (!-f 'tmp/ranges.data') {
    system ("make tmp/ranges.data");
  }

  # read ranges, and mutatableness, from ranges.data.
  open (my $in, '<', 'tmp/ranges.data')
        or die "cannot read tmp/ranges.data: $!\n";

  while (my $line = <$in>) {
    $line =~ /^(\S+) (\S+) (\d+) (\S+)$/ or next;

    my $t = $4;
    $range_lo{$t} = $1+0;
    $range_hi{$t} = $2+0;

    my $mut = $3+0;
    if (!$mut) { $is_mutatable{$t} = 0; }
  }
  close $in;
}

__DATA__

void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];
  int file = 0;
  int tnum = 0;

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);

    if (cmd == '.') {
      file = arg;
    } else if (cmd == 'n') {
      tnum = 0;
      num_tests_hit[file] = arg;
    } else if (cmd == 's') {
      is_spam[file] = arg;
    } else if (cmd == 't') {
      tests_hit[file][tnum] = arg;
      tnum++;
    }
  }
  fclose(fin);

  printf ("Read test results for %d messages.\n", file+1);
}

void loadscores (void) {
  FILE *fin = fopen ("tmp/scores.data", "r");
  char buf[256];
  int snum = 0;

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argf;
    char *str, *white;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argf = strtod (buf+1, NULL);
    str = buf+1;
    while ((white = strchr (str, '\n')) != NULL) {
      *white = '\0';
    }

    if (cmd == '.') {
      snum = arg;
    } else if (cmd == 'N') {
      num_scores = arg;
    } else if (cmd == 'b') {
      bestscores[snum] = argf;
    } else if (cmd == 'l') {
      range_lo[snum] = argf;
    } else if (cmd == 'h') {
      range_hi[snum] = argf;
    } else if (cmd == 'n') {
      score_names[snum] = strdup (str); /* leaky leak ;) */
    } else if (cmd == 'm') {
      is_mutatable[snum] = arg;
    }
  }
  fclose(fin);

  printf ("Read scores for %d tests.\n", num_scores);
}