#!/usr/bin/perl
#
# Turns two-word phrase counts from a spam corpus and a non-spam corpus into
# a ranked list of "spamphrase <score> <phrase>" lines on standard output.
#
# Four input files are read, in this order:
#   1. a file whose number of lines is used as the spam size
#   2. a file whose number of lines is used as the non-spam size
#   3. spam phrase counts, one "COUNT WORD WORD" entry per line
#   4. non-spam phrase counts, same format
#
# NOTE(review): in the source as received, the text between `open (IN, "`
# and `) {` was missing for all four files, so the original hard-coded file
# names could not be recovered.  The paths are therefore taken from the
# command line here -- TODO: restore the original names (or give these
# defaults) if existing callers run the script without arguments.

use strict;
use warnings;

# Upper bound on the number of phrases emitted.
my $HOW_MANY = 1000;

# When true, phrases seen only in non-spam are given negative scores and
# half of the output budget is reserved for them.  Nothing in the original
# script ever set this flag, so it is declared false here to keep that
# behaviour explicit.
my $count_negatives = 0;

my $freqs           = { };    # phrase => running score
my $only_in_spam    = { };    # phrases never seen in the non-spam counts
my $only_in_nonspam = { };    # phrases never seen in the spam counts

@ARGV == 4
    or die "usage: $0 SPAM_LINES NONSPAM_LINES SPAM_COUNTS NONSPAM_COUNTS\n";
my ($spam_lines_file, $nonspam_lines_file,
    $spam_counts_file, $nonspam_counts_file) = @ARGV;

my $spamlines    = count_lines($spam_lines_file);
my $nonspamlines = count_lines($nonspam_lines_file);

# Scale factor applied to non-spam counts; the +1 avoids dividing by zero
# when the non-spam file is empty.
my $nonspam_bias = $spamlines / ($nonspamlines + 1);

# Pass 1: accumulate spam counts.  Every phrase first seen here starts out
# marked as spam-only.
read_phrase_counts($spam_counts_file, sub {
    my ($c, $w) = @_;
    if (defined $freqs->{$w}) {
        $freqs->{$w} += $c;
    }
    else {
        $freqs->{$w} = $c;
        $only_in_spam->{$w} = 1;
    }
});

# Pass 2: subtract the biased non-spam counts.  A phrase that also occurs in
# non-spam loses its spam-only mark; a phrase new to this pass is recorded
# only when negatives are being counted.
read_phrase_counts($nonspam_counts_file, sub {
    my ($c, $w) = @_;
    if (defined $freqs->{$w}) {
        $freqs->{$w} -= ($c * $nonspam_bias);
        delete $only_in_spam->{$w};
    }
    elsif ($count_negatives) {
        $freqs->{$w} = (-$c) * 10;
        $only_in_nonspam->{$w} = 1;
    }
});

# Phrases that never appeared in non-spam get a tenfold boost.
foreach my $w (keys %{$only_in_spam}) {
    $freqs->{$w} *= 10;
}

summarise();
exit;

# Prints the result list to standard output: one "spamphrase-highest-score"
# line carrying the top score, followed by "spamphrase SCORE PHRASE" lines in
# descending score order.  Scores below 3 are not printed.  Reads the
# file-level $freqs, $only_in_nonspam, $HOW_MANY and $count_negatives; takes
# no arguments and returns nothing.
sub summarise {
    # With negatives enabled the output budget is split evenly between the
    # positive and the negative list.
    my $halflim = $count_negatives ? $HOW_MANY / 2 : $HOW_MANY;

    # positives first
    my $num = 0;
    my $highest;
    foreach my $w (sort { $freqs->{$b} <=> $freqs->{$a} } keys %{$freqs}) {
        # The list is sorted descending, so nothing after the first score
        # below 3 can qualify.
        last if $freqs->{$w} < 3;

        if (!defined $highest) {
            printf("spamphrase-highest-score %d\n", $freqs->{$w});
            $highest = $freqs->{$w};
        }
        printf("spamphrase %d %s\n", $freqs->{$w}, $w);

        $num++;
        # >= rather than == so the cap still applies if the limit is ever
        # fractional (odd $HOW_MANY with negatives enabled).
        last if $num >= $halflim;
    }

    if ($count_negatives) {
        # now negatives
        # NOTE(review): this sort is descending, so the least negative
        # phrases are emitted first; presumably the most negative ones were
        # wanted -- confirm before enabling $count_negatives.
        $num     = 0;
        $halflim = $HOW_MANY / 2;
        foreach my $w (sort { $freqs->{$b} <=> $freqs->{$a} }
                       keys %{$only_in_nonspam}) {
            next if $freqs->{$w} > -3;
            printf("spamphrase %d %s\n", $freqs->{$w}, $w);

            $num++;
            last if $num >= $halflim;
        }
    }

    return;
}

# Returns the number of lines in the file at $path; dies if it cannot be
# opened.
sub count_lines {
    my ($path) = @_;

    open(my $in, '<', $path) or die "cannot open $path: $!\n";
    my $lines = 0;
    while (my $line = <$in>) {
        $lines++;
    }
    close $in;

    return $lines;
}

# Reads "COUNT WORD WORD" lines from the file at $path and calls
# $handler->($count, $phrase) for each one, where $phrase is the two words
# with their single separating whitespace character.  Lines that do not
# match that shape are skipped.  Dies if the file cannot be opened.
sub read_phrase_counts {
    my ($path, $handler) = @_;

    open(my $in, '<', $path) or die "cannot open $path: $!\n";
    while (my $line = <$in>) {
        $line =~ /^\s*(\d+)\s+(\S+\s\S+)$/ or next;
        # Copy the captures before calling out, in case the handler runs a
        # regex of its own.
        my ($count, $phrase) = ($1, $2);
        $handler->($count, $phrase);
    }
    close $in;

    return;
}