#!/usr/bin/perl use warnings; use strict; use File::Basename; my %opt = (); $opt{percents} = 1; # --------------------------------------------------------------------------- my $date_lut = create_date_lookup_table(); my $pairs = { }; foreach my $f (@ARGV) { my ($class, $who, $daterev); if ($f =~ m,LOGS\.\S+?-(ham|nonspam|spam)-([^\.]+)\.([^\.]+)\.log,) { # LOGS.all-spam-bb-jhardin.20090714-r793817-n.log.gz ($class, $who, $daterev) = ($1, $2, $3); } elsif ($f =~ m,(ham|nonspam|spam)-([^\.]+)\.([^\.]+)\.log,) { # LOGS.all-spam-bb-jhardin.20090714-r793817-n.log.gz ($class, $who, $daterev) = ($1, $2, $3); } elsif ($f =~ m,(ham|nonspam|spam)-([^\.]+)\.log,) { # ham-jm.log ($class, $who) = ($1, $2); } elsif ($f =~ m,(ham|nonspam|spam),) { ($class) = ($1); $who = 'unknown'; } else { die "cannot parse filename: $f\n"; } $class = 'ham' if $class eq 'nonspam'; push @{$pairs->{$who}}, [ $f, $class ]; } my $byuser = {}; my $total_counts = {}; foreach my $who (keys %{$pairs}) { my $buckets = {}; foreach my $file (@{$pairs->{$who}}) { my ($f, $class) = @{$file}; load_log($buckets, $total_counts, $f, $class, $who); } $byuser->{$who}->{buckets} = $buckets; } my $all_tspam = 0; my $all_tham = 0; foreach my $who (sort keys %{$byuser}) { report($byuser->{$who}->{buckets}, $total_counts, $who); } final_report($total_counts); exit; # --------------------------------------------------------------------------- sub load_log { my ($buckets, $total_counts, $f, $class, $who) = @_; my ($caught, $score, $restofline); if ($f =~ /\.gz$/) { open (IN, "gunzip -cd $f|") or die "cannot read $f"; } else { open (IN, "<$f") or die "cannot read $f"; } while () { ($caught, $score, $restofline) = split(' ', $_, 3); next unless ($caught =~ /^[Y\.]$/ && $restofline); next unless ($restofline =~ /(?: |,)time=(\d+)(?:\D|$)/); my $t = $1; my $tbucket = time_to_bucket($t); if (!exists $buckets->{$tbucket}) { $buckets->{$tbucket} ||= { }; } if (!exists $buckets->{$tbucket}->{$class}) { $buckets->{$tbucket}->{$class} = { count => 0, range_lo => undef, range_hi => undef, }; } $total_counts->{$class}++; my $b = $buckets->{$tbucket}->{$class}; $b->{count}++; update_range_lo(\$b->{range_lo}, $score); update_range_hi(\$b->{range_hi}, $score); } close IN; } # --------------------------------------------------------------------------- # bb-jhardin Spam messages Score range Ham messages Score range # in 2009-06 39 (0%) [0,29] 0 # in 2009-07 8 (0%) [1,24] 2 (0%) [1,4] # TOTAL: 73 (0%) [0,29] 2 (0%) [1,4] sub report { my ($buckets, $total_counts, $who) = @_; printf "%-16s %-15s %-14s %-15s %-14s\n", $who, "Spam messages", "Score range", "Ham messages", "Score range"; my $tspam = 0; my $tham = 0; my ($trslo, $trshi, $trhlo, $trhhi); foreach my $tbucket (sort keys %{$buckets}) { my $buck = $buckets->{$tbucket}; my $nspam = $buck->{spam}->{count} || 0; my $nham = $buck->{ham}->{count} || 0; printf "%-16s %7s %6s %-14s %7s %6s %-14s\n", " in $tbucket", $nspam, as_percent($nspam, $total_counts->{spam}), format_score_range($buck->{spam}->{range_lo}, $buck->{spam}->{range_hi}), $nham, as_percent($nham, $total_counts->{ham}), format_score_range($buck->{ham}->{range_lo}, $buck->{ham}->{range_hi}); $tspam += $nspam; $tham += $nham; update_range_lo(\$trslo, $buck->{spam}->{range_lo}); update_range_hi(\$trshi, $buck->{spam}->{range_hi}); update_range_lo(\$trhlo, $buck->{ham}->{range_lo}); update_range_hi(\$trhhi, $buck->{ham}->{range_hi}); } printf "%-16s %7s %6s %-14s %7s %6s %-14s\n", " TOTAL:", $tspam, as_percent($tspam, $total_counts->{spam}), format_score_range($trslo, $trshi), $tham, as_percent($tham, $total_counts->{ham}), format_score_range($trhlo, $trhhi); $all_tspam += $tspam; $all_tham += $tham; print "\n"; } # --------------------------------------------------------------------------- sub final_report { my ($total_counts) = @_; printf "%-16s %7s %6s %-14s %7s %6s %-14s\n", "OVERALL TOTAL:", $all_tspam, '', '', $all_tham, '', ''; } # --------------------------------------------------------------------------- use Time::Local; sub create_date_lookup_table { my ($sec,$min,$hour,$mday,$cmon,$cyear,$x) = gmtime time; my @month_starts = (); my ($year, $mon); for ($year = $cyear; $year >= 70; $year--) { for ($mon = 11; $mon >= 0; $mon--) { next if ($year == $cyear && $mon > $cmon); # in the future if ($year < $cyear-1 || ($year==$cyear-1 && $mon < $cmon)) { # just record January 1 for times over a year ago next unless ($mon == 0); push @month_starts, [ timegm(0,0,0,1,$mon,$year), $year+1900 ]; } else { push @month_starts, [ timegm(0,0,0,1,$mon,$year), sprintf("%04d-%02d", $year+1900, $mon+1) ]; } } } return \@month_starts; } sub time_to_bucket { my ($t) = @_; # could binary-search here, but the win is probably not worth it foreach my $pair (@{$date_lut}) { if ($pair->[0] < $t) { return $pair->[1]; } } return "1970"; } # --------------------------------------------------------------------------- sub as_percent { my ($num, $total) = @_; if (!$opt{percents} || !$num) { return ''; } if (!$total) { return '(100%)'; } return sprintf("(%d%%)", (($num||0) *100.0) / $total); } # --------------------------------------------------------------------------- sub format_score_range { my ($rlo, $rhi) = @_; if (!defined $rlo && !defined $rhi) { return ''; } if (!defined $rlo) { $rlo = ''; } if (!defined $rhi) { $rhi = ''; } return "[$rlo,$rhi]"; } # --------------------------------------------------------------------------- sub update_range_lo { my ($rloref, $score) = @_; return unless defined $score; if (!defined $$rloref || $score < $$rloref) { $$rloref = $score; } } # --------------------------------------------------------------------------- sub update_range_hi { my ($rhiref, $score) = @_; return unless defined $score; if (!defined $$rhiref || $score > $$rhiref) { $$rhiref = $score; } }