#!/usr/bin/perl -w # # rule-hits-over-time - produce graphs of rule hits over time, using gnuplot # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # use GD; use Statistics::DEA; use strict; use warnings; use Fcntl; use Getopt::Long; use SDBM_File; sub usage { die q{ usage: rule-hits-over-time [options] --rule rulename log1 [log2 ...] --rule=rulename specify rule to map --period=secs specify period (default: 1 day) --ignore_older=days ignore hits older than N days (default: 0 = none) --scale_period=n scale period up to N items of data, 0=no scaling (default: 0) --size_x=pixels width of output graphs, in pixels (def: 800) --size_y=pixels height of ONE of the output graphs, in pixels (default: 400) --cgi CGI output, to stdout with HTTP headers --text text output only }; } use vars qw( $opt_rule $opt_size_x $opt_size_y $opt_text $opt_cgi $opt_period $opt_scale_period $opt_ignore_older $opt_debug ); GetOptions( 'rule=s', 'size_x=i', 'size_y=i', 'text', 'cgi', 'scale_period=i', 'ignore_older=i', 'period=i', 'debug' ) or usage(); usage() unless $opt_rule; my $DEBUG_TMPDIR = $opt_debug; # keep the tmpdir around after exiting, for debug # $DEBUG_TMPDIR = 1; # fix PATHs for sucky Solaris compatibility. $ENV{PATH} = "/local/gnuplot-4.0.0/bin:/opt/sfw/bin:".$ENV{PATH}; $ENV{LD_LIBRARY_PATH} .= ":/local/gd-2.0.33/lib"; my $rule_re = qr/[, ]${opt_rule}[, ]/; # my $period = $opt_period || (24 * 60 * 60 * 1); my $period = $opt_period || 3600; my $graph_x = $opt_size_x || 800; my $graph_y = $opt_size_y || 400; my $fname_counter = 1; my %graph_png_data = (); my %allbuckets = (); my %allresults = (); my @allfiles = (); my $graph_times = []; my $graph_data = []; my $this_file_results; my $lastbucket; my $nextbucket; my $seen_y; my $seen_n; my $tmpdir = "/tmp/rulehits.$$"; if ($DEBUG_TMPDIR) { $tmpdir = "/tmp/rulehits.tmp"; system("rm -rf $tmpdir"); } mkdir ($tmpdir) or die "collided on $tmpdir"; my $outdir = "."; if ($opt_cgi) { $outdir = $tmpdir; } my $file_sets = [ ]; # split into ham and spam $file_sets = [ [ 'TITLE:hits in spam' ], [ 'TITLE:hits in ham' ] ]; foreach my $file (@ARGV) { if ($file =~ /\bham\b/) { push @{$file_sets->[1]}, $file; } else { push @{$file_sets->[0]}, $file; } } foreach my $set (@{$file_sets}) { @allfiles = (); %allbuckets = (); %allresults = (); my $settitle = ''; if ($set->[0] =~ /^TITLE:(.*)$/) { $settitle = $1; shift(@{$set}); } create_gp("$opt_rule $settitle"); foreach my $file (@{$set}) { if (!$opt_text) { my $title = $file; $title =~ s/^.*\///; } push (@allfiles, $file); if (1) { # use an on-disk file to avoid massive VM usage for this hash # on huge datasets unlink("$tmpdir/graph.tmp.dir"); unlink("$tmpdir/graph.tmp.pag"); tie (%{$allresults{$file}}, 'SDBM_File', "$tmpdir/graph.tmp", O_RDWR|O_CREAT, 0600) or die "tie failed: $!"; } else { %{$allresults{$file}} = (); } $this_file_results = $allresults{$file}; read_logs($file); $graph_times = []; $graph_data = []; summarise(); } $opt_scale_period and collapse_periods(); plot_gp(); } my $format = "gif"; { my $both = GD::Image->new($graph_x, 15 + ($graph_y * 2)); my $file01 = GD::Image->newFromPngData($graph_png_data{"file01"}, 1); my $file02 = GD::Image->newFromPngData($graph_png_data{"file02"}, 1); if (!$file01 || !$file02) { warn "bad input. leaving graph blank"; } else { $both->copy($file01, 0, 5, 0, 0, $graph_x-1, $graph_y-1); $both->copy($file02, 0, 10 + $graph_y, 0, 0, $graph_x-1, $graph_y-1); } if ($opt_cgi) { use CGI qw(:standard); print header("image/$format"); binmode STDOUT; print STDOUT $both->$format(); } else { open(IMG, ">both.$format") or die $!; binmode IMG; print IMG $both->$format(); close IMG; } $both->gif(); } if (!$DEBUG_TMPDIR) { unlink(<$tmpdir/*.*>); rmdir $tmpdir; } else { system ("ls -l $tmpdir/*.* 1>&2"); } exit; sub summarise { foreach my $bucket (sort keys %allbuckets) { my @cols = (); foreach my $file (@allfiles) { my $res = $allresults{$file}->{$bucket}; my $sy; my $sn; if (!$res) { $sn = $sy = -1; } elsif ($res !~ /^y(\d+)n(\d+)$/) { warn "bad results: $res for $file $bucket"; next; } else { $sy = $1; $sn = $2; } if (!defined $sy && !defined $sn) { $sn = $sy = -1; } elsif (!defined $sy || !defined $sn) { # assert: enforce both < 0, if either is warn "oops? sy=$sy sn=$sn, should be both < 0"; $sn = $sy = -1; } if (($sy+$sn) > 0) { push @cols, ($sy / ($sy + $sn)) * 100.0; } else { push @cols, -1; } } if ($opt_text) { print $bucket," ".join(' ',@cols)."\n"; } else { push (@{$graph_times}, $bucket); push (@{$graph_data}, \@cols); } } } sub collapse_periods { while (scalar @{$graph_data} > $opt_scale_period) { my $num_files = (scalar @allfiles - 1); my $newtimes = [ ]; my $newdata = [ ]; my $i; for ($i = 0; $i < (scalar @{$graph_data}); $i += 2) { $newtimes->[$i >> 1] = $graph_times->[$i]; foreach my $j (0 .. $num_files) { my $v1 = $graph_data->[$i]->[$j]; my $v2 = $graph_data->[$i+1]->[$j]; if (!defined $v2) { $v2 = -1; } if ($v1 >= 0.0 && $v2 >= 0.0) { # both are valid. take their mean $v1 = ($v1 + $v2) / 2.0; } elsif ($v2 >= 0.0) { # only one is valid; use it and ignore the invalid one $v1 = $v2; } else { # we're good, v1 is the valid one anyway } $newdata->[$i >> 1]->[$j] = $v1; } } @{$graph_times} = @{$newtimes}; @{$graph_data} = @{$newdata}; $period *= 2; } } sub read_logs { my $file = shift; # limit to a range from [4 years ago, today] to avoid OOM craziness # from corrupt input # if ($opt_ignore_older <= 0) { $opt_ignore_older = 365 * 4; } my $limit_hi = time; my $limit_lo = $limit_hi - (24*60*60*$opt_ignore_older); $lastbucket = undef; $nextbucket = undef; $seen_y = 0; $seen_n = 0; if ($file =~ /\.gz$/) { open (IN, "gunzip -cd '$file'|") or die "cannot gunzip $file"; } else { open (IN, "<$file") or die "cannot read $file"; } while () { next if /^#/; my $t; /\btime=(\d+),/ and $t = $1; next unless $t; if ($t < $limit_lo || $t > $limit_hi) { warn "ignoring out-of-range time $t (limit: $limit_lo < t < $limit_hi)"; next; } my $found = ($_ =~ $rule_re); if (!defined $lastbucket) { $lastbucket = $t - ($t % $period); $nextbucket = $lastbucket + $period; } if ($t < $nextbucket) { if ($found) { $seen_y++; } else { $seen_n++; } } else { while ($t >= $nextbucket) { completeline(); $lastbucket = $nextbucket; $nextbucket += $period; } } } close IN; completeline(); } sub completeline { return unless ($lastbucket); $allbuckets{$lastbucket} = undef; $this_file_results->{$lastbucket} = "y".$seen_y."n".$seen_n; $seen_y = 0; $seen_n = 0; } sub create_gp { my $title = shift; my $mailtype = 'mail'; if ($title =~ /\b(ham|spam)\b/) { $mailtype = $1; } my $y_label = "\%age of $mailtype in period"; $SIG{PIPE} = sub { die "unexpected SIGPIPE received!"; }; open (GP, "| gnuplot - > $tmpdir/gp.log 2>&1") or die "cannot run gnuplot"; # eye-candy my $niceperiod = "$period secs"; if ($period % (24*60*60) == 0) { $niceperiod = ($period / (24*60*60))." days"; } # (NOTE: -1% hitrate means no data for that time period)' print GP qq{ set terminal png small \\ interlace size $graph_x,$graph_y \\ xffffff x444444 x33cc00 \\ xff3300 x0000cc x99cc00 xff9900 \\ xcccc00 x333333 x999999 x9500d3 set out '$tmpdir/out.png' set grid back xtics ytics set xlabel 'Time, in blocks of $niceperiod' set xdata time set timefmt "%Y-%m-%d-%H" set format x "%04Y%02m%02d" set ylabel '$y_label' set yrange [0:*] set title "$title" set key left top Left nobox }; } sub fmt_time_t { my $tt = shift; use POSIX qw(strftime); return strftime "%Y-%m-%d-%H", gmtime($tt); } sub plot_gp { my $num_files = (scalar @allfiles - 1); my $num_datapoints = (scalar @{$graph_data} - 1); # specify a number of alphas for Statistics::DEA. Right now, # the graph is pretty unreadable with more than one. my $dea_alphas = [ 0.9 ]; my $num_alphas = (scalar @{$dea_alphas} - 1); my $times = [ ]; my $avgs = [ ]; my $graphname = sprintf("file%02d", $fname_counter++); if (!$opt_text) { if (@{$graph_data}) { my $deas = (); foreach my $i (0 .. $num_files) { foreach my $a (0 .. $num_alphas) { $deas->[$a]->[$i] = Statistics::DEA->new($dea_alphas->[$a], $period * 3); } } foreach my $j (0 .. $num_datapoints) { my (@datas) = @{$graph_data->[$j]}; $times->[$j] = fmt_time_t($graph_times->[$j]); foreach my $i (0 .. $num_files) { my $d = $datas[$i]; foreach my $a (0 .. $num_alphas) { if ($d >= 0) { $deas->[$a]->[$i]->update($d, $j); } my $avg; eval { # this can die if it hasn't received enough data! # so trap with an eval. $avg = $deas->[$a]->[$i]->average(); }; $avgs->[$a]->[$j]->[$i] = (defined $avg) ? $avg : -1; } } } } # write the data plotfile open (DATA, ">$tmpdir/plot.$graphname.data") or die; if (@{$graph_data}) { foreach my $j (0 .. $num_datapoints) { print DATA $times->[$j]," ",join(' ', @{$graph_data->[$j]}),"\n"; } } else { # a fake datapoint so gnuplot doesn't puke on us print DATA fmt_time_t(0)," 0 0\n"; } close DATA or die; # write the avgs plotfiles foreach my $a (0 .. $num_alphas) { open (DATA, ">$tmpdir/avgs$a.$graphname.data") or die; if (@{$graph_data}) { foreach my $j (0 .. $num_datapoints) { print DATA $times->[$j]," ", defined $avgs->[$a]->[$j] ? join ' ', @{$avgs->[$a]->[$j]} : '0', "\n"; } } else { # a fake datapoint so gnuplot doesn't puke on us print DATA fmt_time_t(0)," 0 0\n"; } close DATA or die; } # and the commands file my @plot = (); foreach my $i (0 .. $num_files) { my $legend = filename_to_legend ($allfiles[$i]); my $style = $i+1; my $col = $i+2; push @plot, qq{ '$tmpdir/plot.$graphname.data' using }. qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }. # note: using "lt $style" gives us points in the same # colour as the lines in the smoothed graph below qq{ with points lt $style pt $style ps 1 }. qq{ title '$legend' }; foreach my $a (0 .. $num_alphas) { push @plot, qq{ '$tmpdir/avgs$a.$graphname.data' using }. qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }. qq{ with lines lt $style lw 3 }. qq{ title ' (DEA a=$dea_alphas->[$a])' }; } } print GP "plot ",join(", ", @plot), "\n"; close GP or warn "gnuplot command exited: $?"; $graph_png_data{$graphname} = readfile("$tmpdir/out.png"); } } sub readfile { open (IN, "<$_[0]") or die "cannot read $_[0]"; binmode IN; my $str = join('',); close IN; return $str; } sub filename_to_legend { my $f = shift; $f =~ s/^.*\///; $f =~ s/LOGS\.all-//; $f =~ s/\.log\.\S+$//; return $f; }