#!/usr/bin/perl
#
# split-log-into-buckets-cached x:output [y:output2 ...]
#
# Split a mass-check log into several identically-sized buckets, evenly
# taking messages from all checked corpora and preserving comments,
# writing output to the files listed on the line.
# It does this evenly by running through all buckets sequentially
# as each line is read.
#
# Each output file must be listed, and the count of the "n" buckets
# for that file specified; so for example 1 file can contain
# 9 buckets.
#
# This variant operates randomly (as per -random) but caches results.

use strict;
use warnings;

# Parse arguments: "N:file" specs describe outputs (N buckets written to
# file); any other argument is taken as the input log.
my $input;
my @outs = ();
foreach my $arg (@ARGV) {
  if ($arg =~ /^(\d+):(.*)$/) {
    my $c = $1;
    my $out = $2;
    print "Creating $out with $c buckets\n";
    push (@outs, { c => $c, out => $out });
  }
  else {
    print "Reading from $arg\n";
    $input = $arg;
  }
}

die "usage: $0 n:output [m:output2 ...] inputlog\n"
                        unless $input && @outs;

# Cache check: only rebuild if any output is missing or is older than
# the input log (compare mtimes, stat field 9).
my @instat = stat($input);
my $rebuild = 0;
foreach my $out (@outs) {
  my @outstat = stat($out->{out});
  if (!($outstat[9] && $instat[9] && $instat[9] < $outstat[9])) {
    $rebuild = 1;
  }
}

if ($rebuild == 0) {
  print "Existing outputs are up-to-date\n";
  exit;
}

# Open one temp-file handle per bucket.  Bucket keys run from
# 0 .. $numbuckets-1; several consecutive buckets may share the same
# output file, per the "N:file" specs.
my %buckets = ();
my $numbuckets = 0;
foreach my $out (@outs) {
  my $last = $numbuckets + $out->{c};
  for ( ; $numbuckets < $last; $numbuckets++) {
    # exploit the auto-syncing semantics of >>
    open ($buckets{$numbuckets}, '>>', $out->{out}.".tmp")
                or die "cannot open $out->{out}.tmp: $!";
  }
}

srand (1);      # explicitly static seed, for reproducability

open (IN, '<', $input) or die "cannot open $input: $!";
while (<IN>) {
  # pick a bucket uniformly at random; valid keys are 0 .. $numbuckets-1
  # (the old 1-based selector skipped bucket 0 and selected an undef
  # handle for key $numbuckets)
  select $buckets{int(rand() * $numbuckets)};
  $| = 1;       # autoflush so interleaved >> appends stay line-atomic
  print $_;
}
close IN;

foreach my $i (0 .. $numbuckets-1) {
  close $buckets{$i} or die "cannot close bucket $i: $!";
}

# All temp files written successfully: move them into place so a partial
# run never clobbers an existing up-to-date output.
foreach my $out (@outs) {
  rename $out->{out}.".tmp", $out->{out}
                or die "cannot rename $out->{out}.tmp: $!";
}