#!/usr/bin/perl -w
#
# split-corpora - distribute the messages found in one or more mail folders
# round-robin across a fixed number of mbox-format "bucket" files.
#
# usage: split-corpora [-n num_buckets] [-p outfile_prefix] [-l max_messages]
#                      folder1 ....
#
#   -n num_buckets     number of output files to create (default: 2)
#   -p outfile_prefix  output files are named "<prefix>.1" .. "<prefix>.N"
#                      (default prefix: "bucket")
#   -l max_messages    stop after writing this many messages in total
#                      (0 or unset means no limit)
#   -h                 print the usage message and exit
#
# A folder argument that is a directory is handed to the ArchiveIterator as
# a "dir" target; anything else is handed over as an "mbox" target.

use FindBin;
use lib "$FindBin::Bin/../lib";

use strict;
use warnings;

use Mail::SpamAssassin::ArchiveIterator;
use Getopt::Std;
use FileHandle;

###########

# Print the command-line synopsis to STDERR and exit with status 1.
sub usage {
  print STDERR "split-corpora [-n num_buckets] [-p outfile_prefix] ".
               "[-l max_messages] ".
               "folder1 ....\n";
  exit(1);
} # usage()

###########

our ($opt_n, $opt_p, $opt_h, $opt_l);

# getopts() -- not getopt() -- is the call that understands the "x:" syntax.
# getopt() would treat every listed character as a switch taking a value, so
# "-h" would swallow the next argument instead of acting as a flag.
getopts('n:p:l:h') or usage();
usage() if ($opt_h);

my $num_buckets = $opt_n || 2;
my $prefix      = $opt_p || "bucket";

# A non-numeric bucket count would create no output files and later fail
# with a modulus-by-zero inside the callback, so reject it up front.
if ($num_buckets !~ /^[1-9][0-9]*$/) {
  die "split-corpora: -n requires a positive integer, got '$num_buckets'\n";
}
if (defined $opt_l && $opt_l !~ /^[0-9]+$/) {
  die "split-corpora: -l requires a non-negative integer, got '$opt_l'\n";
}

my @IN_FILES = @ARGV;
usage() if (@IN_FILES == 0);

# Build the target specifications expected by ArchiveIterator::run().
my @targets = ();
foreach my $folder (@IN_FILES) {
  if (-d $folder) {
    push(@targets, "ham:dir:$folder");
  }
  else {
    push(@targets, "ham:mbox:$folder");
  }
}

# Open one output file per bucket.  The mode is passed separately from the
# file name so that characters in the user-supplied prefix can never be
# interpreted as part of the open mode.
my @bucket_fhs = ();
foreach my $bucket (1 .. $num_buckets) {
  my $bucket_file = "$prefix.$bucket";
  my $bucket_fh   = FileHandle->new();
  if (!$bucket_fh->open($bucket_file, '>')) {
    die "Could not open '$bucket_file' for writing: $!\n";
  }
  push(@bucket_fhs, $bucket_fh);
} # foreach my $bucket (1 .. $num_buckets)

my $current_bucket = 0;   # index into @bucket_fhs of the next bucket to use
my $messagecount   = 0;   # number of messages written so far

my $iter = Mail::SpamAssassin::ArchiveIterator->new({
  'opt_all' => 1,
});
$iter->set_functions(\&wanted, sub { });

# wanted() dies with 'HITLIMIT' once the -l limit is reached; that is the
# normal way to stop early, so only other errors are re-thrown.
eval { $iter->run(@targets); };
if ($@) {
  die $@ unless ($@ =~ /HITLIMIT/);
}

# Buffered write errors (e.g. a full disk) only surface at close time.
foreach my $bucket (1 .. $num_buckets) {
  $bucket_fhs[$bucket - 1]->close()
    or die "Could not close '$prefix.$bucket': $!\n";
}

if ($opt_l && $messagecount < $opt_l) {
  warn "warning: only found $messagecount messages instead of $opt_l\n";
}

#############################################

# Per-message callback registered with the ArchiveIterator.
#
# Only the fourth argument is used: a reference to the array holding the
# message text.  The message is appended to the current bucket file and the
# current bucket then advances round-robin.
#
# Dies with 'HITLIMIT' when -l max_messages have already been written, and
# with a descriptive message if the bucket file cannot be written.
sub wanted {
  my (undef, undef, undef, $data_ref) = @_;

  # Check the limit before counting, so that exactly $opt_l messages are
  # written and $messagecount always equals the number actually written.
  if ($opt_l && $messagecount >= $opt_l) {
    die 'HITLIMIT';
  }
  $messagecount++;

  # Make sure message can be used for outputting mbox format: every message
  # must start with a "From " separator line.
  if (!@$data_ref || $data_ref->[0] !~ /^From \S+ +... ... /) {
    unshift(@$data_ref, "From abc\@xyz.com Mon Jan 1 00:00:00 2000\n");
  }

  $bucket_fhs[$current_bucket]->print( join("", @$data_ref) )
    or die "Could not write to '$prefix." . ($current_bucket + 1) . "': $!\n";

  $current_bucket = ($current_bucket + 1) % $num_buckets;
} # wanted()