#! /usr/bin/perl -w

use strict;
use warnings;

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Written by Theo Van Dinter
# Please feel free to mail with any questions. :)

# This goes with the run-masses script to take the ham/spam directories
# and spit out the appropriate spam:mbox:path statements for mass-check.

# The directory structure is assumed to look something like this:
#
# $CORPUS               (this script, run-masses, etc)
# |-- ham               (dir with mbox files for ham)
# |   |-- hamtrap       (dirs split into YYYY/MM/DD)
# |   `-- personal      (dirs split into YYYY/MM/DD)
# `-- spam              (empty)
#     |-- personal      (dirs split into YYYY/MM/DD)
#     `-- spamtrap      (dirs split into YYYY/MM/DD)

# if you don't have {ham,spam}trap mail broken out, set this to 0.
my $include_traps = 1;

# which dirs have mbox files?
my @dirs = ( 'ham' );

# how many days should we limit to searching for the dir areas?
# ie: assuming we have years of messages in YYYY/MM/DD directories, only look
# at the most recent X so that mass-check will go faster in the scan stage.
# set this to undef if you don't want to limit.  (Do not comment the line
# out: the variable is referenced below and the script runs under strict.)
my $RECENT = 120;

# Usage: an optional first argument names the corpus directory.  We chdir
# into it so the relative paths below resolve, and prefix it to every path
# we print.
my $actualdir = "./";
if (@ARGV) {
  $actualdir = shift(@ARGV) . "/";
  # Scanning the wrong directory while printing paths under the requested
  # one would produce bogus targets, so a failed chdir is fatal.
  chdir($actualdir) or die "cannot chdir to $actualdir: $!\n";
}

# the YYYY/MM/DD-split areas to scan, eg: ham/personal, ham/hamtrap, ...
my @do_dirs;
foreach my $class ( 'ham', 'spam' ) {
  push(@do_dirs, "$class/personal");
  push(@do_dirs, "$class/${class}trap") if $include_traps;
}

# mbox laden areas: printed straight away as "<type>:mbox:<path>", where
# <type> is the first path component of the directory.
foreach my $dir (@dirs) {
  die "$dir isn't a directory!\n" unless -d $dir;
  my ($type) = $dir =~ m@^([^/]+)@;
  print "$type:mbox:$actualdir$dir\n";
}

# Ok, now figure out the most recent X days of mail in each dated area ...
my @date_targets;
foreach my $pdir ( @do_dirs ) {
  my ($type) = $pdir =~ m@^([^/]+)@;

  # Best effort: an area (or year/month below it) that cannot be opened is
  # silently skipped rather than treated as an error.
  opendir(my $area_dh, $pdir) or next;
  my @years = grep { /^\d+$/ } readdir($area_dh);
  closedir($area_dh);

  # each entry: [ year, month, day, "<type>:dir:<path>" ]
  my @found;
  foreach my $year (@years) {
    my $ydir = "$pdir/$year";
    opendir(my $year_dh, $ydir) or next;
    my @months = grep { /^\d+$/ } readdir($year_dh);
    closedir($year_dh);

    foreach my $month (@months) {
      my $mdir = "$ydir/$month";
      opendir(my $month_dh, $mdir) or next;
      push(@found,
	   map  { [ $year, $month, $_, "$type:dir:$actualdir$mdir/$_" ] }
	   grep { /^\d+$/ && -d "$mdir/$_" } readdir($month_dh));
      closedir($month_dh);
    }
  }

  # Newest first.  Compare year/month/day numerically so that names which
  # are not zero-padded (2004/9 vs 2004/10) still order by date; the final
  # string comparison only makes the order deterministic for names that are
  # numerically equal (eg: "1" and "01").
  my @dlist = map { $_->[3] }
	      sort {
		   $b->[0] <=> $a->[0]
		|| $b->[1] <=> $a->[1]
		|| $b->[2] <=> $a->[2]
		|| $b->[3] cmp $a->[3]
	      } @found;

  # keep only the $RECENT newest days of this area, if a limit is set
  splice(@dlist, $RECENT) if (defined $RECENT && @dlist > $RECENT);
  push(@date_targets, @dlist);
}

# one target per line; prints nothing at all when no dated dirs were found
print join("\n", @date_targets, "");
exit;