/[Apache-SVN]/spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm
ViewVC logotype

Diff of /spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm

Parent Directory | Revision Log | View Patch

--- spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm	2005/10/17 22:56:00	325997
+++ spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm	2005/10/17 23:17:21	325998
@@ -27,12 +27,14 @@ use IO::Socket;
 use Mail::SpamAssassin::Util;
 use Mail::SpamAssassin::Constants qw(:sa);
 use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::AICache;
 
 use constant BIG_BYTES => 256*1024;	# 256k is a big email
 use constant BIG_LINES => BIG_BYTES/65;	# 65 bytes/line is a good approximation
 
 use vars qw {
   $MESSAGES
+  $AICache
 };
 
 my @ISA = qw($MESSAGES);
@@ -45,9 +47,10 @@ Mail::SpamAssassin::ArchiveIterator - fi
 
   my $iter = new Mail::SpamAssassin::ArchiveIterator(
     { 
-      'opt_j'   => 0,
-      'opt_n'   => 1,
-      'opt_all' => 1,
+      'opt_j'     => 0,
+      'opt_n'     => 1,
+      'opt_all'   => 1,
+      'opt_cache' => 1,
     }
   );
 
@@ -158,6 +161,11 @@ in the C<wanted_sub> callback below.  Se
 it's a good idea to set this to 0 if you can, as it imposes a performance
 hit.
 
+=item opt_cache
+
+Set to 0 (default) if you don't want to use cached information to help speed
+up ArchiveIterator.  Set to 1 to enable.
+
 =item wanted_sub
 
 Reference to a subroutine which will process message data.  Usually
@@ -194,6 +202,7 @@ sub new {
   $self->{opt_head} = 0 unless (defined $self->{opt_head});
   $self->{opt_tail} = 0 unless (defined $self->{opt_tail});
   $self->{opt_want_date} = 1 unless (defined $self->{opt_want_date});
+  $self->{opt_cache} = 0 unless (defined $self->{opt_cache});
 
   # If any of these options are set, we need to figure out the message's
   # receive date at scan time.  opt_n == 0, opt_after, opt_before
@@ -887,9 +896,19 @@ sub scan_directory {
     return;
   }
 
+  if ($self->{opt_cache}) {
+    $AICache = Mail::SpamAssassin::AICache->new({	'type' => 'dir',
+      							'path' => $folder,
+					        });
+  }
+
   foreach my $mail (@files) {
     $self->scan_file($class, $mail);
   }
+
+  if (defined $AICache) {
+    $AICache = $AICache->finish();
+  }
 }
 
 sub scan_file {
@@ -899,14 +918,23 @@ sub scan_file {
     push(@{$self->{$class}}, scan_index_pack(AI_TIME_UNKNOWN, $class, "f", $mail));
     return;
   }
-  my $header;
-  mail_open($mail) or return;
-  while (<INPUT>) {
-    last if /^\s*$/;
-    $header .= $_;
+
+  my $date;
+
+  unless (defined $AICache and $date = $AICache->check($mail)) {
+    my $header;
+    mail_open($mail) or return;
+    while (<INPUT>) {
+      last if /^\s*$/;
+      $header .= $_;
+    }
+    close(INPUT);
+    $date = Mail::SpamAssassin::Util::receive_date($header);
+    if (defined $AICache) {
+      $AICache->update($mail, $date);
+    }
   }
-  close(INPUT);
-  my $date = Mail::SpamAssassin::Util::receive_date($header);
+
   return if !$self->message_is_useful_by_date($date);
   push(@{$self->{$class}}, scan_index_pack($date, $class, "f", $mail));
 }
@@ -935,45 +963,69 @@ sub scan_mailbox {
       die "archive-iterator: compressed mbox folders are not supported at this time\n";
     }
 
-    mail_open($file) or return;
+    my $info = {};
+    my $count;
+
+    if ($self->{opt_cache}) {
+      $AICache = Mail::SpamAssassin::AICache->new({	'type' => 'mbox',
+      							'path' => $file,
+					          });
+      if ($count = $AICache->count()) {
+        $info = $AICache->check();
+      }
+    }
+
+    unless ($count) {
+      mail_open($file) or return;
     
-    my $start = 0;		# start of a message
-    my $where = 0;		# current byte offset
-    my $first = '';		# first line of message
-    my $header = '';		# header text
-    my $in_header = 0;		# are in we a header?
-    while (!eof INPUT) {
-      my $offset = $start;	# byte offset of this message
-      my $header = $first;	# remember first line
-      while (<INPUT>) {
-	if ($in_header) {
-	  if (/^\s*$/) {
-	    $in_header = 0;
+      my $start = 0;		# start of a message
+      my $where = 0;		# current byte offset
+      my $first = '';		# first line of message
+      my $header = '';		# header text
+      my $in_header = 0;		# are we in a header?
+      while (!eof INPUT) {
+        my $offset = $start;	# byte offset of this message
+        my $header = $first;	# remember first line
+        while (<INPUT>) {
+	  if ($in_header) {
+	    if (/^\s*$/) {
+	      $in_header = 0;
+	    }
+	    else {
+	      $header .= $_;
+	    }
 	  }
-	  else {
-	    $header .= $_;
+	  if (substr($_,0,5) eq "From ") {
+	    $in_header = 1;
+	    $first = $_;
+	    $start = $where;
+	    $where = tell INPUT;
+	    last;
 	  }
-	}
-	if (substr($_,0,5) eq "From ") {
-	  $in_header = 1;
-	  $first = $_;
-	  $start = $where;
 	  $where = tell INPUT;
-	  last;
+        }
+        if ($header) {
+	  $info->{$offset} = Mail::SpamAssassin::Util::receive_date($header);
 	}
-	$where = tell INPUT;
       }
-      if ($header) {
-	my $date = Mail::SpamAssassin::Util::receive_date($header);
+      close INPUT;
+    }
 
-	if ($self->{determine_receive_date}) {
-	  next if !$self->message_is_useful_by_date($date);
-	}
+    while(my($k,$v) = each %{$info}) {
+      if (defined $AICache && !$count) {
+	$AICache->update($k, $v);
+      }
 
-	push(@{$self->{$class}}, scan_index_pack($date, $class, "m", "$file.$offset"));
+      if ($self->{determine_receive_date}) {
+        next if !$self->message_is_useful_by_date($v);
       }
+
+      push(@{$self->{$class}}, scan_index_pack($v, $class, "m", "$file.$k"));
+    }
+
+    if (defined $AICache) {
+      $AICache = $AICache->finish();
     }
-    close INPUT;
   }
 }
 
@@ -1000,46 +1052,72 @@ sub scan_mbx {
     if ($folder =~ /\.(?:gz|bz2)$/) {
       die "archive-iterator: compressed mbx folders are not supported at this time\n";
     }
-    mail_open($file) or return;
 
-    # check the mailbox is in mbx format
-    $fp = <INPUT>;
-    if ($fp !~ /\*mbx\*/) {
-      die "archive-iterator: error: mailbox not in mbx format!\n";
+    my $info = {};
+    my $count;
+
+    if ($self->{opt_cache}) {
+      $AICache = Mail::SpamAssassin::AICache->new({	'type' => 'mbx',
+      							'path' => $file,
+					          });
+      if ($count = $AICache->count()) {
+        $info = $AICache->check();
+      }
     }
 
-    # skip mbx headers to the first email...
-    seek(INPUT, 2048, 0);
+    unless ($count) {
+      mail_open($file) or return;
 
-    my $sep = MBX_SEPARATOR;
+      # check the mailbox is in mbx format
+      $fp = <INPUT>;
+      if ($fp !~ /\*mbx\*/) {
+        die "archive-iterator: error: mailbox not in mbx format!\n";
+      }
 
-    while (<INPUT>) {
-      if ($_ =~ /$sep/) {
-	my $offset = tell INPUT;
-	my $size = $2;
-
-	# gather up the headers...
-	my $header = '';
-	while (<INPUT>) {
-	  last if (/^\s*$/);
-	  $header .= $_;
-	}
+      # skip mbx headers to the first email...
+      seek(INPUT, 2048, 0);
 
-	my $date = Mail::SpamAssassin::Util::receive_date($header);
+      my $sep = MBX_SEPARATOR;
 
-	if ($self->{determine_receive_date}) {
-	  next if !$self->message_is_useful_by_date($date);
-	}
+      while (<INPUT>) {
+        if ($_ =~ /$sep/) {
+	  my $offset = tell INPUT;
+	  my $size = $2;
+
+	  # gather up the headers...
+	  my $header = '';
+	  while (<INPUT>) {
+	    last if (/^\s*$/);
+	    $header .= $_;
+	  }
 
-	push(@{$self->{$class}}, scan_index_pack($date, $class, "b", "$file.$offset"));
+	  $info->{"$file.$offset"} = Mail::SpamAssassin::Util::receive_date($header);
 
-	seek(INPUT, $offset + $size, 0);
+	  # go onto the next message
+	  seek(INPUT, $offset + $size, 0);
+	}
+        else {
+	  die "archive-iterator: error: failure to read message body!\n";
+        }
       }
-      else {
-	die "archive-iterator: error: failure to read message body!\n";
+      close INPUT;
+    }
+
+    while(my($k,$v) = each %{$info}) {
+      if (defined $AICache && !$count) {
+	$AICache->update($k, $v);
       }
+
+      if ($self->{determine_receive_date}) {
+        next if !$self->message_is_useful_by_date($v);
+      }
+
+      push(@{$self->{$class}}, scan_index_pack($v, $class, "b", "$file.$k"));
+    }
+
+    if (defined $AICache) {
+      $AICache = $AICache->finish();
     }
-    close INPUT;
   }
 }
 

 

infrastructure at apache.org
ViewVC Help
Powered by ViewVC 1.1.26