#!/usr/bin/perl # # Uses a fuzzy hash to find closely-related mails, which might only # differ by an ID string somewhere or similar. Consider it "Razor Lite" ;) # # usage: find-dups corpusdir/* > possible-dups.list # or: find-dups corpusdir1 [dir2 dir3 ...] > possible-dups.list $DUMP_WITH_KEY = 0; # TODO: use getopt $COMMANDS = 1; my $hashes2 = { }; my $hashes3 = { }; my $hashes4 = { }; use Digest::SHA1 qw(sha1_base64); use File::Find; my @files = (); foreach my $file (@ARGV) { if (-d $file) { find (\&wanted, $file); sub wanted { (-f $_) and push (@files, $File::Find::name); } } else { push (@files, $file); } } foreach my $file (@files) { open (STDIN, "<$file") or warn "$file cannot be opened"; my @hash = do_one(); close STDIN; #print "$hash\t$file\n"; my $hash2 = $hash[0].$hash[1]; my $hash3 = $hash[0].$hash[1].$hash[2]; my $hash4 = $hash[0].$hash[1].$hash[2].$hash[3]; $hashes4->{$hash4} .= " ".$file; $hashes3->{$hash3} .= " ".$file; $hashes2->{$hash2} .= " ".$file; } check_collisions ($hashes4); check_collisions ($hashes3); check_collisions ($hashes2); exit; sub check_collisions { my ($db, $hash, $file) = @_; foreach $k (sort keys %{$db}) { $_ = $db->{$k}; next unless (/\S \S/); s/^ //g; if ($DUMP_WITH_KEY) { print "$_ [$k]\n"; # to print the key } elsif ($COMMANDS) { my $count = 0; while (m/ /g) { $count++; } /^(\S+) (.*)$/; print "echo \"$1 : $count dups\"; rm -f $2\n"; } else { print "$_\n"; } delete $hashes4->{$k}; delete $hashes3->{$k}; delete $hashes2->{$k}; } } sub do_one { while () { /^$/ and last; } my $str = join ('', ); # strip HTML tags, email addresses, queries # Add more strippings here if you like. $str =~ s/<[^>]+?>/ /igs; $str =~ s/"[^\"\s]+\?[^\"\s]+\"/ /igs; $str =~ s/\S+\?\S+/ /igs; $str =~ s/\S+\@\S+/ /igs; $str =~ s/TRCK:\S+//; $str =~ s/^[a-z0-9]{6,}[-_a-z0-9]{12,}[a-z0-9]{6,}\s*\z//is; $str =~ s/^\s*\S{24,}\s*\z//is; #print $str; my @data = split (/\n/, $str); my $lpb = ($#data+1) / 4; #warn "JMD $#lines $lpb"; my @blks = (); push (@blks, join ('', splice (@data, 0, $lpb))); push (@blks, join ('', splice (@data, 0, $lpb))); push (@blks, join ('', splice (@data, 0, $lpb))); push (@blks, join ('', splice (@data, 0, $lpb))); my @ret = (); foreach my $blk (@blks) { #warn "JMD $blk"; #my $digest = sprintf ("%05d", unpack ("%16C*", $blk)); my $digest = sha1_base64($blk); push (@ret, $digest); } @ret; }