#!/usr/bin/env python
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script processes a folder containing mbox archives and generates
# the data file that is later used by the rendering system to
# visualize the data cloud
#
"""Usage: %(program)s [OPTIONS] ...

Where OPTIONS is one or more of:

    -h
        show usage and exit

    -s
        output stats from the history file in CSV format. These stats
        describe processing that has been carried out in the past.
        Only history entries matching the -l LIST option are reported.

    -l LIST
        only process the mailing list identified

    -m PATH
        set the path to the directory that contains the mailbox to
        process [defaults to './mail']. The directory should contain
        subdirectories for each mailing list to be processed.

    -d PATH
        set the path to the directory that will contain the processed
        data [defaults to './data']

    -p PATH
        set the path to the directory that will be used for the PID
        file [defaults to '.']

    -f
        force the processing even if already performed

    -q
        quiet mode; no output
"""

import os
import re
import sys
import time
import getopt
import pickle
import email
import email.utils
import mailbox
import string
import gzip

# Module-level settings; main() overwrites them from the command line.
program = sys.argv[0]
loud = True
force = False
# NOTE: 'list' shadows the builtin. The name is kept because it is part of
# this module's existing global state (set by main(), read by crawl() and
# display_stats()).
list = None
datadir = "data"


def _log(*parts):
    """Print the space-separated parts unless quiet mode (-q) is active."""
    if loud:
        print(" ".join([str(part) for part in parts]))

#-----------------------------------------------------------------------#

def _pid_path(piddir):
    """Return the path of the PID file inside 'piddir'.

    Only the base name of the script is used: sys.argv[0] may carry a
    directory part (or be absolute), and os.path.join() would then point
    outside 'piddir' or into a directory that does not exist.
    """
    return os.path.join(piddir, os.path.basename(program) + ".pid")

def running(piddir):
    """Return True if a PID file for this script exists in 'piddir'."""
    return os.path.exists(_pid_path(piddir))

def set_running(piddir):
    """Create the PID file in 'piddir', containing the current process id."""
    f = open(_pid_path(piddir), 'w')
    try:
        f.write(str(os.getpid()))
    finally:
        f.close()

def clear_running(piddir):
    """Remove the PID file from 'piddir'."""
    os.remove(_pid_path(piddir))

#-----------------------------------------------------------------------#

def load_history():
    """Return the processing history stored in <datadir>/history.dat.

    The history maps an input file path to a tuple
    (last modification time, statistics tuple returned by process()).
    An empty dictionary is returned when no history file exists yet.
    """
    path = os.path.join(datadir, "history.dat")
    if not os.path.exists(path):
        return {}
    # Binary mode keeps the pickle stream intact on every platform.
    f = open(path, "rb")
    try:
        # NOTE(security): unpickling can execute arbitrary code, so the
        # data directory must only be writable by trusted users.
        return pickle.load(f)
    finally:
        f.close()

def save_history(history):
    """Write the 'history' dictionary to <datadir>/history.dat."""
    f = open(os.path.join(datadir, "history.dat"), "wb")
    try:
        pickle.dump(history, f)
    finally:
        f.close()

#-----------------------------------------------------------------------#

def msgfactory(file):
    """Create a mail message from the given file.

    Returns the parsed message, or the empty string when the message
    cannot be parsed; process() counts the empty string as an invalid
    message.
    """
    try:
        return email.message_from_file(file)
    except Exception:
        # Deliberately best effort: one broken message must not abort the
        # processing of the whole archive.
        return ''

#-----------------------------------------------------------------------#

msgid_pattern = re.compile(r'<(.*?)>\s*', re.S)

def get_backlink(mail):
    """Try to estimate the backlink in the given email message.

    This uses the info provided by D.J. Bernstein at
    http://cr.yp.to/immhf/thread.html

    Returns the message ID (without angle brackets) of the most
    immediate parent, or the empty string if none can be found.
    """
    # first, get all the references to anything that looks like a message ID
    references = []
    reference = mail.get('References')
    if reference:
        references = msgid_pattern.findall(reference)

    # second, if we have an In-Reply-To header, get the message IDs out of
    # it and append them to the references, in case they are not there yet.
    reply = mail.get('In-Reply-To')
    if reply:
        for candidate in msgid_pattern.findall(reply):
            if candidate not in references:
                references.append(candidate)

    # finally, return the last one of the references, which should be the
    # most immediate parent.
    if references:
        return references[-1]
    return ""

#-----------------------------------------------------------------------#

def process(read, write):
    """Convert the mbox archive 'read' into the data file 'write'.

    'read' may be gzip-compressed (detected by its '.gz' suffix). For
    every message that has a message ID, a date and a sender address,
    one line is written to 'write':

        <message id> <date in seconds> <sender address> [<backlink>]

    Returns the tuple (total, valid, invalid, error, missing_date,
    missing_address, missing_msgID, missing_backlink).

    NOTE: this relies on mailbox.UnixMailbox, which exists only in
    Python 2.
    """
    total = valid = invalid = error = 0
    missing_date = missing_address = missing_msgID = missing_backlink = 0

    if read[-3:] == '.gz':
        source = gzip.open(read, 'r')
    else:
        source = open(read, 'r')
    try:
        sink = open(write, 'w')
        try:
            for mail in mailbox.UnixMailbox(source, msgfactory):
                total += 1
                if mail != '':
                    valid += 1
                    try:
                        date_header = mail.get('Date')
                        if date_header:
                            # parsedate() returns None for an unparsable
                            # date; mktime() then raises TypeError, which
                            # is counted as an error below.
                            date = int(time.mktime(
                                email.utils.parsedate(date_header)))
                        else:
                            missing_date += 1
                            date = None

                        address_header = mail.get('From')
                        if address_header:
                            address = email.utils.parseaddr(
                                address_header)[1].lower()
                        else:
                            missing_address += 1
                            address = None

                        msgID_header = mail.get('Message-Id')
                        if msgID_header:
                            msgID = email.utils.unquote(msgID_header)
                        else:
                            missing_msgID += 1
                            msgID = None

                        backlink_header = get_backlink(mail)
                        if backlink_header:
                            backlink = email.utils.unquote(
                                backlink_header).replace("\n", "")
                        else:
                            missing_backlink += 1
                            backlink = None

                        if msgID and date and address:
                            fields = [msgID, repr(date), address]
                            if backlink:
                                fields.append(backlink)
                            sink.write(" ".join(fields) + "\n")
                            _log("Processed msg", msgID)
                    except (OverflowError, ValueError, TypeError):
                        error += 1
                else:
                    invalid += 1
        finally:
            sink.close()
    finally:
        # Close the handles even when an unexpected exception escapes.
        source.close()

    return (total, valid, invalid, error, missing_date, missing_address,
            missing_msgID, missing_backlink)

#-----------------------------------------------------------------------#

# Archive file names carry the year and month, e.g. '200501', '2005-01'
# or '200501.gz'; group 1 is the year and group 2 the month.
mbox_pattern = re.compile(r'^.*?(\d{4})-?(\d{2})(?:\.gz)?', re.S)

def crawl(history, archives, input_path, output_path, file):
    """Recursively process 'file' located in the directory 'input_path'.

    Directories are mirrored below 'output_path' and bracketed in the
    'archives' index with '+:<name>' / '-:' lines. Every file whose name
    matches mbox_pattern is listed in the index as '*:<year>-<month>' and
    is converted by process() when forced (-f), when it is not in
    'history', when its modification time changed, or when its output
    file is missing. 'history' is updated and saved after every
    converted file.

    When a mailing list was selected with -l, files whose path does not
    contain the list name are still indexed (so the archive index stays
    complete) but are not converted.
    """
    path = os.path.join(input_path, file)

    if os.path.isdir(path):
        _log("Process directory", path)
        target = os.path.join(output_path, file)
        if not os.path.exists(target):
            os.mkdir(target)
        # The top-level call passes an empty name: no index entry for it.
        if file:
            archives.write("+:" + file + "\n")
        for entry in sorted(os.listdir(path)):
            crawl(history, archives, path, target, entry)
        if file:
            archives.write("-:\n")
        return

    _log("Process file", path)
    match = mbox_pattern.match(file)
    if not match:
        _log("Ignoring", path)
        return

    name = match.group(1) + "-" + match.group(2)
    archives.write("*:" + name + "\n")

    # Same substring test on the path that display_stats() applies to the
    # history keys.
    if list is not None and list not in path:
        _log("Ignoring file", path)
        return

    target = os.path.join(output_path, name)
    last_modified = os.stat(path).st_mtime
    up_to_date = (path in history
                  and history[path][0] == last_modified
                  and os.path.exists(target))
    if force or not up_to_date:
        results = process(path, target)
        _log("processed", path, results)
        history[str(path)] = (last_modified, results)
        save_history(history)
    else:
        _log("skipping", path)

#-----------------------------------------------------------------------#

def usage(code, msg=''):
    """Print the usage message to stderr and sys.exit(code).

    If 'msg' is given, it is printed before the usage text.
    """
    if msg:
        sys.stderr.write(str(msg) + "\n\n")
    sys.stderr.write(__doc__ % globals() + "\n")
    sys.exit(code)

#-----------------------------------------------------------------------#

def main():
    """Main program; parse options and go."""
    global loud, force, list, datadir

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hsfqvd:m:p:l:')
    except getopt.error:
        # sys.exc_info() keeps this line valid on every Python version.
        usage(2, str(sys.exc_info()[1]))

    maildir = "mail"
    piddir = "."
    show_stats = False

    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == "-l":
            list = arg
        elif opt == "-s":
            # Deferred until every option is parsed, so that -l and -d
            # are honoured regardless of their position.
            show_stats = True
        elif opt == "-q":
            loud = False
        elif opt == "-f":
            force = True
        elif opt == "-m":
            maildir = arg
        elif opt == "-d":
            datadir = arg
        elif opt == "-p":
            piddir = arg
        else:
            # Reached only for -v, which getopt accepts but is unhandled.
            usage(0)

    if show_stats:
        display_stats()
        sys.exit(0)

    if running(piddir):
        sys.stderr.write("Cannot run since another instance is already running.\n")
        sys.exit(-1)
    if not os.path.exists(datadir):
        sys.stderr.write("Data directory '" + datadir + "' could not be found.\n")
        sys.exit(-1)
    if not os.path.exists(maildir):
        sys.stderr.write("Mail directory '" + maildir + "' could not be found.\n")
        sys.exit(-1)
    if not os.path.exists(piddir):
        sys.stderr.write("PID directory '" + piddir + "' could not be found.\n")
        sys.exit(-1)

    _log("Process files in", maildir)
    set_running(piddir)
    try:
        history = load_history()
        archives = open(os.path.join(datadir, "archives.dat"), "w")
        try:
            crawl(history, archives, maildir, datadir, "")
        finally:
            archives.close()
    finally:
        # Always remove the PID file; a stale one would block every
        # later run.
        clear_running(piddir)

#-----------------------------------------------------------------------#

def display_stats():
    """Print the stored statistics of the selected mailing list as CSV.

    One row is printed, in sorted path order, for every history entry
    whose path contains the list name given with -l; without -l only the
    header row is printed. The first column is the part of the path that
    follows the list name, without a trailing '.gz'.
    """
    print("Date, Total emails, Not in reply-to")
    history = load_history()
    for file in sorted(history):
        if list is not None and list in file:
            stats = history[file][1]
            total = int(stats[0])
            missing_backlink = int(stats[7])
            # Skip the list name and the separator that follows it.
            start = file.index(list) + len(list) + 1
            if file[-3:] == '.gz':
                end = len(file) - 3
            else:
                end = len(file)
            print("%s , %s , %s" % (file[start:end], total, missing_backlink))

#-----------------------------------------------------------------------#

if __name__ == "__main__":
    main()

#---------------------------- End of File ------------------------------#