#!/usr/bin/env python

#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

#
# This script processes a folder containing mbox archives and generates
# the data file that is later used by the rendering system to 
# visualize the data cloud
#

"""Usage: %(program)s [OPTIONS] ...

Where OPTIONS is one or more of:

    -h
        show usage and exit
    -s
        output stats from the history file in CSV format. These stats describe processing 
        that has been carried out in the past.
    -l LIST
        only process the mailing list identified
    -m PATH
        set the path to the directory that contains the mailbox to process
        [default to './mail']. The directory should contain subdirectories 
        for each mailing list to be processed.
    -d PATH
        set the path to the directory that will contain the processed data
        [defaults to './data']
    -p PATH
        set the path to the directory that will be used for the PID file
        [defaults to '.']
    -f
        force the processing even if already performed
    -q
        quiet mode; no output
"""

import os, re, sys, time, getopt, pickle, email, email.Utils, mailbox, string, gzip

# Name the script was invoked with; substituted into the usage text and
# used to build the PID file name.
program = sys.argv[0]
# Progress messages are printed while this is True; -q clears it.
loud = True
# When True archives are reprocessed even if the history says they are
# up to date; set by -f.
force = False
# Mailing list selected with -l, or None when no list was given.
# NOTE: this name shadows the builtin 'list' throughout the module.
list = None
# Directory holding the processed data, history.dat and archives.dat;
# overridden by -d.
datadir = "data"

#-----------------------------------------------------------------------# 

def _pidfile(piddir):
    """ return the path of this program's PID file inside 'piddir' """

    # Only the base name of the invocation path is used. os.path.join()
    # throws away 'piddir' when the second component is absolute, and a
    # relative path such as 'bin/tool.py' would point into a subdirectory
    # of 'piddir' that need not exist.
    return os.path.join(piddir, os.path.basename(program) + ".pid")

def running(piddir):
    """ tell whether a PID file for this program exists in 'piddir' """
    return os.path.exists(_pidfile(piddir))

def set_running(piddir):
    """ create the PID file in 'piddir', holding the current process id """
    f = open(_pidfile(piddir),'w')
    try:
        f.write(str(os.getpid()))
    finally:
        # release the handle even when the write fails (e.g. disk full)
        f.close()

def clear_running(piddir):
    """ remove the PID file from 'piddir'; raises OSError if it is missing """
    os.remove(_pidfile(piddir))

#-----------------------------------------------------------------------# 

def load_history():
    """ load the processing history stored in 'history.dat' under datadir

    Returns the unpickled history, or an empty dictionary when the file
    does not exist yet. Errors raised while reading or unpickling an
    existing file are propagated to the caller.
    """

    path = os.path.join(datadir,"history.dat")
    if not os.path.exists(path):
        return {}

    # SECURITY: unpickling can execute arbitrary code; the data directory
    # must only ever contain files written by this program.
    # Pickle data is binary: open it as such, and let the with-block close
    # the handle even when pickle.load() fails on a damaged file.
    with open(path,"rb") as f:
        return pickle.load(f)
    
def save_history(history):
    """ pickle 'history' into 'history.dat' under datadir, replacing it

    history -- the object to store; crawl() passes the dictionary that maps
               an archive path to (last_modified, results)
    """

    # Pickle data is binary: open the file as such, and let the with-block
    # close the handle even when pickle.dump() fails.
    with open(os.path.join(datadir,"history.dat"),"wb") as f:
        pickle.dump(history,f)

#-----------------------------------------------------------------------# 

def msgfactory(file):
    """ create a mail message from the given file

    Returns the message parsed by email.message_from_file(), or the empty
    string when parsing fails; process() counts the empty string as an
    invalid message.
    """

    try:
        return email.message_from_file(file)
    except Exception:
        # Best effort: an unparsable message must not abort the whole run.
        # Catching Exception instead of everything lets KeyboardInterrupt
        # and SystemExit still stop the program.
        return ''

#-----------------------------------------------------------------------# 

# anything enclosed in angle brackets is taken to be a message ID
msgid_pattern = re.compile(r'<(.*?)>\s*',re.S)

def get_backlink(mail):
    """ estimate the ID of the message that 'mail' replies to

    Follows the threading notes published by D.J. Bernstein at
    http://cr.yp.to/immhf/thread.html

    Returns the message ID without the angle brackets, or the empty string
    when neither header carries one.
    """

    # candidate parents, oldest first, as listed in the References header
    candidates = []
    refs_header = mail.get('References')
    if refs_header:
        candidates = msgid_pattern.findall(refs_header)

    # IDs named in In-Reply-To go to the end of the list unless they were
    # already mentioned in References
    in_reply_to = mail.get('In-Reply-To')
    if in_reply_to:
        for parent in msgid_pattern.findall(in_reply_to):
            if parent not in candidates:
                candidates.append(parent)

    # the last candidate should be the most immediate parent
    if not candidates:
        return ""
    return candidates[-1]
    
#-----------------------------------------------------------------------# 

def process(read,write):

    total = valid = invalid = error = missing_date = missing_address = missing_msgID = missing_backlink = 0
        
    if read[-3:] == '.gz':
        input = gzip.open(read,'r')
    else:
        input = open(read,'r')
    
    output = open(write,'w')

    mbox = mailbox.UnixMailbox(input, msgfactory)

    for mail in mbox:
        total += 1
        if (mail != ''):
            valid += 1
            try:
                date_header = mail.get('Date')
                if (date_header): 
                    date = int(time.mktime(email.Utils.parsedate(date_header)))
                else:
                    missing_date += 1 
                    date = None
                
                address_header = mail.get('From')
                if (address_header): 
                    address = email.Utils.parseaddr(address_header)[1].lower()
                else:
                    missing_address += 1 
                    address = None
                
                msgID_header = mail.get('Message-Id')
                if (msgID_header): 
                    msgID = email.Utils.unquote(msgID_header)
                else:
                    missing_msgID += 1 
                    msgID = None
                
                backlink_header = get_backlink(mail)
                if (backlink_header): 
                    backlink = string.replace(email.Utils.unquote(backlink_header),"\n","")
                else:
                    missing_backlink += 1 
                    backlink = None
                
                if (msgID and date and address): 
                	if (backlink):
	                    output.write(msgID + " " + repr(date) + " " + address + " " + backlink + "\n")
	                else:
	                    output.write(msgID + " " + repr(date) + " " + address + "\n")	
                        
                if loud:
                    print "Processed msg", msgID                
            except (OverflowError,ValueError,TypeError):
                error += 1
        else:
            invalid += 1
        
    output.close()
    input.close()
    
    return (total, valid, invalid, error, missing_date, missing_address, missing_msgID, missing_backlink)
            
#-----------------------------------------------------------------------# 

mbox_pattern = re.compile(r'^.*?(\d{4})-?(\d{2})(:?.gz)?',re.S)

def crawl(history,archives,input_path,output_path,file):
    global list
    
    input = os.path.join(input_path,file)
    if os.path.isdir(input):
        if loud:
            print "Process directory", input
        output = os.path.join(output_path,file)
        if (not os.path.exists(output)): 
            os.mkdir(output)
        if file: 
            archives.write("+:" + file + "\n")
        files = os.listdir(input)
        files.sort()
        for f in files:
            crawl(history,archives,input,output,f)
        if file: 
            archives.write("-:\n")
    else:
        if list is not None and list in file:
            print "Ignorning file", input
        if loud:
          print "Process file", input
        m = mbox_pattern.match(file)
        if m:
            name = m.group(1) + "-" + m.group(2)
            archives.write("*:" + name + "\n")
            output = os.path.join(output_path,name)
            last_modified = os.stat(input).st_mtime
            if force or (input not in history) or (history[input][0] != last_modified) or (not os.path.exists(output)):
                results = process(input,output)
                if loud: 
                    print "processed", input, results
                history[str(input)] = (last_modified,results)
                save_history(history)
            else:
                if loud: 
                    print "skipping", input
        else:
            if loud:
                print "Ignoring", input

#-----------------------------------------------------------------------# 

def usage(code, msg=''):
    """ Write the usage text to stderr and leave with exit status 'code'.

    code -- status handed to sys.exit()
    msg  -- optional message shown, followed by a blank line, ahead of the
            usage text
    """

    err = sys.stderr
    if msg:
        print >> err, msg
        print >> err
    # the module docstring is the usage text; %(program)s is filled in from
    # the module globals
    print >> err, __doc__ % globals()
    sys.exit(code)

#-----------------------------------------------------------------------# 

def main():
    """ Main program; parse options and go.

    Exits with status 2 on a malformed command line, with -1 when another
    instance is running or one of the directories is missing, and with 0
    after printing the usage text or the statistics.
    """

    global loud, force, list, datadir

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hsfqvd:m:p:l:')
    except getopt.error, msg:
        usage(2, msg)

    maildir = "mail"
    piddir = "."
    show_stats = False

    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == "-l":
            list = arg
        elif opt == "-s":
            # only remembered here: the statistics depend on -l and -d,
            # which may come later on the command line
            show_stats = True
        elif opt == "-q":
            loud = False
        elif opt == "-f":
            force = True
        elif opt == "-m":
            maildir = arg
        elif opt == "-d":
            datadir = arg
        elif opt == "-p":
            piddir = arg
        else:
            # reached by -v, which getopt accepts but nothing handles
            usage(0)

    if show_stats:
        display_stats()
        sys.exit(0)

    if running(piddir):
        sys.stderr.write("Cannot run since another instance is already running.\n")
        sys.exit(-1)
    elif not os.path.exists(datadir):
        sys.stderr.write("Data directory '" + datadir + "' could not be found.\n")
        sys.exit(-1)
    elif not os.path.exists(maildir):
        sys.stderr.write("Mail directory '" + maildir + "' could not be found.\n")
        sys.exit(-1)
    elif not os.path.exists(piddir):
        sys.stderr.write("PID directory '" + piddir + "' could not be found.\n")
        sys.exit(-1)

    if loud:
        print "Process files in", maildir

    set_running(piddir)
    try:
        history = load_history()
        archives = open(os.path.join(datadir,"archives.dat"),"w")
        try:
            crawl(history,archives,maildir,datadir,"")
        finally:
            archives.close()
    finally:
        # always remove the PID file; a leftover one would make every later
        # run refuse to start
        clear_running(piddir)

#-----------------------------------------------------------------------# 

def display_stats():
    global list
    
    print "Date, Total emails, Not in reply-to"
                
    history = load_history()
    for file in history:
        if list is not None and list in file:
            last_modified = history[file][0]
            stats = history[file][1]
            total = int(stats[0])
            valid = stats[1]
            invalid = stats[2]
            error = stats[3]
            missing_date = stats[4]
            missing_address = stats[5]
            missing_msgID = stats[6]
            missing_backlink = int(stats[7])
            
            start = file.index(list) + len(list) + 1
            if file[-3:] == '.gz':
                end = len(file) - 3
            else:
                end = len(file)

            print file[start:end], ",", total, ",", missing_backlink
        
#-----------------------------------------------------------------------# 

# Start processing only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

#---------------------------- End of File ------------------------------#