#!/usr/bin/env python # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ''' Purpose: Clutch gathers details about projects currently in incubation. The core resource is the ReportingSchedule wiki page. As soon as a project is accepted into incubation, please add its entry. This script reads that page, and the i.a.o/projects table, and other resources. The assembled metadata is stored in various data files. See further explanation at http://incubator.apache.org/clutch.html Note: Please keep the dependencies as minimal as possible, so this script can be operated by any Incubator committer. It uses only standard modules. For example, if they are not really into python then the default version on Mac OSX Tiger is still python-2.3.5 Note: The 'svn log' queries will only run on UNIX. ''' # FIXME: mentorsList needs better splitting. # FIXME: Mail list detection could be improved. # FIXME: Mail list detection. See svn comments with 2009-11-13 rush bug fix. # FIXME: Occasional trailing slash issue in Clutch cache. # FIXME: Some projects use different names in different contexts, and cannot # be automatically handled, e.g. Lucene.Net, log4php (some of their stats # are missing). # FIXME: Perhaps send some error reporting to a log file: # - validate the dates. 
#   - detect short description, e.g. Hama = Hama
# FIXME: Better/more exception handling, e.g. url open
# FIXME: Need various output formats:
#   - source docs xml file in site-author/clutch.xml (now happening)
#   - simple text list of project names and basic data clutch.txt (now happening)
#   - Notation3 or DOAP or RDFa or some such? (not yet)
#   - python pickle (now happening)
# FIXME: Parse Robert's "audit" stuff.
# FIXME: Add option "--verbose".
# FIXME: Detect if they have SVN repo yet.
#   - http://svn.apache.org/repos/asf/incubator/* ensure more than ".."
# FIXME: Similarly with website. Ensure that there is some content length.
# FIXME: See some other suggestions on the general@ list.
# FIXME: See some other suggestions in issue INCUBATOR-78.

import commands
import datetime
import HTMLParser
import os.path
import pickle
import re
import urllib2
import xml.dom.minidom

# FIXME: use proper options

# Issue some extra debug information.
optionVerbose = False

# Use the persistent data to speed operations.
# Occasionally bad data is cached (e.g. experimenting with developing new code).
# So need to ignore the cached data and perform all resource availability tests.
optionUseClutchState = True

projects = {}  # internal data, keyed by project id
otherIssues = []  # free-form issue messages collected during the run
output = {}  # output persistent data to be utilised by other tools
mentorsProjects = {}  # internal data, mentor name -> comma-separated projects

# Take the timestamp once so that the datetime and its printable form always
# describe the same instant.
gatherDate = datetime.datetime.utcnow()
gatherDateString = gatherDate.ctime()

# Cut-off dates used when tallying recent status-file updates.
delta = datetime.timedelta(days=61)
statusTallyDate1 = gatherDate - delta
delta = datetime.timedelta(days=122)
statusTallyDate2 = gatherDate - delta
delta = datetime.timedelta(days=273)
statusTallyDate3 = gatherDate - delta

# These expressions are used often, so compile them early.
commentsRE = re.compile("(.*) *\(([^)]+)\)") startDateRE = re.compile("([0-9]+)-0?([0-9]+)-?0?([0-9]+)?") statusLogRE = re.compile("^([0-9]+)-0?([0-9]+)-0?([0-9]+)") svnRevisionSkipRE = re.compile("707389|708087|708420|708791|709356|709648|711153|744365|761864|788239|796085|804825") mailListRE = re.compile("^([-a-z0-9]+)@([a-z]+)\.apache\.org") mailListNameRE = re.compile("^([a-z]+)-([-a-z0-9]+)") mailListNameUrlRE = re.compile("/([a-z]+)-([-a-z0-9]+)/$") urlHttpRE = re.compile("^http") newCommitterRE = re.compile("[nN]ew [cC]ommitt?er") releasesRE = re.compile("dist/incubator/([-a-z]+)/") linkReportingSchedule = 'ReportingSchedule' linkIncubationTable = 'IncubationTable' # Import the persistent data. # This enables us to skip detection of website etc. if already detected. inputFile = open('clutch.pkl', 'rb') state = pickle.load(inputFile) inputFile.close() def ignorecasecmp(left, right): return cmp(left.upper(), right.upper()) print "Gather data from the ReportingSchedule ..." # Parse the wiki ReportingSchedule to gather project details req = urllib2.Request( url='http://wiki.apache.org/incubator/ReportingSchedule?action=raw') text = urllib2.urlopen(req).read() tokens = re.findall('\*.+', text) for token in tokens: token = token.strip() # strip whitespace token = token.replace('* ', '') # strip bullet symbol token = token.replace('!', '') # strip wiki markup if re.match('Monthly', token): group = 'month' continue if re.match('January, ', token): group = 'group-1' continue if re.match('February, ', token): group = 'group-2' continue if re.match('March, ', token): group = 'group-3' continue # print '%(1)s: %(2)s' % {'1': group, '2': token} match = re.search(commentsRE, token) reportingCommments = "" if match: name = match.group(1).strip() reportingComments = match.group(2).strip() else: name = token id = name.lower() id = id.replace(' ', '') # strip spaces from project ID try: projects[id] except KeyError: projects[id] = {} projects[id]['name'] = name # Set some 
defaults projects[id]['reportingMonthly'] = False projects[id]['reportingComments'] = "" projects[id]['hasReportingGroup'] = True projects[id]['hasStatusEntry'] = True projects[id]['statusFileName'] = id projects[id]['statusLastUpdated'] = "" projects[id]['statusAge'] = 0 projects[id]['statusUpdateCounts'] = "" projects[id]['urlSvn'] = "" projects[id]['urlTracker'] = "" projects[id]['urlWww'] = "" projects[id]['urlDist'] = "" projects[id]['urlKeys'] = "" projects[id]['hasEntryIssues'] = False projects[id]['description'] = "" projects[id]['sponsor'] = "? not known" projects[id]['mentors'] = "" projects[id]['startDate'] = "" projects[id]['entryDate'] = None projects[id]['committersSvn'] = None projects[id]['hintMailListDev'] = "" projects[id]['hasMailListDev'] = "" projects[id]['hintMailListCommits'] = "" projects[id]['hasMailListCommits'] = "" projects[id]['numberCommitters'] = 0 projects[id]['numberCommittersNew'] = 0 # Is it reporting monthly? if group.find('month') >= 0: projects[id]['reportingMonthly'] = True projects[id]['reportingComments'] = reportingComments projects[id]['hasEntryIssues'] = True # See if we can use some persistent data, i.e. not new during this run. try: state[id] except KeyError: projects[id]['hasClutchState'] = False else: projects[id]['hasClutchState'] = True # Add their group. # If this still equals "month" at end, then there is a schedule error. projects[id]['reportingGroup'] = group # End of processing the ReportingSchedule wiki page # Process the reporting schedule data, detect some potential issues. 
projectNames = projects.keys() projectNames.sort(ignorecasecmp) for k in projectNames: #print "Name: %s" % k if projects[k]['reportingGroup'].find('month') >= 0: print 'ERROR: %s: missing group' % k projects[k]['hasReportingGroup'] = False # Parse the projects table, ensure each is present, grab more details projectsTable = {} def getText(nodelist): """http://www.python.org/doc/2.5.2/lib/minidom-example.txt""" rc = "" for node in nodelist: if node.nodeType == node.TEXT_NODE: rc = rc + node.data return rc print "Gather data from the projects-in-incubation table ..." dom = xml.dom.minidom.parse("site-author/projects/index.xml") # FIXME: dom.getElementById("current") ? table = dom.getElementsByTagName("table")[0] for row in table.getElementsByTagName("tr")[1:]: # FIXME: surely this can be improved name = getText(row.getElementsByTagName("td")[0].childNodes[1].childNodes) id = name.lower() #print "Name: %s" % name statusFileStr = row.getElementsByTagName("td")[0].childNodes[1].attributes["href"].value (dirName, fileName) = os.path.split(statusFileStr) (fileBaseName, fileExtension) = os.path.splitext(fileName) values = [getText(td.childNodes) for td in row.getElementsByTagName("td")[1:]] description, sponsor, mentors, startDate = values[:4] try: projectsTable[id] except KeyError: projectsTable[id] = {} projectsTable[id]['name'] = name projectsTable[id]['description'] = description projectsTable[id]['sponsor'] = sponsor projectsTable[id]['startDate'] = startDate projectsTable[id]['statusFileName'] = fileBaseName projectsTable[id]['mentors'] = mentors.strip() # determine projects for each mentor if projectsTable[id]['mentors'] != "": mentorsList = projectsTable[id]['mentors'].split(", ") for mentor in mentorsList: try: mentorsProjects[mentor] except KeyError: mentorsProjects[mentor] = "%s" % projectsTable[id]['name'] else: mentorsProjects[mentor] += ", %s" % projectsTable[id]['name'] else: print 'ERROR: %s: row exists' % id print "Gather data from the projects-graduated 
table ..." graduatedProjects = {} table2 = dom.getElementsByTagName("table")[1] for row in table2.getElementsByTagName("tr")[1:]: # FIXME: surely this can be improved nameGraduate = getText(row.getElementsByTagName("td")[0].getElementsByTagName("a")[0].childNodes) graduatedProjects[nameGraduate.lower()] = True dom.unlink() # Process the incubation table data, detect some potential issues. print "Gather details from project status files ..." projectNames = projectsTable.keys() projectNames.sort(ignorecasecmp) for k in projectNames: #print "Name: %s" % k try: projects[k] except KeyError: print 'ERROR: %s: Missing from reporting schedule' % k errorMsg = """%(a)s: Not listed in %(b)s, yet listed in %(c)s""" % \ {'a': k, 'b': linkReportingSchedule, 'c': linkIncubationTable} errorMsg += ". See help." otherIssues.append(errorMsg) continue if projects[k]['name'] != projectsTable[k]['name']: print "WARN: Name '%(a)s' differs from reporting schedule name '%(b)s'" % \ {'a': projectsTable[k]['name'], 'b': projects[k]['name']} statusFile = "site-author/projects/%s.xml" % projectsTable[k]['statusFileName'] if os.path.exists(statusFile): dom = xml.dom.minidom.parse(statusFile) # get the project info hints table = dom.getElementsByTagName("table")[0] for row in table.getElementsByTagName("tr")[1:]: cell = row.getElementsByTagName("td")[2] if cell.attributes.has_key('id'): values = [getText(item.childNodes) for item in cell.childNodes] value = " ".join(values).strip() #print "%(a)s=%(b)s" % {'a': cell.getAttribute('id'), 'b': value} if cell.getAttribute('id') == "mail-dev": value = value.replace(' at ', '@') value = value.replace(' Subscribe Unsubscribe', '') value = value.replace(' Archive', '') value = value.replace(' ', '@', 1) value = value.replace(' ', '') value = value.replace('@@', '@') matchMail = re.search(mailListRE, value) if matchMail: projects[k]['hintMailListDev'] = "%(a)s-%(b)s" % \ {'a': matchMail.group(2), 'b': matchMail.group(1)} continue if 
cell.getAttribute('id') == "mail-commits": value = value.replace(' at ', '@') value = value.replace(' Subscribe Unsubscribe', '') value = value.replace(' Archive', '') value = value.replace(' ', '@', 1) value = value.replace(' ', '') value = value.replace('@@', '@') matchMail = re.search(mailListRE, value) if matchMail: projects[k]['hintMailListCommits'] = "%(a)s-%(b)s" % \ {'a': matchMail.group(2), 'b': matchMail.group(1)} continue # Get hints for various url-based resources matchUrl = re.search(urlHttpRE, value) if not matchUrl: for item in cell.getElementsByTagName('a'): if item.attributes.has_key('href'): value = item.getAttribute('href') break matchUrl = re.search(urlHttpRE, value) if matchUrl: hasUrl = True else: hasUrl = False if cell.getAttribute('id') == "svn" and hasUrl: projects[k]['urlSvn'] = value continue if cell.getAttribute('id') == "tracker" and hasUrl: projects[k]['urlTracker'] = value continue if cell.getAttribute('id') == "www" and hasUrl: projects[k]['urlWww'] = value continue # Scan the project News section and count new commiters. for section in dom.getElementsByTagName("section"): if section.attributes.has_key('id') and section.getAttribute('id') == "News": for line in section.toxml().splitlines(): matchNewCommitter = re.search(newCommitterRE, line) if matchNewCommitter: projects[k]['numberCommittersNew'] += 1 dom.unlink() # end of if status file exists # end of processing incubation table data print "Gather committers data ..." # Jim's "projects" page is the easiest way. However it has a very flat # structure and the xml is not valid. Need to process the "table" which follows # each "h2" element. 
class CommittersParser(HTMLParser.HTMLParser): def __init__(self): self.projects = {} self.projectId = "default" self.projects['default'] = [] self.rowCount = 0 self.cellCount = 0 self.committerNameRE = re.compile("([a-z0-9_]+)") self.reset() def handle_starttag(self, tag, attrs): if tag == "h2": for key, value in attrs: if key == "id": self.projectId = value try: self.projects[value] except KeyError: self.projects[value] = [] break if tag == "table": self.rowCount = 0 if tag == "tr": self.rowCount += 1 self.cellCount = 0 if tag == "td": self.cellCount += 1 def handle_data(self, data): if self.cellCount == 1: match = re.search(self.committerNameRE, data) if match: name = match.group(1) if self.rowCount > 1: # The first row is the column headers. self.projects[self.projectId].append(name) committersUrl = "http://people.apache.org/~jim/projects.html" committersInput = urllib2.urlopen(committersUrl) committers = CommittersParser() committers.feed(committersInput.read()) committers.close() print "Gather incubator group mail list data ..." 
class IncubatorMailListNamesParser(HTMLParser.HTMLParser): def __init__(self): self.names = [] self.reset() def handle_starttag(self, tag, attrs): if tag == "a": for key, value in attrs: if key == "href": self.names.append(value) break mailListsUrl = "http://incubator.apache.org/mail/" mailLists = IncubatorMailListNamesParser() mailLists.feed(urllib2.urlopen(mailListsUrl).read()) mailLists.close() del mailLists.names[0:5] # the first 5 are page navigation # remove some special lists mailLists.names.remove("announce/") mailLists.names.remove("cvs/") mailLists.names.remove("general/") mailLists.names.remove("projects/") projectMailLists = {} mailListNamesRE = re.compile("(.*)-([^-]+)/") mailListNamesUsualRE = re.compile("commits|cvs|dev|issues|user|spec") for listName in mailLists.names: if optionVerbose: print "DEBUG: listName=%s" % listName if listName.find("/") == -1: continue if listName.find("-") != -1: matchList = re.search(mailListNamesRE, listName) try: projectMailLists[matchList.group(1)] except KeyError: projectMailLists[matchList.group(1)] = {} listName = listName.replace('/', '') projectMailLists[matchList.group(1)][matchList.group(2)] = listName if optionVerbose: print "DEBUG: Found list: %(a)s %(b)s" % {'a': matchList.group(1), 'b': matchList.group(2)} # FIXME: We assume that mail lists are always named like this # with "-dev" or "-commits" etc. matchListUsual = re.search(mailListNamesUsualRE, matchList.group(2)) if not matchListUsual: print "WARN: Unusual mail list name '%s'" % listName else: listName = listName.replace('/', '') try: projectMailLists[listName] except KeyError: projectMailLists[listName] = {} projectMailLists[listName]["dev"] = listName print "WARN: %(a)s: unusual mail list name '%(b)s', assuming it is their dev list" % \ {'a': listName, 'b': projectMailLists[listName]["dev"]} print "Gather incubator PGP keys data ..." 
keysNamesRE = re.compile("/dist/incubator/([^/]+)/(.*)KEYS") class KeysNamesParser(HTMLParser.HTMLParser): def __init__(self): self.names = {} self.reset() def handle_starttag(self, tag, attrs): if tag == "a": for key, value in attrs: if key == "href": matchKey = re.search(keysNamesRE, value) if matchKey: self.names[matchKey.group(1)] = {} self.names[matchKey.group(1)]['urlKeys'] = "%(a)s/%(b)s/%(c)sKEYS" % \ {'a': "http://www.apache.org/dist/incubator", 'b': matchKey.group(1), 'c': matchKey.group(2) } break keysListUrl = "http://people.apache.org/~henkp/checker/md5.html" keysList = KeysNamesParser() keysList.feed(urllib2.urlopen(keysListUrl).read()) keysList.close() print "Gather data about releases ..." releases = {} req = urllib2.Request( url='http://people.apache.org/~crossley/incubator-releases.txt') lines = urllib2.urlopen(req).readlines() for line in lines: match = re.search(releasesRE, line) if match: try: releases[match.group(1)] except KeyError: releases[match.group(1)] = True for k in releases: try: projects[k] except KeyError: try: graduatedProjects[k] except KeyError: print 'INFO: %s: dormant/retired project has remains on Incubator mirrors' % k else: print 'ERROR: %s: graduated project has remains on Incubator mirrors' % k errorMsg = """%(a)s: Has graduated, but still has remains on Incubator distribution mirrors""" % \ {'a': k} errorMsg += ". See help." otherIssues.append(errorMsg) continue print "Processing ..." # Process the reporting schedule data, correlate and ensure each exists in the # incubation projects table, add more details to the data store. 
projectNames = projects.keys() projectNames.sort(ignorecasecmp) for k in projectNames: print k try: projectsTable[k] except KeyError: print 'ERROR: %s: Missing from incubation table' % k projects[k]['hasStatusEntry'] = False continue projects[k]['description'] = projectsTable[k]['description'] projects[k]['sponsor'] = projectsTable[k]['sponsor'] projects[k]['mentors'] = projectsTable[k]['mentors'] projects[k]['startDate'] = projectsTable[k]['startDate'] statusFile = "site-author/projects/%s.xml" % projectsTable[k]['statusFileName'] if not os.path.exists(statusFile): print 'ERROR: %s: Missing status file' % k projects[k]['hasStatusEntry'] = False continue projects[k]['statusFileName'] = projectsTable[k]['statusFileName'] startDate = projects[k]['startDate'] if k == "log4php": startDate = "2007-07-04" # they had a reset, so be fair match = re.search(startDateRE, startDate) if match: if match.group(3) != None: entryDateDay = int(match.group(3)) else: entryDateDay = 1 try: entryDate = datetime.datetime( int(match.group(1)), int(match.group(2)), entryDateDay) except ValueError: print 'ERROR: %s: ValueError with date' % k else: projects[k]['entryDate'] = entryDate # Gather recent updates to their status page. # FIXME: Perhaps this operation could be improved. Use "subprocess" module. # On this older Mac still python-2.3.5, so no. See notes at top. if optionVerbose: print "DEBUG: Parsing svn log for site-author/projects/%s.xml ..." % \ projects[k]['statusFileName'] command = "svn log --xml site-author/projects/%s.xml" % \ projects[k]['statusFileName'] dom = xml.dom.minidom.parseString(commands.getoutput(command)) rowCounter = 0 count1 = 0 count2 = 0 count3 = 0 for row in dom.getElementsByTagName("logentry"): # Skip counting various commits which were to standardise the status files. 
matchSvnSkip = re.search(svnRevisionSkipRE, row.getAttribute('revision')) if matchSvnSkip: continue rowCounter += 1 date = getText(row.getElementsByTagName("date")[0].childNodes) matchSvn = re.search(statusLogRE, date) thisDate = datetime.datetime( int(matchSvn.group(1)), int(matchSvn.group(2)), int(matchSvn.group(3))) if rowCounter == 1: projects[k]['statusLastUpdated'] = "%(a)4d-%(b)02d-%(c)02d" % \ {'a': int(matchSvn.group(1)), 'b': int(matchSvn.group(2)), 'c': int(matchSvn.group(3))} if thisDate >= statusTallyDate1: count1 += 1 if thisDate >= statusTallyDate2: count2 += 1 if thisDate >= statusTallyDate3: count3 += 1 if projects[k]['entryDate'] >= statusTallyDate1: count2 = "-" if projects[k]['entryDate'] >= statusTallyDate2: count3 = "-" projects[k]['statusUpdateCounts'] = "%(a)s,%(b)s,%(c)s" % \ {'a': count1, 'b': count2, 'c': count3} dom.unlink() # end of processing print "Detect certain resources ..." for k in projectNames: print k # Add the number of committers # Sometimes the committer SVN group name contains the sponsor TLP, # e.g. portals-wsrp4j tlpSvn = "" sponsor = projects[k]['sponsor'].lower() if sponsor.find("?") >= 0: sponsor = "incubator" if sponsor.find("incubator") == -1: if sponsor.find("logging services") >= 0: sponsor = "logging" tlpSvn = "%(a)s-%(b)s" % {'a': sponsor, 'b': k} if k.find("lucene.net") >= 0: tlpSvn = "lucene-dot-net" for svnGroup in [k, tlpSvn]: if svnGroup == "": break #print "Trying committers group '%s'" % svnGroup try: committers.projects[svnGroup] except KeyError: continue else: projects[k]['numberCommitters'] = len(committers.projects[svnGroup]) projects[k]['committersSvn'] = svnGroup break if projects[k]['committersSvn'] == None: print "INFO: %s: Does not yet have committers accounts" % k # Detect if they have SVN yet. # First, try the url from their status page # then, try a standard url. 
if optionUseClutchState and projects[k]['hasClutchState'] and state[k]['urlSvn']: projects[k]['urlSvn'] = state[k]['urlSvn'] else: urlSvnDefault = "http://svn.apache.org/repos/asf/incubator/%s/" % projects[k]['statusFileName'] if urlSvnDefault == projects[k]['urlSvn']: urlSvnDefault = "" for url in [projects[k]['urlSvn'], urlSvnDefault]: if url == "": continue try: urllib2.urlopen(url) except IOError: projects[k]['urlSvn'] = "" else: projects[k]['urlSvn'] = url break if not projects[k]['urlSvn']: print 'INFO: %s: Does not yet have SVN' % k # Detect if they have Tracker yet. # First, try the url from their status page # then, try a standard url. if optionUseClutchState and projects[k]['hasClutchState'] and state[k]['urlTracker']: projects[k]['urlTracker'] = state[k]['urlTracker'] else: urlTrackerDefault = "http://issues.apache.org/jira/browse/%s" % projects[k]['statusFileName'].upper() if urlTrackerDefault == projects[k]['urlTracker']: urlTrackerDefault = "" for url in [projects[k]['urlTracker'], urlTrackerDefault]: if url == "": continue try: urllib2.urlopen(url) except IOError: projects[k]['urlTracker'] = "" else: projects[k]['urlTracker'] = url break if not projects[k]['urlTracker']: print 'INFO: %s: Does not yet have an Issue Tracker' % k # Detect if they have a website yet. # First, try the url from their status page # then, try a standard url. if optionUseClutchState and projects[k]['hasClutchState'] and state[k]['urlWww']: projects[k]['urlWww'] = state[k]['urlWww'] else: urlWwwDefault = "http://incubator.apache.org/%s/" % projects[k]['statusFileName'] if urlWwwDefault == projects[k]['urlWww']: urlWwwDefault = "" for url in [projects[k]['urlWww'], urlWwwDefault]: if url == "": continue try: urllib2.urlopen(url) except IOError: projects[k]['urlWww'] = "" else: projects[k]['urlWww'] = url break if not projects[k]['urlWww']: print 'INFO: %s: Does not yet have a website' % k # See if they have a distribution area yet. 
if optionUseClutchState and projects[k]['hasClutchState'] and state[k]['urlDist']: projects[k]['urlDist'] = state[k]['urlDist'] else: urlDist = "http://www.apache.org/dist/incubator/%s/" % \ projects[k]['statusFileName'] urlMirror = "http://www.apache.org/dyn/closer.cgi/incubator/%s/" % \ projects[k]['statusFileName'] try: urllib2.urlopen(urlDist) except IOError: print 'INFO: %s: Does not yet have a distribution area' % k else: projects[k]['urlDist'] = urlMirror # Detect if they have a PGP KEYS file if projects[k]['urlDist']: try: keysList.names[k] except KeyError: print 'INFO: %s: Does not yet have a PGP KEYS file' % k else: projects[k]['urlKeys'] = keysList.names[k]['urlKeys'] # Detect mail lists established # First, try the list names from their status page # then, try a standard list name under incubator. # To reduce network queries, if it is an incubator-hosted list then look up in # the list of mail-lists already gathered, otherwise it is a TLP-hosted list, # so try getting the archives URL. 
projectName = k if projectName.find("lucene.net") == 0: projectName = "lucene-net" for listType in ['dev', 'commits']: if listType == "dev": mailListHintKey = "hintMailListDev" mailListKey = "hasMailListDev" else: mailListHintKey = "hintMailListCommits" mailListKey = "hasMailListCommits" if optionVerbose: print "DEBUG: Looking for mailList: %s" % projects[k][mailListHintKey] matchMail = re.search(mailListNameRE, projects[k][mailListHintKey]) if matchMail: mailListGroup = "%s" % matchMail.group(1) mailListNameHint = "%s" % matchMail.group(2) else: mailListGroup = "incubator" mailListNameHint = "" if optionVerbose: print "DEBUG: Trying mailListGroup=%(a)s mailListNameHint=%(b)s" % \ {'a': mailListGroup, 'b': mailListNameHint} if mailListGroup == "incubator": mailListNameDefault = "%(a)s-%(b)s" % {'a': projectName, 'b': listType} if mailListNameDefault == mailListNameHint: mailListNameDefault = "" for listName in [mailListNameHint, mailListNameDefault]: if listName == "": continue if optionVerbose: print "DEBUG: Trying listName=%s" % listName try: projectMailLists[projectName] except KeyError: print "INFO: %s: Does not yet have incubator group mail lists" % k break try: projectMailLists[projectName][listType] except: print "INFO: %(a)s: Does not yet have hinted incubator mail list '%(b)s-%(c)s'" % \ {'a': k, 'b': projectName, 'c': listType} projects[k][mailListKey] = "" else: projects[k][mailListKey] = "http://mail-archives.apache.org/mod_mbox/incubator-%(a)s/" % \ {'a': projectMailLists[projectName][listType]} if optionVerbose: print "DEBUG: Successful Incubator mail url: %s" % projects[k][mailListKey] # End of processing incubator group mail list. 
else: listName = projects[k][mailListHintKey] url = "http://mail-archives.apache.org/mod_mbox/%s/" % listName if optionVerbose: print "DEBUG: Trying mail url: %s" % url try: urllib2.urlopen(url) except IOError: projects[k][mailListKey] = "" else: projects[k][mailListKey] = url if optionVerbose: print "DEBUG: Successful TLP mail url: %s" % url if not projects[k][mailListKey] and listName != "": print "INFO: %(a)s: Does not yet have mail list '%(b)s'" % {'a': k, 'b': listName} # End of processing project mail lists. # end of processing print "Output the data ..." reportingGroups = {'month': 'Monthly', 'group-1': 'January,April,July,October', 'group-2': 'February,May,August,November', 'group-3': 'March,June,September,December'} monthsLong = 'January February March April May June July August September October November December'.split() nameCurrentReport = "%(a)s%(b)s" % {'a': monthsLong[gatherDate.month-1], 'b': gatherDate.year} urlCurrentReport = "http://wiki.apache.org/incubator/%s" % nameCurrentReport fileXml = open('site-author/clutch.xml', 'w') fileList = open('site-author/clutch.txt', 'w') headerXml = """ Status of the clutch
Status of the clutch currently in incubation

Clutch is a tool which gathers details about the projects currently in incubation and re-generates the table below. It aims to encourage and nurture, to provide an overview of the state of establishment of various resources, and to provide quick access to those resources. See more notes below regarding purpose and interpretation.

Clutch last gathered: %(gatherDate)s UTC.
Number of podlings in incubation: %(numProjects)s

""" % {'gatherDate': gatherDateString, 'numProjects': len(projects) } fileXml.write(tableTopXml) fileXml.write(tableColumnHeadersXml) fileList.write('#identifier,name,sponsor\n') reportList1 = "" reportList2 = "" reportList3 = "" tableRowCount = 0 tableRowCountMid = int(len(projects) / 2) for k in projectNames: tableRowCount += 1 if tableRowCount == tableRowCountMid: fileXml.write(tableColumnHeadersXml) fileXml.write(' \n' % k) fileXml.write(' \n' % projects[k]['name']) output[k] = {} output[k]['podlingName'] = projects[k]['name'] if projects[k]['sponsor'].find("?") >= 0: fileXml.write(' \n' % projects[k]['sponsor']) else: fileXml.write(' \n' % projects[k]['sponsor']) output[k]['sponsor'] = projects[k]['sponsor'] output[k]['description'] = projects[k]['description'] output[k]['mentors'] = projects[k]['mentors'] fileXml.write(' \n' % projects[k]['startDate']) output[k]['startDate'] = projects[k]['startDate'] # elapsedDays column fileXml.write(' \n') if not projects[k]['reportingMonthly']: fileXml.write(' \n' % projects[k]['reportingMonthly']) else: fileXml.write(' \n' % projects[k]['reportingMonthly']) output[k]['reportingMonthly'] = projects[k]['reportingMonthly'] fileXml.write(' \n' % projects[k]['reportingGroup']) output[k]['reportingGroup'] = reportingGroups[projects[k]['reportingGroup']] reportDevList = '"%s Developers"' % projects[k]['name'] if projects[k]['hasMailListDev']: matchDevMail = re.search(mailListNameUrlRE, projects[k]['hasMailListDev']) if matchDevMail: mailListGroup = "%s" % matchDevMail.group(1) mailListNameHint = "%s" % matchDevMail.group(2) reportDevList += ' <%(a)s@%(b)s.apache.org>' % \ {'a': matchDevMail.group(2), 'b': matchDevMail.group(1)} else: reportDevList += ' ' else: reportDevList += ' ' if optionVerbose: print "DEBUG: reportDevList=%s" % reportDevList if projects[k]['reportingMonthly']: reportList1 += '%s\n' % reportDevList reportList2 += '%s\n' % reportDevList reportList3 += '%s\n' % reportDevList else: if 
(projects[k]['reportingGroup'] == "group-1"): reportList1 += "%s\n" % reportDevList elif (projects[k]['reportingGroup'] == "group-2"): reportList2 += "%s\n" % reportDevList elif (projects[k]['reportingGroup'] == "group-3"): reportList3 += "%s\n" % reportDevList if projects[k]['hasReportingGroup']: fileXml.write(' \n' % projects[k]['hasReportingGroup']) else: fileXml.write(' \n' % projects[k]['hasReportingGroup']) if projects[k]['hasStatusEntry']: fileXml.write(' \n' % \ {'name': projects[k]['statusFileName'], 'entry': projects[k]['hasStatusEntry']}) else: fileXml.write(' \n' % projects[k]['hasStatusEntry']) fileXml.write(' \n' % projects[k]['statusLastUpdated']) # statusAge column fileXml.write(' \n') fileXml.write(' \n' % projects[k]['statusUpdateCounts']) if projects[k]['numberCommitters'] > 0: if projects[k]['numberCommitters'] > 2: fileXml.write(' \n' % \ {'a': projects[k]['committersSvn'], 'b': projects[k]['numberCommitters']}) else: fileXml.write(' \n' % \ {'a': projects[k]['committersSvn'], 'b': projects[k]['numberCommitters']}) else: fileXml.write(' \n') if projects[k]['numberCommittersNew'] > 0: if projects[k]['numberCommittersNew'] > 1: fileXml.write(' \n' % \ projects[k]['numberCommittersNew']) else: fileXml.write(' \n' % \ projects[k]['numberCommittersNew']) else: fileXml.write(' \n') if projects[k]['urlSvn']: fileXml.write(' \n' % projects[k]['urlSvn']) else: fileXml.write(' \n') output[k]['urlSvn'] = projects[k]['urlSvn'] if projects[k]['urlTracker']: fileXml.write(' \n' % projects[k]['urlTracker']) else: fileXml.write(' \n') output[k]['urlTracker'] = projects[k]['urlTracker'] matchUrl = re.search(urlHttpRE, projects[k]['hasMailListDev']) if matchUrl: hasUrl = True else: hasUrl = False if hasUrl: fileXml.write(' \n' % projects[k]['hasMailListDev']) else: fileXml.write(' \n') output[k]['hasMailListDev'] = projects[k]['hasMailListDev'] matchUrl = re.search(urlHttpRE, projects[k]['hasMailListCommits']) if matchUrl: hasUrl = True else: hasUrl = False if 
hasUrl: fileXml.write(' \n' % projects[k]['hasMailListCommits']) else: fileXml.write(' \n') output[k]['hasMailListCommits'] = projects[k]['hasMailListCommits'] if projects[k]['urlWww']: fileXml.write(' \n' % \ projects[k]['urlWww']) else: fileXml.write(' \n') output[k]['urlWww'] = projects[k]['urlWww'] if projects[k]['urlDist']: fileXml.write(' \n' % \ projects[k]['urlDist']) else: fileXml.write(' \n') output[k]['urlDist'] = projects[k]['urlDist'] if projects[k]['urlKeys']: fileXml.write(' \n' % \ projects[k]['urlKeys']) else: fileXml.write(' \n') try: releases[k] except KeyError: fileXml.write(' \n') else: fileXml.write(' \n' % \ projects[k]['urlDist']) fileXml.write(' \n') fileList.write('%(a)s,"%(b)s","%(c)s"\n' % \ {'a': k, 'b': projects[k]['name'], 'c': projects[k]['sponsor'], }) # End of rows fileXml.write(tableColumnHeadersXml) fileXml.write("
%s%s%s%s%s%s%s%s%s%(entry)s%s%s%s%(b)s%(b)s-%s%s0TrueFalseTrueFalseTrueFalseTrueFalseTrueFalseTrueFalseTrueFalseFalseTrue
\n
\n") # Other issues fileXml.write("""
Other issues

Occasionally there are other issues, e.g. Not listed in ReportingSchedule, yet listed in IncubationTable.

\n
\n") notesXml = """
Notes

Any Incubator committer can run 'clutch'. In the top-level of the "incubator/public/trunk" SVN, do: 'python clutch.py' which will generate a source xml file at "site-author/clutch.xml". Do 'svn diff' to see what changed. Then build and deploy the site as normal.

It reads the Wiki page at ReportingSchedule then builds a list of projects, adds some attributes, does some validation. Then it reads the source file for the "projects currently in incubation" table, ensures that each new project has an entry, adds some more attributes. Clutch then gathers some data from each project's Status page, makes other guesses, and validates that certain facilities are established and steps are achieved.

The clutch is the set of eggs which have been laid. The mother hen (our Incubator PMC) needs to continually gather them, to ensure that none have rolled from the nest or moved to the outside, being forgotten or becoming cold. Clutch also means to grasp eagerly.

So please interpret this table from the point-of-view of encouragement and nurture.

""" fileXml.write(notesXml) stepsXml = """
Steps

Essentially Clutch is helping us all to follow the process from the "Acceptance" phase onwards (explained in Process Description and Incubation Policy and Mentor Guide and Other Guides and summarised at Podling Bootstrap). Any committer on a podling can help with these steps. Don't leave it to your mentors, although there are some steps that only they can do.

The first steps are the "Acceptance" phase. See doc and doc and doc.

The orange and vermilion items indicate where more care and attention is needed. The following notes for each column, expand on the column definitions above and attempt to link directly to the relevant process documentation:

""" fileXml.write(stepsXml) fileXml.write("""
Mentors projects

This list is gathered from the Projects in incubation summary table.

\n
\n") footerXml = """
Data files

Data files are also generated so that other tools can easily re-use the metadata for the set of projects currently in incubation:

""" fileXml.write(footerXml) fileXml.close() fileList.close() fileReport1 = open('site-author/report_due_1.txt', 'w') fileReport1.write("%s" % reportList1) fileReport1.close() fileReport2 = open('site-author/report_due_2.txt', 'w') fileReport2.write("%s" % reportList2) fileReport2.close() fileReport3 = open('site-author/report_due_3.txt', 'w') fileReport3.write("%s" % reportList3) fileReport3.close() # Create the persistent data file. outputFile = open('clutch.pkl', 'wb') pickle.dump(output, outputFile, pickle.HIGHEST_PROTOCOL) outputFile.close() print "Done. Generated site-author/clutch.xml file." print "Now you need to re-build the site, as usual."