#!/usr/bin/env python

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Purpose: Clutch gathers details about projects currently in incubation.

The core resource is the SITE_CONTENT/podlings.xml file. As soon as a project is
accepted into incubation, please add its entry. 
This script reads the SITE_CONTENT/podlings.xml table, and each podling status page, and
other resources. The assembled metadata is stored in various data files.

See further explanation at http://incubator.apache.org/clutch/

Note: Please keep the dependencies as minimal as possible, so this script can
be operated by any Incubator committer. It uses only standard modules.

Note: The 'svn log' queries might only run on UNIX, YMMV.
'''

'''
External input data files used:
- SITE_CONTENT/podlings.xml

URLs
http://people.apache.org/~crossley/incubator-keys.txt

Created on minotaur using:
find /www/www.apache.org/dist/incubator \
  -iname "*KEYS*" | grep -v "\.svn\/" > ~/public_html/incubator-keys.txt

http://people.apache.org/~crossley/incubator-releases.txt

Created on minotaur using:
find /www/www.apache.org/dist/incubator \
  -iname "*incubat*gz.asc" -o -iname "*incubat*gz.sig" \
  -o -iname "*incubat*bz2.asc" -o -iname "*incubat*bz2.sig" \
  -o -iname "*incubat*zip.asc" -o -iname "*incubat*zip.sig" \
  > ~/public_html/incubator-releases.txt

http://people.apache.org/~crossley/incubator-releases-bad-filename.txt

Created on minotaur using:
find /www/www.apache.org/dist/incubator \
  -iname "*gz.asc" -o -iname "*gz.sig" \
  -o -iname "*bz2.asc" -o -iname "*bz2.sig" \
  -o -iname "*zip.asc" -o -iname "*zip.sig" \
  | sed 's/.*\/incubator\///' \
  | grep -v incubat \
  > ~/public_html/incubator-releases-bad-filename.txt

The above has now been replaced by parsing the output of
'svn', 'ls', '-R', 'https://dist.apache.org/repos/dist/release/incubator'

asf-authorization-template from Git deployment branch
http://mail-archives.apache.org/mod_mbox/
http://www.apache.org/dist/incubator/<resource>
http://svn.apache.org/repos/asf/incubator

SVN commands 
'svn', 'ls', '-R', 'https://dist.apache.org/repos/dist/release/incubator'
'svn', 'ls', '--xml', 'http://svn.apache.org/repos/asf/incubator/'
'svn', 'log', '--xml', 'SITE_CONTENT/projects/{0}.xml' {status file}

Output data files created:
SITE_CONTENT/clutch/clutch.txt
SITE_CONTENT/clutch/_includes/clutcho1.ad
SITE_CONTENT/clutch/_includes/clutcho2.ad
SITE_CONTENT/clutch/_includes/clutcht.ad
SITE_CONTENT/clutch/_includes/clutchr.ad
SITE_CONTENT/clutch/_includes/clutchm.ad
SITE_CONTENT/clutch/_includes/clutchmy.ad
SITE_CONTENT/clutch/podlings_graduated.txt
SITE_CONTENT/clutch/podlings_retired.txt
SITE_CONTENT/clutch/report_due_1.txt
SITE_CONTENT/clutch/report_due_2.txt
SITE_CONTENT/clutch/report_due_3.txt

Pickle file:
- clutch2.pkl (O)
'''

# FIXME: Mail list detection could be improved.
# FIXME: Mail list detection. See svn comments with 2009-11-13 rush bug fix.
# FIXME: Occasional trailing slash issue in Clutch cache.
# FIXME: Some projects use different names in different contexts, and cannot
#        be automatically handled, e.g. Lucene.Net, log4php (some of their stats
#        are missing).
#        See beginning attempt to handle this with "resourceNames".
# FIXME: Perhaps send some error reporting to a log file:
#        - validate the dates.
#        - detect short description, e.g. Hama = Hama
#        Note that after 2019-03 many errors are put into the clutch.pkl
# FIXME: Better/more exception handling, e.g. url open
# FIXME: Need various output formats:
#        - source docs xml file in clutch*.ent (now happening)
#        - simple text list of project names and basic data clutch.txt (now happening)
#        - Notation3 or DOAP or RDFa or some such? (not yet)
#        - python pickle (now happening)
# FIXME: Parse Robert's "audit" stuff. (won't fix)
# FIXME: Detect if they have SVN repo yet. (won't fix) (as of 2019 only one podling is in svn)
#        - http://svn.apache.org/repos/asf/incubator/* ensure more than ".."
# FIXME: Similarly with website. Ensure that there is some content length.
#        Solution:
#        (1) At program start update STATUS file with the value 'STARTED'
#        (2) At completion update STATUS file with the value 'COMPLETE'
#        (3) In the script that runs this program check the STATUS and output files to assure
#            consistency. If that fails then abort the process.
# FIXME: Get better hints from Status pages, e.g. sometimes they don't link
#        to their "tracker" etc. they just use text.
#        (We are now looking at the text and finding URLs.)
# FIXME: News parser gets extra committer if source has commented xml template.
#        (News is not a reliable source for new committers as most podlings aren't using it)
# FIXME: Use fragments via other files for the sets of html notes.
#        (will be updating notes for clutch in newer asciidoc based incubator site)
# FIXME: See some other suggestions on the general@ list.
#        (not going to review 10 years of general@ to find these)
# FIXME: See some other suggestions in clutch.html#notes-2
#        (links to email thread are stale(404/523) - need to update historical note)
# FIXME: Better deal with input/output/unicode.
#        (much of the output calls for utf-8)
# FIXME: See some other suggestions in issue INCUBATOR-78.
#        (Most of these are acted upon by the creation of clutch.py and clutch2report.py)

import sys
# Fail fast on old interpreters: the script uses Python-3-only modules
# (html.parser, urllib.request) and 3.2+ behaviour.
if sys.version_info < (3, 2):
    raise Exception("Python 3.2 or above is required")

import subprocess
from subprocess import Popen, PIPE
import datetime
from html.parser import HTMLParser
import os.path
import pickle
import pprint
import re
import urllib.request
import urllib.error
import urllib.parse
import xml.dom.minidom
import argparse
import io
import json

# constants for external data ---
# infra moved to github
# ASF_AUTH_TEMPLATE = 'https://raw.githubusercontent.com/apache/infrastructure-puppet/deployment/modules/subversion_server/files/authorization/asf-authorization-template'

# Currently prefer these, but consider switch to lists.apache.org
MAIL_LIST_URL = "http://mail-archives.apache.org/mod_mbox/"

# All project git repositories in Apache are organized by Apache Infra and can be found on Gitbox.
GITBOX_DIR = "https://gitbox.apache.org/repositories.json"

# All project committers["members"] and pmc members["owners"] are provided on whimsy
PROJECT_LDAP = "https://whimsy.apache.org/public/public_ldap_projects.json"

# All podlings site scan results
PODLING_SITE = "https://whimsy.apache.org/public/pods-scan.json"

# Constant for site content location and clutch reports ---
SITE_CONTENT_DIR = 'content/'
CLUTCH_CONTENT_DIR = SITE_CONTENT_DIR + 'clutch/'

parser = argparse.ArgumentParser(
    description='Gather details about projects currently in incubation.')
#parser.add_argument('--ignoreState',    action='store_true',
#                    default='True', help='Ignore state (default true)')
# BUGFIX: these 'store_true' flags previously used the *string* 'False' as
# their default, which is truthy; the derived option checks below only worked
# by accident ('False' != True / 'False' == True). Use real booleans.
parser.add_argument('-v', '--verbose',  action='store_true',
                    default=False, help='verbose mode (default false)')
parser.add_argument('-q', '--quiet',    action='store_true',
                    default=False, help='quiet mode (default false)')
parser.add_argument('-x', '--external', action='store_true', default=False,
                    help='log external requests (e.g. svn, http) (default false)')
args = parser.parse_args()

# Normal level of info
optionInfo = not args.quiet

# Issue some extra debug information (implies optionInfo).
optionVerbose = args.verbose
if optionVerbose:
    optionInfo = True

# Use the persistent data to speed operations.
# Occasionally bad data is cached (e.g. experimenting with developing new code).
# So need to ignore the cached data and perform all resource availability
# tests.
optionUseClutchState = False
# stickiness is not needed during development and is bad in current build from scratch state.
#args.ignoreState != True

# Should we log external requests?
optionExternal = args.external

# Utility functions ----


def logexternal(string):
    """Echo an external-resource access (svn, http, ...) to stdout, but only
    when the -x/--external command line option was given."""
    if not optionExternal:
        return
    print("External: " + string)


def getUrl(url, encoding=None, errors=None):
    """Open *url* and return the response object.

    When *encoding* is given, the raw byte stream is wrapped in a text
    wrapper decoding with that encoding (and the given *errors* policy).
    A 5 second timeout keeps invalid URLs from stalling the whole run.
    """
    logexternal(url)
    # ensure invalid URLs don't cause long wait
    resp = urllib.request.urlopen(url, timeout=5)
    if not encoding:
        return resp
    return io.TextIOWrapper(resp, encoding=encoding, errors=errors)


def osExec(cmd):
    """Run external command *cmd* (a list of argv strings) and return its
    captured stdout as bytes.  Blocks until the command completes.

    Note: parameter renamed from 'list', which shadowed the builtin.
    All call sites pass the argument positionally.
    """
    logexternal(" ".join(cmd))
    return subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]


def osPopen(cmd):
    """Start external command *cmd* (a list of argv strings) and return the
    Popen object; stdout is a text-mode pipe the caller can iterate.

    Note: parameter renamed from 'list', which shadowed the builtin.
    All call sites pass the argument positionally.
    """
    logexternal(" ".join(cmd))
    return subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)

def getText(nodelist):
    """Concatenate and return the data of all TEXT_NODE entries in *nodelist*.

    Non-text nodes (elements, comments) are skipped, not recursed into.
    Based on http://www.python.org/doc/2.5.2/lib/minidom-example.txt
    """
    return "".join(node.data for node in nodelist
                   if node.nodeType == node.TEXT_NODE)

def normaliseSVNurl(url):
    """Normalise an SVN URL for comparison: force the http:// scheme and
    guarantee a single trailing slash.

    BUGFIX: the original tested rc[-1], which raises IndexError on an
    empty string; endswith() is safe for any input.
    """
    rc = url.replace('https://', 'http://')
    if not rc.endswith('/'):
        rc = rc + '/'
    return rc

def checkStatus(k, projectList, status):
    """Check the status file of a graduated/retired podling *k*.

    Warns when the status file is missing, and prints the boilerplate
    <span> line to add when the file lacks one recording the transition.
    *projectList* maps resource name to the podling's metadata dict and
    *status* is 'graduated' or 'retired'.
    """
    statusFile = SITE_CONTENT_DIR + "projects/{0}.xml".format(k)
    entry = projectList[k]
    if not os.path.exists(statusFile):
        print("WARN: Cannot find {0}".format(statusFile))
        return
    try:
        dom = xml.dom.minidom.parse(statusFile)
        spans = dom.getElementsByTagName("span")
        if len(spans) < 1:
            print("INFO: Missing from status file: " + statusFile)
            print("      <p><span class='{2}'>The {0} project {2} on {1}</span></p>".format(entry['name'], entry['enddate'], status))
    except Exception as err:
        print("ERROR: Exception processing " + statusFile + " : " + str(err))
        raise

projects = {}  # internal data, derived from podlings.xml
otherIssues = []  # human-readable error strings gathered for the report
persist = {}  # persistent data to be utilised by other tools
mentorsProjects = {}  # internal data: mentor username -> list of project names
mentorsName = {}  # internal data: mentor username -> display name

# Reference timestamps for this run, all in UTC.
gatherDate = datetime.datetime.utcnow()
gatherDateString = datetime.datetime.utcnow().ctime()
gatherYear = gatherDate.year
# Cut-off dates used to tally how recently each status file was updated.
# 2 months
delta = datetime.timedelta(days=61)
statusTallyDate1 = gatherDate - delta
# 4 months
delta = datetime.timedelta(days=122)
statusTallyDate2 = gatherDate - delta
# 9 months
delta = datetime.timedelta(days=273)
statusTallyDate3 = gatherDate - delta

# Regular expressions ---
# These expressions are used often, so compile them early.
# Loose Y-M(-D) date, tolerating a missing day and leading zeros.
startDateRE = re.compile("([0-9]+)-0?([0-9]+)-?0?([0-9]+)?")
# Strict Y-M-D date at the start of an 'svn log' entry line.
statusLogRE = re.compile("^([0-9]+)-0?([0-9]+)-0?([0-9]+)")
# SVN revisions to ignore when counting status file updates
# (bulk/whitespace commits that touched every status file).
svnRevisionSkipRE = re.compile(
    "707389|708087|708420|708791|709356|709648|711153|744365|761864|788239|796085|804825|894972|940767|959869|1065888|1153764|1159079|1373730|1479744|1494479|1515212|1855460")
# list@project.apache.org -> (list, project)
mailListRE = re.compile("^([-a-z0-9]+)@([a-z]+)\.apache\.org")
# project-list mail list name -> (project, list)
mailListNameRE = re.compile("^([a-z]+)-([-a-z0-9]+)")
# same split, taken from a trailing URL path component
mailListNameUrlRE = re.compile("/([a-z]+)-([-a-z0-9]+)/$")
urlHttpRE = re.compile("^http")
# podling directory name in a dist-mirror CGI path
distMirrorRE = re.compile("cgi/incubator/([-a-z0-9]+)/")
# 'New committer(s) (N)' phrases in status-page News items; group(1) is the
# optional count.
newCommitterRE = re.compile("[nN]ew [cC]omm?itt?ers? ?\(?([0-9]+)?")

# Import the persistent data ---
# This enables us to skip detection of website etc. if already detected.
#inputFile = open('clutch2.pkl', 'rb')
#state = pickle.load(inputFile)
#inputFile.close()

print("START:  Incubator Podlings: {0}{1}".format(SITE_CONTENT_DIR,"podlings.xml"))
print("START:      See https://whimsy.apache.org/incubator/podlings/by-age")
print("START:  Whimsy Project LDAP: {0}".format(PROJECT_LDAP))
print("START:      See https://whimsy.apache.org/public/")
print("START:  Whimsy Podling Website Scan: {0}".format(PODLING_SITE))
print("START:      See https://whimsy.apache.org/pods/")
print("START:  Gitbox Repository Directory: {0}".format(GITBOX_DIR))
print("START:      See https://gitbox.apache.org/repos/asf")

# read from directory of Apache Gitbox repositories
s = urllib.request.urlopen(GITBOX_DIR).read().decode('utf-8')
gitboxRepos = json.loads(s)
gitbox = {}
# Sanity check: the full ASF directory has far more than 100 projects, so a
# short answer means the service returned an error page or partial data.
if not gitboxRepos or len(gitboxRepos["projects"]) < 100:
    print("ABORT:  Gitbox Repository Directory is not available - {0}".format(GITBOX_DIR))
    sys.exit(1)
gitbox["incubator"] = gitboxRepos["projects"]["incubator"]["repositories"]

# read from project ldap
s = urllib.request.urlopen(PROJECT_LDAP).read().decode('utf-8')
projectLDAP = json.loads(s)["projects"]
if len(projectLDAP) < 100:
    print("ABORT:  Whimsy project LDAP is not available - {0}".format(PROJECT_LDAP))
    sys.exit(2)

# read from podling site scan
s = urllib.request.urlopen(PODLING_SITE).read().decode('utf-8')
podlingSiteScan = json.loads(s)
# BUGFIX: this previously re-tested len(projectLDAP), so a broken podling
# site scan was never detected.  The scan only covers current podlings (a
# few dozen), so require it to be non-empty rather than >= 100 entries.
if not podlingSiteScan:
    print("ABORT:  Whimsy podling website scan is not available - {0}".format(PODLING_SITE))
    sys.exit(3)

# Parse the podlings data file ---
# BUGFIX: minidom.parse() raises on malformed XML instead of returning a
# falsy value, so the old 'if not dom' test could never trigger; catch the
# exception to make the intended abort path reachable.
try:
    dom = xml.dom.minidom.parse(SITE_CONTENT_DIR + "podlings.xml")
except Exception:
    print("ABORT:  Incubator content/podlings.xml is not parsable - {0}{1}".format(SITE_CONTENT_DIR,"podlings.xml"))
    sys.exit(4)

graduatedProjects = {}      # resource -> {'name', 'startdate', 'enddate'}
graduatingOrRetiring = []   # ids of podlings flagged <graduating/> or <retiring/>
retiredProjects = {}        # resource -> {'name', 'startdate', 'enddate'}

print("Gather data from podlings.xml ...")
for row in dom.getElementsByTagName("podling"):
    name = row.getAttribute("name").strip()
    id = name.lower()
    id = id.replace(' ', '')  # strip spaces from project ID
    startDate = row.getAttribute("startdate")
    endDate = row.getAttribute("enddate")
    resource = row.getAttribute("resource")

    if row.getAttribute("status") == 'graduated':
        graduatedProjects[resource.lower()] = {'name': name, 'startdate': startDate, 'enddate': endDate}

    if row.getAttribute("status") == 'retired':
        retiredProjects[resource.lower()] = {'name': name, 'startdate': startDate, 'enddate': endDate}

    if row.getAttribute("status") == 'current':
        if optionVerbose:
            print("Name: {0} - {1}".format(name, resource.lower()))

        if id in projects:
            print("ERROR:  {0}: row exists".format(id))
            errorMsg = ["00","Duplicate entry in podlings.xml"]
            projects[id]['zFixMeList'].append(errorMsg)
        else:
            projects[id] = {}
            # strip spaces from project name (as per original ReportingSchedule)
            # TODO is this still needed? Or should the @name attribute not
            # contain spaces?
            projects[id]['fullName'] = name
            projects[id]['name'] = name.replace(' ', '')
            projects[id]['zFixMeList'] = []
            projects[id]['resource'] = resource
            # Set some defaults
            needMetadata = False
            projects[id]['reportingMonthly'] = False
            projects[id]['reportingComments'] = ""
            projects[id]['hasReportingGroup'] = True
            # currently needed for reporting phase
            projects[id]['reportingGroup'] = 'month'
            projects[id]['hasStatusEntry'] = True
            projects[id]['statusFileName'] = id
            projects[id]['statusLastUpdated'] = ""
            projects[id]['statusAge'] = 0
            projects[id]['statusUpdateCounts'] = ""
            projects[id]['urlSvn'] = ""
            projects[id]['urlGit'] = ""
            projects[id]['repositories'] = ""
            # Record the podling's gitbox repositories, sorted by repo name.
            try:
                if gitboxRepos["projects"][resource]:
                    d = gitboxRepos["projects"][resource]["repositories"]
                    projects[id]['repositories'] = sorted(d.items(), key=lambda x: x[0])
            except (Exception) as e:
                print("Exception Gitbox repositories " + resource + " : " + str(e))
            projects[id]['urlTracker'] = ""
            projects[id]['urlWiki'] = ""
            projects[id]['urlWww'] = ""
            projects[id]['urlDistSVN'] = ""
            projects[id]['urlDist'] = ""
            projects[id]['urlKeys'] = ""
            projects[id]['releases'] = []
            projects[id]['distributions'] = []
            projects[id]['distribHash'] = []
            projects[id]['hasEntryIssues'] = False
            projects[id]['resourceNames'] = [id]
            # Some projects use an alternate short resource name
            # rather than their project name
            alias = row.getAttribute("resource")
            if (alias != '' and alias != id):
                projects[id]['resourceNames'].append(alias)
            for alias in row.getAttribute("resourceAliases").split(','):
                if alias != '':
                    projects[id]['resourceNames'].append(alias)
            projects[id]['entryDate'] = None
            projects[id]['committersSvn'] = None
            projects[id]['news'] = []
            projects[id]['hintMailListDev'] = ""
            projects[id]['hasMailListDev'] = ""
            projects[id]['hintMailListCommits'] = ""
            projects[id]['hasMailListCommits'] = ""
            projects[id]['numberCommittersNew'] = 0
            # Committer/PPMC counts come from the whimsy LDAP dump.
            try:
                projects[id]['numberCommitters'] = len(projectLDAP[resource]["members"])
                projects[id]['numberPMCMembers'] = len(projectLDAP[resource]["owners"])
            except (Exception) as e:
                print("ERROR: Exception project LDAP " + resource + " : " + str(e))
                projects[id]['zFixMeList'].append(["01","No project LDAP"])
                projects[id]['numberCommitters'] = 0
                projects[id]['numberPMCMembers'] = 0
            # Flag podlings whose PPMC has not grown beyond the committer list.
            projects[id]['numberPMCEquals'] = (
                projects[id]['numberCommitters'] == projects[id]['numberPMCMembers'])

            descElements = row.getElementsByTagName("description")
            projects[id]['description'] = getText(descElements[0].childNodes)
            if 'FIXME' in projects[id]['description']:
                needMetadata = True
            projects[id]['sponsor'] = row.getAttribute("sponsor")
            projects[id]['startDate'] = startDate
            projects[id]['statusFileName'] = row.getAttribute("resource")

            # Mentors are stored as (displayName, username) tuples.
            mentors = [(mentor.firstChild.data.strip(), mentor.getAttribute("username"))
                       for mentor in row.getElementsByTagName("mentor")]
            projects[id]['mentors'] = mentors
            # The podling template leaves 'FIXME' placeholders in mentor names.
            if 'FIXME' in [mentorName for mentorName, _ in mentors]:
                needMetadata = True
            if needMetadata:
                errorMsg = "{0}: Need to add incubation metadata.".format(id)
                print('ERROR:  ', errorMsg)
                errorMsg += " Please maintain your records in the content/podlings.xml file. See link:#h-hasstatusfile[help]"
                otherIssues.append(errorMsg)
                errorMsg = ["02","Podling metadata missing in content/podlings.xml file. This is ground truth."]
                projects[id]['zFixMeList'].append(errorMsg)
            # accumulate projects for each mentor
            for mentorName, mentorUsername in mentors:
                mentorsProjects.setdefault(mentorUsername, []).append(name)
                mentorsName[mentorUsername] = mentorName

            isGraduating = row.getElementsByTagName("graduating").length > 0
            if isGraduating:
                graduatingOrRetiring.append(id)
                # BUGFIX: XML attribute names are case-sensitive and the file
                # uses 'enddate', so getAttribute("endDate") always returned
                # '' and this warning fired even when the date was recorded.
                if not endDate:
                    errorMsg = "{0}: Has graduated, but still needs to follow the graduation steps.".format(id)
                    print('ERROR: ', errorMsg)
                    errorMsg += " See link:#h-graduate[help]."
                    otherIssues.append(errorMsg)
                    errorMsg = ["03","Graduated, but some graduation steps are incomplete"]
                    projects[id]['zFixMeList'].append(errorMsg)

            isRetiring = row.getElementsByTagName("retiring").length > 0
            if isRetiring:
                graduatingOrRetiring.append(id)
                # BUGFIX: as above, test the real 'enddate' attribute value.
                if not endDate:
                    errorMsg = "{0}: Has retired, but still needs to follow the retirement steps.".format(id)
                    print('ERROR: ', errorMsg)
                    errorMsg += " See link:#h-retire[help]."
                    otherIssues.append(errorMsg)
                    errorMsg = ["04","Retired, but some retirement steps are incomplete"]
                    projects[id]['zFixMeList'].append(errorMsg)

            # Is it reporting monthly?
            reporting = row.getElementsByTagName("reporting")
            if reporting.length != 1:
                projects[id]['hasReportingGroup'] = False
                if not isGraduating:
                    errorMsg = "SEVERE: {0}: expecting a singleton report group".format(id)
                    print(errorMsg)
                    otherIssues.append(errorMsg)
                    errorMsg = ["05","Reporting group is either missing or a multiple"]
                    projects[id]['zFixMeList'].append(errorMsg)
            else:
                if reporting[0].getAttribute("monthly").lower() == 'true':
                    projects[id]['reportingMonthly'] = True
                    projects[id]['reportingComments'] = getText(reporting)
                    projects[id]['hasEntryIssues'] = True
                group = reporting[0].getAttribute("group")
                # BUGFIX: getAttribute() returns '' (never None) for a missing
                # attribute, so the old 'group == None' check was unreachable.
                if not group:
                    projects[id]['hasReportingGroup'] = False
                    errorMsg = "SEVERE: {0}: missing group attribute".format(id)
                    print(errorMsg)
                    otherIssues.append(errorMsg)
                    errorMsg = ["06","Reporting group is missing"]
                    projects[id]['zFixMeList'].append(errorMsg)
                else:
                    projects[id]['reportingGroup'] = 'group-' + group

dom.unlink()

# Verify that podlings which left incubation record the transition in
# their status file.
for statusLabel, podlingMap in (('graduated', graduatedProjects),
                                ('retired', retiredProjects)):
    for key in sorted(podlingMap):
        checkStatus(key, podlingMap, statusLabel)

# Process the incubation table data, detect some potential issues. ---

print("Gather details from project status files ...")


def _mailListHint(value):
    """Normalise the text of a status-page mail cell (typically
    'list  at  podling.apache.org  Subscribe  Unsubscribe  Archive') and
    return a mod_mbox style hint 'podling-list', or '' when no
    ...@...apache.org address can be recognised."""
    value = value.replace('  at  ', '@')
    value = value.replace('  Subscribe  Unsubscribe', '')
    value = value.replace('  Archive', '')
    value = value.replace(' ', '@', 1)
    value = value.replace(' ', '')
    value = value.replace('@@', '@')
    matchMail = re.search(mailListRE, value)
    if matchMail:
        return "{0}-{1}".format(matchMail.group(2), matchMail.group(1))
    return ""


projectNames = list(projects.keys())
for k in sorted(projectNames, key=str.lower):
    if optionVerbose:
        print("DEBUG: Processing status file for {0}".format(k))

    # Append more potential alternate names for a project
    if projects[k]['statusFileName'] not in projects[k]['resourceNames']:
        projects[k]['resourceNames'].append(projects[k]['statusFileName'])
    if optionVerbose and len(projects[k]['resourceNames']) > 1:
        print("DEBUG: Will try alternate names: {0}".format(
            projects[k]['resourceNames']))

    # parse their project status file to extract specific information
    statusFile = SITE_CONTENT_DIR + \
        "projects/{0}.xml".format(projects[k]['statusFileName'])
    if os.path.exists(statusFile):
        try:
            dom = xml.dom.minidom.parse(statusFile)
        except (Exception) as e:
            print("ERROR: Exception processing " + statusFile + " : " + str(e))
            raise
        # get the project info hints
        if optionVerbose:
            print("DEBUG: Gather hints from project Status page")
            # BUGFIX: these DOM dumps previously ran unconditionally for
            # every podling; keep them, but only in verbose mode.
            print(dom.nodeName)
            for node in dom.childNodes:
                for elem in node.childNodes:
                    print(elem)
        # The first table of the status page holds the project info rows;
        # the third cell of each row carries an id attribute naming the hint.
        table = dom.getElementsByTagName("table")[0]
        for row in table.getElementsByTagName("tr")[1:]:
            if (len(row.getElementsByTagName("td")) < 3):
                continue
            cell = row.getElementsByTagName("td")[2]
            if 'id' in cell.attributes:
                values = [getText(item.childNodes) for item in cell.childNodes]
                value = " ".join(values).strip()
                if value == "":
                    value = getText(cell.childNodes).strip()
                if optionVerbose:
                    print("DEBUG: Hint: {0}={1}".format(
                        cell.getAttribute('id'), value))
                if cell.getAttribute('id') == "mail-dev":
                    hint = _mailListHint(value)
                    if hint:
                        projects[k]['hintMailListDev'] = hint
                    continue
                if cell.getAttribute('id') == "mail-commits":
                    hint = _mailListHint(value)
                    if hint:
                        projects[k]['hintMailListCommits'] = hint
                    continue
                # Get hints for various url-based resources; when the cell
                # text is not itself a URL, fall back to its first link href.
                matchUrl = re.search(urlHttpRE, value)
                if not matchUrl:
                    for item in cell.getElementsByTagName('a'):
                        if 'href' in item.attributes:
                            value = item.getAttribute('href')
                            break
                hasUrl = re.search(urlHttpRE, value)
                if cell.getAttribute('id') == "svn" and hasUrl:
                    if value.endswith(".git"):
                        # a git URL placed in the svn cell is not an svn repo
                        value = ""
                    elif value.endswith('/'):
                        value = value[0:-1]
                    projects[k]['urlSvn'] = value
                    continue
                if cell.getAttribute('id') == "git" and hasUrl:
                    projects[k]['urlGit'] = value
                    continue
                if cell.getAttribute('id') == "tracker" and hasUrl:
                    projects[k]['urlTracker'] = value
                    if optionVerbose:
                        print("{0}: urlTracker={1}".format(k, value))
                    continue
                if cell.getAttribute('id') == "www" and hasUrl:
                    # a bare scheme or similarly truncated value is useless
                    if len(value) < 10:
                        value = ""
                    projects[k]['urlWww'] = value
                    if optionVerbose:
                        print("{0}: urlWww={1}".format(k, value))
                    continue
                if cell.getAttribute('id') == "wiki" and hasUrl:
                    projects[k]['urlWiki'] = value
                    if optionVerbose:
                        print("{0}: urlWiki={1}".format(k, value))
                    continue
        # Scan the project News section and count new commiters.
        for section in dom.getElementsByTagName("section"):
            if 'id' in section.attributes and section.getAttribute('id') == "News":
                for line in section.toxml().splitlines():
                    if '<!--' in line:
                        continue
                    if not '<li>' in line:
                        continue
                    ll = line.split('<li>')
                    ll = ll[1].split('</li>')
                    projects[k]['news'].append(ll[0])
                    matchNewCommitter = re.search(newCommitterRE, line)
                    if matchNewCommitter:
                        # 'New committers (3)' adds 3; a bare mention adds 1.
                        if matchNewCommitter.group(1):
                            projects[k]['numberCommittersNew'] += int(matchNewCommitter.group(1))
                        else:
                            projects[k]['numberCommittersNew'] += 1
        dom.unlink()
    # end of if status file exists

# end of processing incubation table data

# Gather incubator group mail list data ---

print("Gather incubator group mail list data ...")


class IncubatorMailListNamesParser(HTMLParser):
    """Extract Incubator mailing-list names from the mail-list index page.

    After feed()/close():
    - 'names' holds every Incubator list name found in anchor hrefs, with
      the 'incubator-' prefix and any slashes stripped;
    - 'newStyle' holds project names taken from <option> values that use
      the '<project>.incubator' style, with the suffix stripped.
    """

    def __init__(self):
        # Let HTMLParser initialise itself (it calls reset() and records
        # convert_charrefs) instead of poking at its internals directly.
        super().__init__(convert_charrefs=False)
        self.strict = True  # retained for compatibility; ignored by modern HTMLParser
        self.names = []
        self.newStyle = []

    def handle_starttag(self, tag, attrs):
        # Get the newStyle projects: <option value="foo.incubator">
        if tag == "option":
            for key, value in attrs:
                if (key == "value" and ".incubator" in value):
                    value = value.replace('.incubator', '')
                    self.newStyle.append(value)

        # Get all Incubator lists: <a href="incubator-NAME/">
        if tag == "a":
            for key, value in attrs:
                if (key == "href" and "incubator" in value):
                    value = value.replace('incubator-', '')
                    value = value.replace('/', '')
                    self.names.append(value)
                    break

# Fetch the mail-list index page and parse out Incubator list names.
mailLists = IncubatorMailListNamesParser()
mailLists.feed(getUrl(MAIL_LIST_URL).read().decode('utf-8'))
mailLists.close()
if optionVerbose:
    pprint.pprint(mailLists.names)
    pprint.pprint(mailLists.newStyle)

# Build projectMailLists: podling-name -> {list-type: full-list-name}.
projectMailLists = {}
# "podling-type" splits on the LAST hyphen: group(1)=podling, group(2)=type.
mailListNamesRE = re.compile("(.*)-([^-]+)")
mailListNamesUsualRE = re.compile(
    "announce|commits|cvs|dev|issues|notifications|reviews|user|users|spec")
for listName in mailLists.names:
    # Skip the incubator-wide lists; they are not podling lists.
    if listName in ["announce", "cvs", "general", "projects", "dev", "commits", "user"]:
        continue
    if optionVerbose:
        print("DEBUG: listName=" + listName)
    if ('-' in listName):
        matchList = re.search(mailListNamesRE, listName)
        podlingName = matchList.group(1)
        listType = matchList.group(2)
        listName = listName.replace('/', '')
        # setdefault avoids the double-lookup try/except KeyError dance.
        projectMailLists.setdefault(podlingName, {})[listType] = listName
        if optionVerbose:
            print("DEBUG: Found list: {0} {1}".format(
                podlingName, listType))
            if (podlingName not in mailLists.newStyle):
                print("DEBUG: Uses oldStyle list set-up")
        # FIXME: We assume that mail lists are always named like this
        # with "-dev" or "-commits" etc.
        matchListUsual = re.search(mailListNamesUsualRE, listType)
        if optionVerbose and not matchListUsual:
            errorMsg = "WARN: Unusual mail list name '{0}'".format(listName)
            print(errorMsg)
            otherIssues.append(errorMsg)
    else:
        # No hyphen at all: record it as the podling's dev list and warn.
        listName = listName.replace('/', '')
        podlingLists = projectMailLists.setdefault(listName, {})
        podlingLists['dev'] = listName
        errorMsg = "WARN: {0}: unusual mail list name '{1}', assuming it is their dev list".format(
            listName, podlingLists['dev'])
        print(errorMsg)
        otherIssues.append(errorMsg)
#        projects[projectMailLists[listName]]['zFixMeList'].append(errorMsg)

if optionVerbose:
    print("DEBUG: projectMailLists")
    pprint.pprint(projectMailLists)

# Gather incubator PGP keys data ---

print("Gather incubator PGP keys data and releases ...")

keysList = {}         # podling -> URL of its KEYS file on the dist area
releases = {}         # podlings with a release named with incubating/incubator
releasesBadName = {}  # podlings with a release missing that naming marker
releasesListing = {}  # dist path -> {'user','revision','dtm','size'} from 'svn ls -Rv'
distributions = {}    # podling -> list of paths of signed (.asc/.sig) artifacts
distribHash = {}      # podling -> list of checksum-file paths for those artifacts
distareas = {}  # podlings with dist areas

# Walk the recursive listing of the Incubator release dist area and record,
# per podling: KEYS files, signed distributions, and checksum files.
with osPopen(['svn', 'ls', '-Rv', 'https://dist.apache.org/repos/dist/release/incubator']) as s:
    for line in s.stdout:
        line = line.strip()
        if line[-1:] == '/':
            # skip directories
            continue
        listing = line.split(' ')
        revision = "r{0}".format(listing[0])
        user = listing[1]
        # 'svn ls -v' prints a time instead of the year for recent entries,
        # which leaves an empty field at listing[-6]; older entries carry
        # "Mon DD YYYY" directly.
        if listing[-6] == '':
            dtm1 = datetime.datetime.strptime(" ".join(listing[-4:-2]) + " " + str(gatherYear), "%b %d %Y")
            if dtm1 > gatherDate:
                # A month/day in the future must belong to the previous year.
                dtm1 = datetime.datetime.strptime(" ".join(listing[-4:-2]) + " " + str(gatherYear - 1), "%b %d %Y")
            fsize = listing[-5]
        else:
            dtm1 = datetime.datetime.strptime(" ".join(listing[-5:-1]), "%b %d %Y")
            fsize = listing[-6]
        dtm = dtm1.strftime("%m/%d/%Y")
        line = listing[-1]
        releasesListing[line] = {
            'user': user,
            'revision': revision,
            'dtm': dtm,
            'size': fsize
            }
        fields = line.split('/')
        podling = fields[0]
        distareas[podling] = True
        file = fields[-1]
        if file:
            # Raw strings keep the regex escapes valid (no DeprecationWarning).
            if re.search(r'KEYS(\.txt)?$', file):
                keysList[podling] = "{0}/{1}".format("https://downloads.apache.org/incubator", line)
            if re.search(r'\.(asc|sig)$', file, flags=re.IGNORECASE):
                path = "/".join(fields[1:])
                if optionVerbose:
                    print("DEBUG: {0} - {1}".format(podling, path))
                # setdefault replaces the old bare try/except and also appends
                # correctly even if the list were empty.
                distributions.setdefault(podling, []).append(path)
                if re.search('incubat(ing|or)', file, flags=re.IGNORECASE):
                    releases[podling] = True
                else:
                    releasesBadName[podling] = True
                    if podling in projects and podling != "netbeans":
                        # netbeans has over 600 release parts and had legacy naming requirements
                        errorMsg = "WARN: {0}: Release is missing incubator/incubating in file name {1}".format(podling,file)
                        print(errorMsg)
                        errorMsg = ["42","Distribution is missing incubator/incubating in file name {0}".format(file)]
                        projects[podling]['zFixMeList'].append(errorMsg)
            if re.search(r'\.(sha512|sha1|sha256|sha|md5)$', file, flags=re.IGNORECASE):
                path = "/".join(fields[1:])
                if optionVerbose:
                    print("DEBUG: {0} - {1}".format(podling,path))
                # Some projects have hashed the detached signature itself
                # (e.g. foo.tar.gz.asc.sha512). Identify and ignore those.
                if ".".join(path.split('.')[-2:-1]) == "asc":
                    continue
                hashes = distribHash.setdefault(podling, [])
                if hashes:
                    # sha512 is preferred and is provided last, so a later hash
                    # for the same artifact replaces the earlier one.
                    previous = ".".join(hashes[-1].split('.')[:-1])
                    current = ".".join(path.split('.')[:-1])
                    if previous == current:
                        hashes[-1] = path
                    else:
                        hashes.append(path)
                else:
                    hashes.append(path)

# Flag podlings that left incubation but still have artifacts on the mirrors.
for podling in releases:
    if podling in projects:
        continue
    if podling in graduatedProjects:
        msg = "{0}: Has graduated, but still has remains on Incubator distribution mirrors".format(
            podling)
        print("ERROR: ", msg)
        otherIssues.append(msg + ". See link:#h-graduate[help].")
        continue
    if podling in retiredProjects:
        msg = "{0}: retired project has remains on Incubator mirrors".format(podling)
        print("WARN: ", msg)
        otherIssues.append(msg + ". See link:#h-retire[help].")

# Flag distributions whose filenames lack the required incubating marker.
for podling in releasesBadName:
    msg = '{0}: Has a distribution filename missing the word "incubating/incubator"'.format(podling)
    print('WARN:', msg)
    otherIssues.append(msg + ". See link:#h-hasrelease[help].")

# Processing the gathered data ---

print("Processing ...")
# Process the reporting schedule data, correlate and ensure each exists in the
# incubation projects summary table, add more details to the data store.
projectNames = list(projects.keys())
for k in sorted(projectNames, key=str.lower):
    print(k)

    # Every podling must have a status page under SITE_CONTENT/projects/.
    statusFile = SITE_CONTENT_DIR + \
        "projects/{0}.xml".format(projects[k]['statusFileName'])
    if not os.path.exists(statusFile):
        errorMsg = "{0}: Missing status file".format(k)
        print('ERROR:  ', errorMsg)
        errorMsg += ". See link:#h-hasstatusfile[help]."
        otherIssues.append(errorMsg)
        projects[k]['hasStatusEntry'] = False
        errorMsg = ["07","Status file is missing"]
        projects[k]['zFixMeList'].append(errorMsg)
        continue

    # Parse the incubation start date (year, month, optional day).
    startDate = projects[k]['startDate']
    match = re.search(startDateRE, startDate)
    if match:
        if match.group(3) != None:
            entryDateDay = int(match.group(3))
        else:
            # Day missing: assume the first of the month.
            entryDateDay = 1
        try:
            entryDate = datetime.datetime(
                int(match.group(1)), int(match.group(2)), entryDateDay)
        except ValueError:
            errorMsg = "ERROR:  {0}: ValueError with date".format(k)
            print(errorMsg)
            errorMsg = ["08","Start date is not a valid date"]
            projects[k]['zFixMeList'].append(errorMsg)
        else:
            projects[k]['entryDate'] = entryDate

    # Gather recent updates to their status page.
    inputFile = SITE_CONTENT_DIR + \
        "projects/{0}.xml".format(projects[k]['statusFileName'])
    if optionVerbose:
        print("DEBUG: Parsing svn log for {0} ...".format(inputFile))
    outputString = osExec(['svn', 'log', '--xml', inputFile])
    dom = xml.dom.minidom.parseString(outputString)
    rowCounter = 0
    count1 = 0  # commits since statusTallyDate1
    count2 = 0  # commits since statusTallyDate2
    count3 = 0  # commits since statusTallyDate3
    # NOTE(review): if the start date failed to parse or was invalid above,
    # 'entryDate' here still holds the value from a previous loop iteration
    # (or is undefined on the very first one) — confirm this is intended.
    lastDate = entryDate
    for row in dom.getElementsByTagName("logentry"):
        # Skip counting various commits which were to standardise the status
        # files.
        matchSvnSkip = re.search(
            svnRevisionSkipRE, row.getAttribute('revision'))
        if matchSvnSkip:
            continue
        rowCounter += 1
        date = getText(row.getElementsByTagName("date")[0].childNodes)
        matchSvn = re.search(statusLogRE, date)
        thisDate = datetime.datetime(
            int(matchSvn.group(1)), int(matchSvn.group(2)), int(matchSvn.group(3)))
        if thisDate > lastDate:
            lastDate = thisDate
        if rowCounter == 1:
            # 'svn log' lists newest entries first, so the first counted row
            # is the most recent status-page edit.
            projects[k]['statusLastUpdated'] = "{0:4d}-{1:02d}-{2:02d}".format(
                int(matchSvn.group(1)), int(matchSvn.group(2)), int(matchSvn.group(3)))
        if thisDate >= statusTallyDate1:
            count1 += 1
        if thisDate >= statusTallyDate2:
            count2 += 1
        if thisDate >= statusTallyDate3:
            count3 += 1
    # Podlings younger than a tally window get "-" instead of a count.
    # NOTE(review): assumes projects[k]['entryDate'] exists for every podling
    # at this point — KeyError otherwise; confirm it is pre-seeded upstream.
    if projects[k]['entryDate'] >= statusTallyDate1:
        count2 = "-"
    if projects[k]['entryDate'] >= statusTallyDate2:
        count3 = "-"
    projects[k]['statusUpdateCounts'] = "{0},{1},{2}".format(
        count1, count2, count3)
    # Days since the status page was last touched (or since entry, if never).
    projects[k]['statusAge'] = (gatherDate-lastDate).days

    dom.unlink()

# end of processing

# Collect SVN directory names ---

print("Collect SVN directory names")
incubatorSvnDirs = {}  # top-level SVN incubator dirs
outputString = osExec(
    ['svn', 'ls', '--xml', 'http://svn.apache.org/repos/asf/incubator/'])
dom = xml.dom.minidom.parseString(outputString)
"""
Sample output
<lists>
  <list path="http://svn.apache.org/repos/asf/incubator">
    <entry kind="file">
    <name>REPO-ORGANISATION.txt</name>
    ...
    </entry>
    <entry kind="dir">
    <name>accumulo</name>
    ...
"""
# Record one URL per top-level directory, skipping non-podling entries.
for dirEntry in dom.getElementsByTagName("entry"):
    if dirEntry.getAttribute("kind") != 'dir':
        continue
    dirName = dirEntry.getElementsByTagName("name")[0].firstChild.data
    if dirName in ('trunk', 'public'):  # skip non-podling entries
        continue
    dirUrl = "http://svn.apache.org/repos/asf/incubator/{0}/".format(dirName)
    incubatorSvnDirs[dirUrl] = True

# An empty result means the repository query failed; nothing else is reliable.
if not incubatorSvnDirs:
    print("ABORT:  Incubator http://svn.apache.org/repos/asf/incubator is not available")
    sys.exit(5)

# Detect certain resources ---

print("Detect certain resources ...")
# For each podling, probe its issue tracker, wiki, website, distribution
# area, KEYS file, release hashes, and mailing lists; missing resources are
# printed and recorded in projects[k]['zFixMeList'].
for k in sorted(projectNames, key=str.lower):
    print(k)

    # Add the number of committers
    # Sometimes the committer SVN group name contains the sponsor TLP,
    # e.g. portals-wsrp4j
#    svnGroups = projects[k]['resourceNames'][:]
#    sponsor = projects[k]['sponsor'].lower()
#    if '?' in sponsor:
#        sponsor = "incubator"
#    if not 'incubator' in sponsor:
#        tlpSvn = "{0}-{1}".format(sponsor, k)
#        svnGroups.append(tlpSvn)
#    for svnGroup in svnGroups:
#        if optionVerbose:
#            print("DEBUG: Trying committers group '{0}'".format(svnGroup))
#        if svnGroup in committers_projects:
#            projects[k]['numberCommitters'] = len(
#                committers_projects[svnGroup])
#            projects[k]['committersSvn'] = svnGroup
#            break
#        else:
#            continue
#    if projects[k]['committersSvn'] == None and optionInfo:
#        print("INFO: {0}: Does not yet have committers accounts".format(k))

    # Detect if they have Tracker yet.
    # First, try the url from their status page
    # then, try a standard url.(BAD - removed)
    urlTrackerDefault = "" # no default
    #urlTrackerDefault = "https://issues.apache.org/jira/browse/{0}".format(
    #        projects[k]['statusFileName'].upper())
    for url in [projects[k]['urlTracker'], urlTrackerDefault]:
        if url == "":
            continue
        if optionVerbose:
            print("DEBUG: Trying Tracker URL: " + url)
        # A fetch that raises IOError means the URL is unusable; clear it.
        try:
            if url[-1] == '/':
                url = url[0:-1]
            getUrl(url)
            projects[k]['urlTracker'] = url
            break
        except IOError:
            projects[k]['urlTracker'] = ""
    if not projects[k]['urlTracker']:
        errorMsg = "INFO: {0}: Has not yet provided an Issue Tracker link".format(k)
        print(errorMsg)
        errorMsg = ["10","Has not provided an issue tracker link"]
        projects[k]['zFixMeList'].append(errorMsg)

    # Detect if they have Wiki yet.
    # First, try the url from their status page
    # then, possibly, try a standard url.
    urlWikiDefault = "" # no default
    for url in [projects[k]['urlWiki'], urlWikiDefault]:
        if url == "":
            continue
        if optionVerbose:
            print("DEBUG: Trying Wiki URL: " + url)
        try:
            if url[-1] == '/':
                url = url[0:-1]
            getUrl(url)
            projects[k]['urlWiki'] = url
            break
        except IOError:
            projects[k]['urlWiki'] = ""
    if not projects[k]['urlWiki']:
        errorMsg = "INFO: {0}: Has not yet provided a Wiki link".format(k)
        print(errorMsg)
        errorMsg = ["18","Has not provided a wiki link"]
        projects[k]['zFixMeList'].append(errorMsg)

    # Detect if they have a website yet.
    # First, try the url from their status page
    # then, try a standard url.
    urlWwwDefault = "http://{0}.incubator.apache.org/".format(
        projects[k]['statusFileName'])
    urlWwwDefault2 = "http://incubator.apache.org/{0}/".format(
        projects[k]['statusFileName'])
    # Blank out defaults that duplicate the status-page URL to avoid
    # fetching the same address twice.
    if urlWwwDefault == projects[k]['urlWww']:
        urlWwwDefault = ""
    if urlWwwDefault2 == projects[k]['urlWww']:
        urlWwwDefault2 = ""
    for url in [projects[k]['urlWww'], urlWwwDefault, urlWwwDefault2]:
        if url == "":
            continue
        try:
            if url[-1] == '/':
                url = url[0:-1]
            getUrl(url)
            projects[k]['urlWww'] = url
            break
        except IOError:
            projects[k]['urlWww'] = ""
    if not projects[k]['urlWww']:
        errorMsg = "INFO: {0}: Does not yet have a website".format(k)
        print(errorMsg)
        errorMsg = ["11","Website is missing"]
        projects[k]['zFixMeList'].append(errorMsg)
    elif "apache.org" not in projects[k]['urlWww']:
        errorMsg = "INFO: {0}: Does not yet have an Apache website. Instead {1}".format(k,projects[k]['urlWww'])
        print(errorMsg)
        errorMsg = ["09","Apache website is missing. Instead: {0}".format(projects[k]['urlWww'])]
        projects[k]['zFixMeList'].append(errorMsg)

    if projects[k]['urlWww']:
        # if the website exists then look into site scan for possible issues.
        # podlingSiteScan appears to be keyed by the podling's 'resource' name.
        j = projects[k]['resource']
        try:
            if podlingSiteScan[j]:
                if not podlingSiteScan[j]['foundation']:
                    errorMsg = "INFO: {0}: Does not yet have website foundation link".format(k)
                    print(errorMsg)
                    errorMsg = ["31","ASF Foundation link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['events']:
                    errorMsg = "INFO: {0}: Does not yet have website events link".format(k)
                    print(errorMsg)
                    errorMsg = ["32","ASF Events link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['license']:
                    errorMsg = "INFO: {0}: Does not yet have website license link".format(k)
                    print(errorMsg)
                    errorMsg = ["33","Apache License link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['thanks']:
                    errorMsg = "INFO: {0}: Does not yet have website thanks link".format(k)
                    print(errorMsg)
                    errorMsg = ["34","Apache Thanks link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['security']:
                    errorMsg = "INFO: {0}: Does not yet have website security link".format(k)
                    print(errorMsg)
                    errorMsg = ["35","Apache Security link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['sponsorship']:
                    errorMsg = "INFO: {0}: Does not yet have website sponsorship link".format(k)
                    print(errorMsg)
                    errorMsg = ["36","Apache Sponsorship link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['trademarks']:
                    errorMsg = "INFO: {0}: Does not yet have website trademarks".format(k)
                    print(errorMsg)
                    errorMsg = ["37","Trademarks are missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['copyright']:
                    errorMsg = "INFO: {0}: Does not yet have website copyright".format(k)
                    print(errorMsg)
                    errorMsg = ["38","Copyright is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                # The 'disclaimer' key may be absent from the scan data;
                # treat a missing key the same as a missing disclaimer.
                flag = False
                try:
                    if not podlingSiteScan[j]['disclaimer']:
                        flag = True
                except (Exception) as e:
                    flag = True
                if flag:
                    errorMsg = "INFO: {0}: Does not yet have website disclaimer".format(k)
                    print(errorMsg)
                    errorMsg = ["39","Disclaimer is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
            else:
                print('missing from site scan')
        except (Exception) as e:
            # Defensive catch-all: the site-scan data may be incomplete.
            print("ERROR: {0}: Exception processing: ".format(k) + str(e))

    # See if they have a distribution area yet.
    # Use the first resourceName that matches a dist area found earlier.
    for nameDist in projects[k]['resourceNames']:
        urlDist = "https://downloads.apache.org/incubator/{0}/".format(nameDist)
        urlMirror = "https://www.apache.org/dyn/closer.lua/incubator/{0}/".format(nameDist)
        urlDistSVN = "https://dist.apache.org/repos/dist/release/incubator/{0}".format(nameDist)
        if nameDist in distareas:
            projects[k]['urlDist'] = urlMirror
            projects[k]['urlDistSVN'] = urlDistSVN
            break
    if not projects[k]['urlDist']:
        errorMsg = "INFO: {0}: Does not yet have a distribution area".format(k)
        print(errorMsg)
        errorMsg = ["12","Release distribution area is not setup"]
        projects[k]['zFixMeList'].append(errorMsg)

    elif optionVerbose:
        print("DEBUG: dist=" + projects[k]['urlDist'])

    # Detect if they have a PGP KEYS file
    if projects[k]['urlDist']:
        match = re.search("/incubator/([^/]+)/", projects[k]['urlDist'])
        if match:
            nameDistArea = match.group(1)
            if nameDistArea in keysList:
                projects[k]['urlKeys'] = keysList[nameDistArea]
                try:
                    projects[k]['distributions'] = distributions[nameDistArea]
                except KeyError:
                    errorMsg = "INFO: {0}: Has a PGP KEYS file, but no signed distribution".format(k)
                    print(errorMsg)
                    errorMsg = ["41","PGP KEYS file available without a distribution"]
                    projects[k]['zFixMeList'].append(errorMsg)
                try:
                    projects[k]['distribHash'] = distribHash[nameDistArea]
                except KeyError:
                    dontcare = True
                try:
                    # Cross-check signed artifacts against checksum files.
                    l1 = len(projects[k]['distributions'])
                    l2 = len(projects[k]['distribHash'])
                    ld = l1-l2
                    if l1 > 8:
                        errorMsg = "WARN: {0}: Has {1} release distributions. Non-current releases should be removed".format(k,l1)
                        print(errorMsg)
                        errorMsg = ["45","{0} is a large number of release distributions. Non-current releases should be removed".format(l1)]
                        projects[k]['zFixMeList'].append(errorMsg)
                    if ld > 0:
                        errorMsg = "WARN: {0}: Has {1} signed release distributions without a hash".format(k,ld)
                        print(errorMsg)
                        errorMsg = ["43","Signed release distributions without a hash = {0}".format(ld)]
                        projects[k]['zFixMeList'].append(errorMsg)
                        pprint.pprint(projects[k]['distributions'])
                        pprint.pprint(projects[k]['distribHash'])
                    elif ld < 0:
                        ld = l2-l1
                        errorMsg = "WARN: {0}: Has {1} unsigned release distributions".format(k,ld)
                        print(errorMsg)
                        errorMsg = ["44","Unsigned release distributions = {0}".format(ld)]
                        projects[k]['zFixMeList'].append(errorMsg)
                        pprint.pprint(projects[k]['distributions'])
                        pprint.pprint(projects[k]['distribHash'])
                    else:
                        # create a dictionary of the podling releases.
                        url1 = projects[k]['urlKeys']
                        url1 = "/".join(url1.split("/")[:-1])+"/"
                        url2 = projects[k]['urlDist']
                        releasesDist = {}
                        for i in range(l1):
                            l = projects[k]['distributions'][i]
                            kk = projects[k]['distribHash'][i]
                            checksum = "".join(kk.split(".")[-1:])
                            r = ".".join(l.split(".")[:-1])
                            rr = "".join(r.split("/")[-1:])
                            folder = "/".join(r.split("/")[:-1])
                            # NOTE(review): releasesListing is keyed by the
                            # dist directory name; this assumes it equals the
                            # project key k — confirm for podlings whose dist
                            # area name differs (a mismatch raises KeyError,
                            # silently swallowed by the bare except below).
                            rMeta = releasesListing[k+"/"+r]
                            release = {
                                'folder': folder,
                                'hash': checksum, 
                                'user': rMeta['user'],
                                'revision': rMeta['revision'],
                                'dtm': rMeta['dtm'],
                                'size': rMeta['size']
                                }
                            releasesDist[rr] = release
                        projects[k]['releases'] = releasesDist
                except:
                    # NOTE(review): bare except makes this whole tally
                    # best-effort; any error above is silently ignored.
                    dontcare = True
            else:
                errorMsg = "INFO: {0}: Does not yet have a PGP KEYS file".format(k)
                print(errorMsg)
                errorMsg = ["13","Release signing PGP KEYS file is missing"]
                projects[k]['zFixMeList'].append(errorMsg)
                try:
                    if len(distributions[nameDistArea]) > 0 or len(distribHash[nameDistArea]) > 0:
                        errorMsg = "{0}: Apache Releases cannot be validated without a KEYS file. Please add required KEYS!".format(k)
                        print("SEVERE: ",errorMsg)
                        otherIssues.append(errorMsg)
                        errorMsg = ["46","Apache Releases cannot be validated without a KEYS file. Please add required KEYS!"]
                        projects[k]['zFixMeList'].append(errorMsg)
                except:
                    # No distributions/hashes recorded for this area: nothing to flag.
                    dontcare = True

    if optionVerbose:
        print("DEBUG: KEYS=" + projects[k]['urlKeys'])

    # Detect mail lists established:
    # For each alternate resourceName:
    # First, try the list names from their status page
    # then, try a standard list name under incubator.
    # To reduce network queries, if it is an incubator-hosted list then look up in
    # the list of mail-lists already gathered, otherwise it is a TLP-hosted list,
    # so try getting the archives URL.
    foundMailLists = False
    for projectName in projects[k]['resourceNames']:
        for listType in ['dev', 'commits']:
            if listType == "dev":
                mailListHintKey = "hintMailListDev"
                mailListKey = "hasMailListDev"
            else:
                mailListHintKey = "hintMailListCommits"
                mailListKey = "hasMailListCommits"
            if optionVerbose:
                print("DEBUG: Looking for mailList: " +
                      projects[k][mailListHintKey])
            matchMail = re.search(mailListNameRE, projects[k][mailListHintKey])
            if matchMail:
                mailListGroup = matchMail.group(1)
                mailListNameHint = matchMail.group(2)
            else:
                mailListGroup = "incubator"
                mailListNameHint = ""
            if optionVerbose:
                print("DEBUG: Trying mailListGroup={0} mailListNameHint={1}".format(
                    mailListGroup, mailListNameHint))
            if mailListGroup == "incubator":
                mailListNameDefault = "{0}-{1}-{2}".format(mailListGroup, projectName, listType)
                if mailListNameDefault == mailListNameHint:
                    mailListNameDefault = ""
                for listName in [mailListNameHint, mailListNameDefault]:
                    if listName == "":
                        continue
                    if optionVerbose:
                        print("DEBUG: Trying listName=" + listName)
                    if not projectName in projectMailLists:
                        if optionVerbose:
                            print("DEBUG: {0}: No incubator group mail lists using '{1}'".format(k, projectName))
                        break
                    if listType in projectMailLists[projectName]:
                        leader = 'incubator'
                        projects[k][mailListKey] = MAIL_LIST_URL + \
                            "{0}-{1}".format(projectName,listType)
                        if optionVerbose:
                            print("DEBUG: Successful Incubator mail url: " +
                                  projects[k][mailListKey])
                        foundMailLists = True
                        break
#                    else:
#                        errorMsg = "INFO: {0}: Does not yet have hinted incubator mail list '{1}-{2}'".format(k, projectName, listType)
#                        print(errorMsg)
#                        errorMsg = ["14","Expected mailing list (incubator-{0}-{1}) is missing".format(projectName, listType)]
#                        projects[k]['zFixMeList'].append(errorMsg)
#                        projects[k][mailListKey] = ""
            # End of processing incubator group mail list.
            else:
                # TLP-hosted list: verify the archive URL actually exists.
                listName = projects[k][mailListHintKey]
                url = "http://mail-archives.apache.org/mod_mbox/{0}".format(
                    listName)
                if optionVerbose:
                    print("DEBUG: Trying mail url: " + url)
                try:
                    getUrl(url)
                except IOError:
                    projects[k][mailListKey] = ""
                else:
                    projects[k][mailListKey] = url
                    if optionVerbose:
                        print("DEBUG: Successful TLP mail url: " + url)
                    foundMailLists = True
        if foundMailLists:
            break
    # End of processing project mail lists.
    if not projects[k]['hasMailListDev']:
        errorMsg = "ERROR:  {0}: Does not yet have 'dev' mail list".format(k)
        print(errorMsg)
        errorMsg = ["15","Project does not have 'dev' mailing list"]
        projects[k]['zFixMeList'].append(errorMsg)
    if not projects[k]['hasMailListCommits']:
        errorMsg = "INFO: {0}: Does not yet have 'commits' mail list".format(k)
        print(errorMsg)
        errorMsg = ["16","Project does not have 'commits' mailing list"]
        projects[k]['zFixMeList'].append(errorMsg)

# end of processing each podling to detect resource availability

# Report graduated podlings whose top-level incubator SVN directory remains.
for svnDir in sorted(incubatorSvnDirs):
    if incubatorSvnDirs[svnDir] and svnDir in graduatedProjects:
        print("INFO: graduated project has SVN directory " + svnDir)

# Output data files ---

print("Output the data ...")
# Reporting-group -> months in which those podlings report to the board.
reportingGroups = {'month': 'Monthly',
                   'group-1': 'January, April, July, October',
                   'group-2': 'February, May, August, November',
                   'group-3': 'March, June, September, December'}
monthsLong = 'January February March April May June July August September October November December'.split()
# Wiki page name/URL for the current month's board report.
nameCurrentReport = "{0}{1}".format(
    monthsLong[gatherDate.month - 1], gatherDate.year)
urlCurrentReport = "".join(
    ["http://wiki.apache.org/incubator/", nameCurrentReport])

# Use a context manager so the file is closed even if the write fails.
with open(CLUTCH_CONTENT_DIR + '_includes/clutchmy.ad', encoding='utf-8', mode='w') as fileXmlMY:
    fileXmlMY.write(
        '{0}[{1}]\n'.format(urlCurrentReport, nameCurrentReport))

# clutch.txt accumulates one line per podling; it is written to by later
# sections, so it must stay open here (closed further down the script).
fileList = open(CLUTCH_CONTENT_DIR + 'clutch.txt', 'w')

# Write the "Other Issues" include, de-duplicated on the "podling:" prefix.
with open(CLUTCH_CONTENT_DIR + '_includes/clutcho1.ad', encoding='utf-8', mode='w') as fileXmlo1:
    if len(otherIssues):
        otherXml = """==== Other Issues\n\nSometimes other issues are found which are link:#other[listed] below for:\n"""
        otherIssuesRE = re.compile("^([^:]+):.*$")
        deduplicateOthers = {}
        for issue in sorted(otherIssues, key=str.lower):
            print("issue="+issue)
            matchOtherIssues = re.search(otherIssuesRE, issue)
            # List each podling only once, no matter how many issues it has.
            prefix = matchOtherIssues.group(1)
            if prefix in deduplicateOthers:
                continue
            deduplicateOthers[prefix] = True
            otherXml += '\n* [.care]#{0}# '.format(prefix)
        otherXml += "\n"
        fileXmlo1.write(otherXml)

# Write the table-top include: gather timestamp and podling count.
# Context manager guarantees the file is closed after the write.
with open(CLUTCH_CONTENT_DIR + '_includes/clutcht.ad', encoding='utf-8', mode='w') as fileXmlt:
    tableTopXml = """
Clutch last gathered: {0} UTC.

Number of podlings in incubation: {1}
""".format(gatherDateString, len(projects))
    fileXmlt.write(tableTopXml)

# clutch.txt: CSV-style listing (identifier, name, sponsor) — header row.
fileList.write('#identifier,name,sponsor\n')
# Accumulators for the three "report due" mailing lists written later.
reportList1 = ""
reportList2 = ""
reportList3 = ""
tableRowCount = 0
# clutchr.ad: one AsciiDoc table row per podling.  This loop also fills
# the 'persist' dictionary that is pickled at the end of the script.
fileXml = open(CLUTCH_CONTENT_DIR + '_includes/clutchr.ad', encoding='utf-8', mode='w')
for k in sorted(projectNames, key=str.lower):
    tableRowCount += 1
    fileXml.write('\n|')
#    if k in graduatingOrRetiring:
#        fileXml.write('[="grad"')
    # Podling name column, linked to its detail page.
    fileXml.write('link:{0}.html[{1}]\n'.format(k,projects[k]['fullName']))
    persist[k] = {}
    persist[k]['podlingName'] = projects[k]['name']
    persist[k]['resource'] = projects[k]['resource']
    persist[k]['fullName'] = projects[k]['fullName']

    # Sponsor column: a '?' in the sponsor marks it as an issue.
    if '?' in projects[k]['sponsor']:
        fileXml.write(
            '|[.issue]#{0}#\n'.format(projects[k]['sponsor']))
    else:
        fileXml.write(
            '|{0}\n'.format(projects[k]['sponsor']))
    persist[k]['sponsor'] = projects[k]['sponsor']
    persist[k]['description'] = projects[k]['description']
    persist[k]['mentors'] = projects[k]['mentors']

    # Start date column, and days in incubation derived from it.
    fileXml.write('|{0}\n'.format(projects[k]['startDate']))
    persist[k]['startDate'] = projects[k]['startDate']
    startDate = datetime.datetime.strptime(projects[k]['startDate'],"%Y-%m-%d")
    daysIncubating = (gatherDate-startDate).days

    # elapsedDays column
    # Color-code the elapsed days: <3 months, <1 year, <18 months,
    # <2 years, and beyond.
    # NOTE(review): the '">' inside these link format strings looks like
    # leftover HTML attribute syntax; confirm the generated AsciiDoc
    # renders as intended.
    if daysIncubating < 92:
        fileXml.write('|link:/projects/{0}.html">[[.cool3]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    elif daysIncubating < 365:
        fileXml.write('|link:/projects/{0}.html">[[.cool2]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    elif daysIncubating < 552:
        fileXml.write('|link:/projects/{0}.html">[[.cool1]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    elif daysIncubating < 730:
        fileXml.write('|link:/projects/{0}.html">[[.cool3]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    else:
        fileXml.write('|link:/projects/{0}.html">[[.cool4]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    persist[k]['statusAge'] = projects[k]['statusAge']

    # Monthly-reporting column: monthly reporters are flagged with [.care].
    if not projects[k]['reportingMonthly']:
        fileXml.write(
            '|{0}\n'.format(projects[k]['reportingMonthly']))
    else:
        fileXml.write(
            '|[.care]#{0}#\n'.format(projects[k]['reportingMonthly']))
    persist[k]['reportingMonthly'] = projects[k]['reportingMonthly']

    # Reporting group column: write only the trailing group digit.
    fileXml.write(
        '|{0}\n'.format(projects[k]['reportingGroup'][-1]))
    # save the simple group number for programs that have their own ideas.
    persist[k]['rawReportingGroup'] = projects[k]['reportingGroup']
    persist[k]['reportingGroup'] = reportingGroups[
        projects[k]['reportingGroup']]
    # Build the "Name <address>" entry for the report-due mailing lists.
    reportDevList = '"{0} Developers"'.format(projects[k]['fullName'])
    if projects[k]['hasMailListDev']:
        # assume that if we have identified one they are all now the standard pattern.
        reportDevList += " <dev@{0}.incubator.apache.org>".format(
            projects[k]['resource'])
    else:
        reportDevList += " <general@incubator.apache.org>"
    if optionVerbose:
        print("DEBUG: {0}: reportDevList={1}".format(k, reportDevList))
    reportDevList += "\n"
    # Monthly reporters appear in all three lists; otherwise only in the
    # list matching their reporting group.
    if projects[k]['reportingMonthly']:
        reportList1 += reportDevList
        reportList2 += reportDevList
        reportList3 += reportDevList
    else:
        if (projects[k]['reportingGroup'] == "group-1"):
            reportList1 += reportDevList
        elif (projects[k]['reportingGroup'] == "group-2"):
            reportList2 += reportDevList
        elif (projects[k]['reportingGroup'] == "group-3"):
            reportList3 += reportDevList

    # Status-entry column: linked True, or flagged False.
    if projects[k]['hasStatusEntry']:
        fileXml.write('|link:/projects/{0}.html">[[.cool1]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['hasStatusEntry']))
    else:
        fileXml.write(
            '|[.issue]#{0}#\n'.format(projects[k]['hasStatusEntry']))

    fileXml.write(
        '|{0}\n'.format(projects[k]['statusLastUpdated']))
    persist[k]['statusLastUpdated'] = projects[k]['statusLastUpdated']

    # statusAge column
    # Color-code the days since the status page was last updated.
    if projects[k]['statusAge'] < 61:
        fileXml.write('|link:/projects/{0}.html">[[.cool1]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['statusAge']))
    elif projects[k]['statusAge'] < 122:
        fileXml.write('|link:/projects/{0}.html">[[.cool2]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['statusAge']))
    elif projects[k]['statusAge'] < 175:
        fileXml.write('|link:/projects/{0}.html">[[.cool3]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['statusAge']))
    else:
        fileXml.write('|link:/projects/{0}.html">[[.cool4]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['statusAge']))
    persist[k]['statusAge'] = projects[k]['statusAge']

    fileXml.write(
        '|{0}\n'.format(projects[k]['statusUpdateCounts']))

    # Committers column: linked to the phonebook; fewer than 3 committers
    # is flagged with [.care].
    if projects[k]['numberCommitters'] > 0:
        if projects[k]['numberCommitters'] > 2:
            fileXml.write('|https://people.apache.org/phonebook.html?podling={0}[[.cool1]#{1}#]\n'.format(
                projects[k]['resource'], projects[k]['numberCommitters']))
        else:
            fileXml.write('|https://people.apache.org/phonebook.html?podling={0}[[.care]#{1}#]\n'.format(
                projects[k]['resource'], projects[k]['numberCommitters']))
    else:
        fileXml.write('|[.care]#-#\n')

    # New-committers column.
    if projects[k]['numberCommittersNew'] > 0:
        if projects[k]['numberCommittersNew'] > 1:
            fileXml.write(
                '|[.cool1]#{0}#\n'.format(projects[k]['numberCommittersNew']))
        else:
            fileXml.write(
                '|[.cool2]#{0}#\n'.format(projects[k]['numberCommittersNew']))
    else:
        fileXml.write('|[.care]#0#\n')

    # individual podling analysis pages will use these as opposed to old style clutch page using announced new committers (flawed...)
    resource = projects[k]['resource']
    # Podlings absent from projectLDAP get an empty ldap record.
    try:
        persist[k]['ldap'] = {
            'members': projectLDAP[resource]["members"],
            'owners': projectLDAP[resource]["owners"],
            'numberCommitters': projects[k]['numberCommitters'],
            'numberCommittersNew': projects[k]['numberCommittersNew'],
            'numberPMCMembers': projects[k]['numberPMCMembers'],
            'numberPMCEquals': projects[k]['numberPMCEquals']
            }
    except:
        persist[k]['ldap'] = {}

    # Source repository column: prefer SVN, fall back to the first gitbox
    # repository, else flag False.
    if projects[k]['urlSvn']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlSvn']))
    else:
        if projects[k]['repositories']:
            # we have the dictionary of gitbox repositories and we can extract the first git repos from the sorted list.
            # NOTE(review): this None assignment is immediately
            # overwritten on the next line — dead code.
            projects[k]['urlGit'] = None
            projects[k]['urlGit'] = 'https://gitbox.apache.org/repos/asf?p={0}.git;a=tree;hb=HEAD'.format(projects[k]['repositories'][0][0])
            fileXml.write(
                '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlGit']))
        else:
            fileXml.write('|[.care]#False#\n')

    persist[k]['repositories'] = projects[k]['repositories']
    if not projects[k]['urlGit'] and not projects[k]['urlSvn']:
        errorMsg = "INFO: {0}: Does not yet have a source code repository".format(k)
        print(errorMsg)
        errorMsg = ["17","No source code repository"]
        projects[k]['zFixMeList'].append(errorMsg)
    # NOTE(review): len() here assumes urlGit is always a string; a None
    # value would raise TypeError — confirm upstream default.
    if len(projects[k]['urlGit']) > 0 and len(projects[k]['repositories']) == 0:
        errorMsg = "INFO: {0}: Does not yet have a source code repository".format(k)
        print(errorMsg)
        errorMsg = ["17","No source code repository. Declared but nonexistent"]
        projects[k]['zFixMeList'].append(errorMsg)

    # Issue-tracker column.
    if projects[k]['urlTracker']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlTracker']))
    else:
        fileXml.write('|[.care]#False#\n')

    # Dev mailing-list column: True only when the field is a URL.
    hasUrl = re.search(urlHttpRE, projects[k]['hasMailListDev'])
    if hasUrl:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['hasMailListDev']))
    else:
        fileXml.write('|[.care]#False#\n')
    persist[k]['hasMailListDev'] = projects[k]['hasMailListDev']

    # Commits mailing-list column.
    hasUrl = re.search(urlHttpRE, projects[k]['hasMailListCommits'])
    if hasUrl:
        fileXml.write('|{0}[[.cool1]#True#]\n'.format(
            projects[k]['hasMailListCommits']))
    else:
        fileXml.write('|[.care]#False#\n')
    persist[k]['hasMailListCommits'] = projects[k]['hasMailListCommits']

    # Website column.
    if projects[k]['urlWww']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlWww']))
    else:
        fileXml.write('|[.care]#False#\n')

    # Distribution-area (dist SVN) column.
    if projects[k]['urlDistSVN']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlDistSVN']))
    else:
        fileXml.write('|[.care]#False#\n')

    # Release-signing KEYS column.
    if projects[k]['urlKeys']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlKeys']))
    else:
        fileXml.write('|[.care]#False#\n')

    persist[k]['urls'] = {
        'www': projects[k]['urlWww'],
        'keys': projects[k]['urlKeys'],
        'tracker': projects[k]['urlTracker'],
        'wiki': projects[k]['urlWiki'],
        'svn': projects[k]['urlSvn'],
        'git': projects[k]['urlGit']
        }
    persist[k]['releases'] = projects[k]['releases']
    persist[k]['zFixMeList'] = projects[k]['zFixMeList']
    persist[k]['news'] = projects[k]['news']

    # Releases column: linked to the podling page's releases section.
    if len(projects[k]['releases']) > 0:
        fileXml.write('|link:{0}.html#releases[[.cool1]#True#]\n'.format(k))
    else:
        fileXml.write('|[.care]#False#\n')

    fileXml.write('\n')

    # One CSV row per podling in clutch.txt.
    fileList.write('{0},"{1}","{2}"\n'.format(
        k, projects[k]['name'], projects[k]['sponsor']))

fileXml.close()
# End of rows

# Other issues
# clutcho2.ad: the full listing of every issue, one bullet per issue.
fileXmlo2 = open(CLUTCH_CONTENT_DIR + '_includes/clutcho2.ad', encoding='utf-8', mode='w')
if len(otherIssues):
    # Fix: a redundant in-place otherIssues.sort() preceded this loop;
    # sorted() below establishes the ordering on its own.
    for issue in sorted(otherIssues, key=str.lower):
        fileXmlo2.write("* {0}\n".format(issue))
else:
    fileXmlo2.write("* No known issues.\n")
fileXmlo2.close()

# Build a display-name -> [display name, availid, projects] map, then
# emit the mentor roster ordered by display name.
mentorsList = {}
for availid in mentorsName:
    displayName = mentorsName[availid]
    mentorsList[displayName] = [displayName, availid, mentorsProjects[availid]]
mentors = sorted(mentorsList)

fileXmlm = open(CLUTCH_CONTENT_DIR + '_includes/clutchm.ad', encoding='utf-8', mode='w')
for displayName in mentors:
    record = mentorsList[displayName]
    fileXmlm.write(". {0} ({1}): {2}\n".format(
        displayName, record[1], ', '.join(record[2])))
fileXmlm.close()

fileList.close()

# Write one "report due" recipient list per reporting group.
for groupNumber, listContent in ((1, reportList1),
                                 (2, reportList2),
                                 (3, reportList3)):
    with open(CLUTCH_CONTENT_DIR + 'report_due_{0}.txt'.format(groupNumber), 'w') as reportFile:
        reportFile.write(listContent)


# Record graduated and retired podlings with their incubation date ranges.
fileReportG = open(CLUTCH_CONTENT_DIR + 'podlings_graduated.txt', 'w')
for podling in sorted(graduatedProjects, key=str.lower):
    dates = graduatedProjects[podling]
    fileReportG.write("{0},{1},{2}\n".format(podling, dates['startdate'], dates['enddate']))
fileReportG.close()

fileReportR = open(CLUTCH_CONTENT_DIR + 'podlings_retired.txt', 'w')
for podling in sorted(retiredProjects, key=str.lower):
    dates = retiredProjects[podling]
    fileReportR.write("{0},{1},{2}\n".format(podling, dates['startdate'], dates['enddate']))
fileReportR.close()

# Create the persistent data file.
# Pickle the assembled per-podling metadata for downstream scripts.
with open('clutch2.pkl', 'wb') as outputFile:
    pickle.dump(persist, outputFile, protocol=3)

# Final summary for the operator.
print("Done. Generated clutch*.ent files.")
print("Now you need to re-build the site, as usual.")
print("FINAL: current: {0}".format(len(projects)))
print("FINAL: date: {0}".format(gatherDateString))

