#!/usr/bin/env python

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Purpose: Clutch gathers details about projects currently in incubation.

The core resource is the SITE_CONTENT/podlings.xml file. As soon as a project is
accepted into incubation, please add its entry. 
This script reads the SITE_CONTENT/podlings.xml table, and each podling status page, and
other resources. The assembled metadata is stored in various data files.

See further explanation at http://incubator.apache.org/clutch/

Note: Please keep the dependencies as minimal as possible, so this script can
be operated by any Incubator committer. It uses only standard modules.

Note: The 'svn log' queries might only run on UNIX, YMMV.
'''

'''
External input data files used:
- SITE_CONTENT/podlings.xml

URLs
http://people.apache.org/~crossley/incubator-keys.txt

Created on minotaur using:
find /www/www.apache.org/dist/incubator \
  -iname "*KEYS*" | grep -v "\.svn\/" > ~/public_html/incubator-keys.txt

http://people.apache.org/~crossley/incubator-releases.txt

Created on minotaur using:
find /www/www.apache.org/dist/incubator \
  -iname "*incubat*gz.asc" -o -iname "*incubat*gz.sig" \
  -o -iname "*incubat*bz2.asc" -o -iname "*incubat*bz2.sig" \
  -o -iname "*incubat*zip.asc" -o -iname "*incubat*zip.sig" \
  > ~/public_html/incubator-releases.txt

http://people.apache.org/~crossley/incubator-releases-bad-filename.txt

Created on minotaur using:
find /www/www.apache.org/dist/incubator \
  -iname "*gz.asc" -o -iname "*gz.sig" \
  -o -iname "*bz2.asc" -o -iname "*bz2.sig" \
  -o -iname "*zip.asc" -o -iname "*zip.sig" \
  | sed 's/.*\/incubator\///' \
  | grep -v incubat \
  > ~/public_html/incubator-releases-bad-filename.txt

The above has now been replaced by parsing the output of
'svn', 'ls', '-R', 'https://dist.apache.org/repos/dist/release/incubator'

asf-authorization-template from Git deployment branch
http://mail-archives.apache.org/mod_mbox/
http://www.apache.org/dist/incubator/<resource>
http://svn.apache.org/repos/asf/incubator

SVN commands 
'svn', 'ls', '-R', 'https://dist.apache.org/repos/dist/release/incubator'
'svn', 'ls', '--xml', 'http://svn.apache.org/repos/asf/incubator/'
'svn', 'log', '--xml', 'SITE_CONTENT/projects/{0}.xml' {status file}

Output data files created:
SITE_CONTENT/clutch/clutch.txt
SITE_CONTENT/clutch/_includes/clutcho1.ad
SITE_CONTENT/clutch/_includes/clutcho2.ad
SITE_CONTENT/clutch/_includes/clutcht.ad
SITE_CONTENT/clutch/_includes/clutchr.ad
SITE_CONTENT/clutch/_includes/clutchm.ad
SITE_CONTENT/clutch/_includes/clutchmy.ad
SITE_CONTENT/clutch/podlings_graduated.txt
SITE_CONTENT/clutch/podlings_retired.txt
SITE_CONTENT/clutch/report_due_1.txt
SITE_CONTENT/clutch/report_due_2.txt
SITE_CONTENT/clutch/report_due_3.txt

Pickle file:
- clutch2.pkl (O)
'''

# FIXME: Mail list detection could be improved.
# FIXME: Mail list detection. See svn comments with 2009-11-13 rush bug fix.
# FIXME: Occasional trailing slash issue in Clutch cache.
# FIXME: Some projects use different names in different contexts, and cannot
#        be automatically handled, e.g. Lucene.Net, log4php (some of their stats
#        are missing).
#        See beginning attempt to handle this with "resourceNames".
# FIXME: Perhaps send some error reporting to a log file:
#        - validate the dates.
#        - detect short description, e.g. Hama = Hama
#        Note that after 2019-03 many errors are put into the clutch.pkl
# FIXME: Better/more exception handling, e.g. url open
# FIXME: Need various output formats:
#        - source docs xml file in clutch*.ent (now happening)
#        - simple text list of project names and basic data clutch.txt (now happening)
#        - Notation3 or DOAP or RDFa or some such? (not yet)
#        - python pickle (now happening)
# FIXME: Parse Robert's "audit" stuff. (won't fix)
# FIXME: Detect if they have SVN repo yet. (won't fix) (as of 2019 only one podling is in svn)
#        - http://svn.apache.org/repos/asf/incubator/* ensure more than ".."
# FIXME: Similarly with website. Ensure that there is some content length.
#        Solution:
#        (1) At program start update STATUS file with the value 'STARTED'
#        (2) At completion update STATUS file with the value 'COMPLETE'
#        (3) In the script that runs this program check the STATUS and output files to assure
#            consistency. If that fails then abort the process.
# FIXME: Get better hints from Status pages, e.g. sometimes they don't link
#        to their "tracker" etc. they just use text.
#        (We are now looking at the text and finding URLs.)
# FIXME: News parser gets extra committer if source has commented xml template.
#        (News is not a reliable source for new committers as most podlings aren't using it)
# FIXME: Use fragments via other files for the sets of html notes.
#        (will be updating notes for clutch in newer asciidoc based incubator site)
# FIXME: See some other suggestions on the general@ list.
#        (not going to review 10 years of general@ to find these)
# FIXME: See some other suggestions in clutch.html#notes-2
#        (links to email thread are stale(404/523) - need to update historical note)
# FIXME: Better deal with input/output/unicode.
#        (much of the output calls for utf-8)
# FIXME: See some other suggestions in issue INCUBATOR-78.
#        (Most of these are acted upon by the creation of clutch.py and clutch2report.py)

import sys
# Fail fast on old interpreters: the script uses Python-3-only modules
# (html.parser, urllib.request) and 3.2+ behaviour.
if sys.version_info < (3, 2):
    raise Exception("Python 3.2 or above is required")

import subprocess
from subprocess import Popen, PIPE
import datetime
from html.parser import HTMLParser
import os.path
import pickle
import pprint
import re
import urllib.request
import urllib.error
import urllib.parse
import xml.dom.minidom
import argparse
import io
import json

# constants for external data ---
# infra moved to github
# ASF_AUTH_TEMPLATE = 'https://raw.githubusercontent.com/apache/infrastructure-puppet/deployment/modules/subversion_server/files/authorization/asf-authorization-template'

# Currently prefer these, but consider switch to lists.apache.org
MAIL_LIST_URL = "http://mail-archives.apache.org/mod_mbox/"

# All project git repositories in Apache are organized by Apache Infra and can be found on Gitbox.
GITBOX_DIR = "https://gitbox.apache.org/repositories.json"

# All project committers["members"] and pmc members["owners"] are provided on whimsy
PROJECT_LDAP = "https://whimsy.apache.org/public/public_ldap_projects.json"

# All podlings site scan results
PODLING_SITE = "https://whimsy.apache.org/public/pods-scan.json"

# Constant for site content location and clutch reports ---
SITE_CONTENT_DIR = 'content/'
CLUTCH_CONTENT_DIR = SITE_CONTENT_DIR + 'clutch/'

parser = argparse.ArgumentParser(
    description='Gather details about projects currently in incubation.')
#parser.add_argument('--ignoreState',    action='store_true',
#                    default='True', help='Ignore state (default true)')
# BUGFIX: these 'store_true' flags previously used the *string* 'False' as
# their default, which is truthy; the derived option checks below only worked
# by accident ('False' != True / 'False' == True). Use real booleans.
parser.add_argument('-v', '--verbose',  action='store_true',
                    default=False, help='verbose mode (default false)')
parser.add_argument('-q', '--quiet',    action='store_true',
                    default=False, help='quiet mode (default false)')
parser.add_argument('-x', '--external', action='store_true', default=False,
                    help='log external requests (e.g. svn, http) (default false)')
args = parser.parse_args()

# Normal level of info
optionInfo = not args.quiet

# Issue some extra debug information (implies optionInfo).
optionVerbose = args.verbose
if optionVerbose:
    optionInfo = True

# Use the persistent data to speed operations.
# Occasionally bad data is cached (e.g. experimenting with developing new code).
# So need to ignore the cached data and perform all resource availability
# tests.
optionUseClutchState = False
# stickiness is not needed during development and is bad in current build from scratch state.
#args.ignoreState != True

# Should we log external requests?
optionExternal = args.external

# Utility functions ----


def logexternal(string):
    """Echo an external-resource access (svn, http, ...) to stdout, but only
    when the -x/--external command line option was given."""
    if not optionExternal:
        return
    print("External: " + string)


def getUrl(url, encoding=None, errors=None):
    """Open *url* and return the response object.

    When *encoding* is given, the raw byte stream is wrapped in a text
    wrapper decoding with that encoding (and the given *errors* policy).
    A 5 second timeout keeps invalid URLs from stalling the whole run.
    """
    logexternal(url)
    # ensure invalid URLs don't cause long wait
    resp = urllib.request.urlopen(url, timeout=5)
    if not encoding:
        return resp
    return io.TextIOWrapper(resp, encoding=encoding, errors=errors)


def osExec(cmd):
    """Run external command *cmd* (a list of argv strings) and return its
    captured stdout as bytes.  Blocks until the command completes.

    Note: parameter renamed from 'list', which shadowed the builtin.
    All call sites pass the argument positionally.
    """
    logexternal(" ".join(cmd))
    return subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]


def osPopen(cmd):
    """Start external command *cmd* (a list of argv strings) and return the
    Popen object; stdout is a text-mode pipe the caller can iterate.

    Note: parameter renamed from 'list', which shadowed the builtin.
    All call sites pass the argument positionally.
    """
    logexternal(" ".join(cmd))
    return subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)

def getText(nodelist):
    """Concatenate and return the data of all TEXT_NODE entries in *nodelist*.

    Non-text nodes (elements, comments) are skipped, not recursed into.
    Based on http://www.python.org/doc/2.5.2/lib/minidom-example.txt
    """
    return "".join(node.data for node in nodelist
                   if node.nodeType == node.TEXT_NODE)

def normaliseSVNurl(url):
    """Normalise an SVN URL for comparison: force the http:// scheme and
    guarantee a single trailing slash.

    BUGFIX: the original tested rc[-1], which raises IndexError on an
    empty string; endswith() is safe for any input.
    """
    rc = url.replace('https://', 'http://')
    if not rc.endswith('/'):
        rc = rc + '/'
    return rc

def checkStatus(k, projectList, status):
    """Check the status file of a graduated/retired podling *k*.

    Warns when the status file is missing, and prints the boilerplate
    <span> line to add when the file lacks one recording the transition.
    *projectList* maps resource name to the podling's metadata dict and
    *status* is 'graduated' or 'retired'.
    """
    statusFile = SITE_CONTENT_DIR + "projects/{0}.xml".format(k)
    entry = projectList[k]
    if not os.path.exists(statusFile):
        print("WARN: Cannot find {0}".format(statusFile))
        return
    try:
        dom = xml.dom.minidom.parse(statusFile)
        spans = dom.getElementsByTagName("span")
        if len(spans) < 1:
            print("INFO: Missing from status file: " + statusFile)
            print("      <p><span class='{2}'>The {0} project {2} on {1}</span></p>".format(entry['name'], entry['enddate'], status))
    except Exception as err:
        print("ERROR: Exception processing " + statusFile + " : " + str(err))
        raise

projects = {}  # internal data, derived from podlings.xml
otherIssues = []  # human-readable error strings gathered for the report
persist = {}  # persistent data to be utilised by other tools
mentorsProjects = {}  # internal data: mentor username -> list of project names
mentorsName = {}  # internal data: mentor username -> display name

# Reference timestamps for this run, all in UTC.
gatherDate = datetime.datetime.utcnow()
gatherDateString = datetime.datetime.utcnow().ctime()
gatherYear = gatherDate.year
# Cut-off dates used to tally how recently each status file was updated.
# 2 months
delta = datetime.timedelta(days=61)
statusTallyDate1 = gatherDate - delta
# 4 months
delta = datetime.timedelta(days=122)
statusTallyDate2 = gatherDate - delta
# 9 months
delta = datetime.timedelta(days=273)
statusTallyDate3 = gatherDate - delta

# Regular expressions ---
# These expressions are used often, so compile them early.
# Loose Y-M(-D) date, tolerating a missing day and leading zeros.
startDateRE = re.compile("([0-9]+)-0?([0-9]+)-?0?([0-9]+)?")
# Strict Y-M-D date at the start of an 'svn log' entry line.
statusLogRE = re.compile("^([0-9]+)-0?([0-9]+)-0?([0-9]+)")
# SVN revisions to ignore when counting status file updates
# (bulk/whitespace commits that touched every status file).
svnRevisionSkipRE = re.compile(
    "707389|708087|708420|708791|709356|709648|711153|744365|761864|788239|796085|804825|894972|940767|959869|1065888|1153764|1159079|1373730|1479744|1494479|1515212|1855460")
# list@project.apache.org -> (list, project)
mailListRE = re.compile("^([-a-z0-9]+)@([a-z]+)\.apache\.org")
# project-list mail list name -> (project, list)
mailListNameRE = re.compile("^([a-z]+)-([-a-z0-9]+)")
# same split, taken from a trailing URL path component
mailListNameUrlRE = re.compile("/([a-z]+)-([-a-z0-9]+)/$")
urlHttpRE = re.compile("^http")
# podling directory name in a dist-mirror CGI path
distMirrorRE = re.compile("cgi/incubator/([-a-z0-9]+)/")
# 'New committer(s) (N)' phrases in status-page News items; group(1) is the
# optional count.
newCommitterRE = re.compile("[nN]ew [cC]omm?itt?ers? ?\(?([0-9]+)?")

# Import the persistent data ---
# This enables us to skip detection of website etc. if already detected.
#inputFile = open('clutch2.pkl', 'rb')
#state = pickle.load(inputFile)
#inputFile.close()

print("START:  Incubator Podlings: {0}{1}".format(SITE_CONTENT_DIR,"podlings.xml"))
print("START:      See https://whimsy.apache.org/incubator/podlings/by-age")
print("START:  Whimsy Project LDAP: {0}".format(PROJECT_LDAP))
print("START:      See https://whimsy.apache.org/public/")
print("START:  Whimsy Podling Website Scan: {0}".format(PODLING_SITE))
print("START:      See https://whimsy.apache.org/pods/")
print("START:  Gitbox Repository Directory: {0}".format(GITBOX_DIR))
print("START:      See https://gitbox.apache.org/repos/asf")

# read from directory of Apache Gitbox repositories
s = urllib.request.urlopen(GITBOX_DIR).read().decode('utf-8')
gitboxRepos = json.loads(s)
gitbox = {}
# Sanity check: the full ASF directory has far more than 100 projects, so a
# short answer means the service returned an error page or partial data.
if not gitboxRepos or len(gitboxRepos["projects"]) < 100:
    print("ABORT:  Gitbox Repository Directory is not available - {0}".format(GITBOX_DIR))
    sys.exit(1)
gitbox["incubator"] = gitboxRepos["projects"]["incubator"]["repositories"]

# read from project ldap
s = urllib.request.urlopen(PROJECT_LDAP).read().decode('utf-8')
projectLDAP = json.loads(s)["projects"]
if len(projectLDAP) < 100:
    print("ABORT:  Whimsy project LDAP is not available - {0}".format(PROJECT_LDAP))
    sys.exit(2)

# read from podling site scan
s = urllib.request.urlopen(PODLING_SITE).read().decode('utf-8')
podlingSiteScan = json.loads(s)
# BUGFIX: this previously re-tested len(projectLDAP), so a broken podling
# site scan was never detected.  The scan only covers current podlings (a
# few dozen), so require it to be non-empty rather than >= 100 entries.
if not podlingSiteScan:
    print("ABORT:  Whimsy podling website scan is not available - {0}".format(PODLING_SITE))
    sys.exit(3)

# Parse the podlings data file ---
# BUGFIX: minidom.parse() raises on malformed XML instead of returning a
# falsy value, so the old 'if not dom' test could never trigger; catch the
# exception to make the intended abort path reachable.
try:
    dom = xml.dom.minidom.parse(SITE_CONTENT_DIR + "podlings.xml")
except Exception:
    print("ABORT:  Incubator content/podlings.xml is not parsable - {0}{1}".format(SITE_CONTENT_DIR,"podlings.xml"))
    sys.exit(4)

graduatedProjects = {}      # resource -> {'name', 'startdate', 'enddate'}
graduatingOrRetiring = []   # ids of podlings flagged <graduating/> or <retiring/>
retiredProjects = {}        # resource -> {'name', 'startdate', 'enddate'}

print("Gather data from podlings.xml ...")
for row in dom.getElementsByTagName("podling"):
    name = row.getAttribute("name").strip()
    id = name.lower()
    id = id.replace(' ', '')  # strip spaces from project ID
    startDate = row.getAttribute("startdate")
    endDate = row.getAttribute("enddate")
    resource = row.getAttribute("resource")

    if row.getAttribute("status") == 'graduated':
        graduatedProjects[resource.lower()] = {'name': name, 'startdate': startDate, 'enddate': endDate}

    if row.getAttribute("status") == 'retired':
        retiredProjects[resource.lower()] = {'name': name, 'startdate': startDate, 'enddate': endDate}

    if row.getAttribute("status") == 'current':
        if optionVerbose:
            print("Name: {0} - {1}".format(name, resource.lower()))

        if id in projects:
            print("ERROR:  {0}: row exists".format(id))
            errorMsg = ["00","Duplicate entry in podlings.xml"]
            projects[id]['zFixMeList'].append(errorMsg)
        else:
            projects[id] = {}
            # strip spaces from project name (as per original ReportingSchedule)
            # TODO is this still needed? Or should the @name attribute not
            # contain spaces?
            projects[id]['fullName'] = name
            projects[id]['name'] = name.replace(' ', '')
            projects[id]['zFixMeList'] = []
            projects[id]['resource'] = resource
            # Set some defaults
            needMetadata = False
            projects[id]['reportingMonthly'] = False
            projects[id]['reportingComments'] = ""
            projects[id]['hasReportingGroup'] = True
            # currently needed for reporting phase
            projects[id]['reportingGroup'] = 'month'
            projects[id]['hasStatusEntry'] = True
            projects[id]['statusFileName'] = id
            projects[id]['statusLastUpdated'] = ""
            projects[id]['statusAge'] = 0
            projects[id]['statusUpdateCounts'] = ""
            projects[id]['urlSvn'] = ""
            projects[id]['urlGit'] = ""
            projects[id]['repositories'] = ""
            # Record the podling's gitbox repositories, sorted by repo name.
            try:
                if gitboxRepos["projects"][resource]:
                    d = gitboxRepos["projects"][resource]["repositories"]
                    projects[id]['repositories'] = sorted(d.items(), key=lambda x: x[0])
            except (Exception) as e:
                print("Exception Gitbox repositories " + resource + " : " + str(e))
            projects[id]['urlTracker'] = ""
            projects[id]['urlWiki'] = ""
            projects[id]['urlWww'] = ""
            projects[id]['urlDistSVN'] = ""
            projects[id]['urlDist'] = ""
            projects[id]['urlKeys'] = ""
            projects[id]['releases'] = []
            projects[id]['distributions'] = []
            projects[id]['distribHash'] = []
            projects[id]['hasEntryIssues'] = False
            projects[id]['resourceNames'] = [id]
            # Some projects use an alternate short resource name
            # rather than their project name
            alias = row.getAttribute("resource")
            if (alias != '' and alias != id):
                projects[id]['resourceNames'].append(alias)
            for alias in row.getAttribute("resourceAliases").split(','):
                if alias != '':
                    projects[id]['resourceNames'].append(alias)
            projects[id]['entryDate'] = None
            projects[id]['committersSvn'] = None
            projects[id]['news'] = []
            projects[id]['hintMailListDev'] = ""
            projects[id]['hasMailListDev'] = ""
            projects[id]['hintMailListCommits'] = ""
            projects[id]['hasMailListCommits'] = ""
            projects[id]['numberCommittersNew'] = 0
            # Committer/PPMC counts come from the whimsy LDAP dump.
            try:
                projects[id]['numberCommitters'] = len(projectLDAP[resource]["members"])
                projects[id]['numberPMCMembers'] = len(projectLDAP[resource]["owners"])
            except (Exception) as e:
                print("ERROR: Exception project LDAP " + resource + " : " + str(e))
                projects[id]['zFixMeList'].append(["01","No project LDAP"])
                projects[id]['numberCommitters'] = 0
                projects[id]['numberPMCMembers'] = 0
            # Flag podlings whose PPMC has not grown beyond the committer list.
            projects[id]['numberPMCEquals'] = (
                projects[id]['numberCommitters'] == projects[id]['numberPMCMembers'])

            descElements = row.getElementsByTagName("description")
            projects[id]['description'] = getText(descElements[0].childNodes)
            if 'FIXME' in projects[id]['description']:
                needMetadata = True
            projects[id]['sponsor'] = row.getAttribute("sponsor")
            projects[id]['startDate'] = startDate
            projects[id]['statusFileName'] = row.getAttribute("resource")

            # Mentors are stored as (displayName, username) tuples.
            mentors = [(mentor.firstChild.data.strip(), mentor.getAttribute("username"))
                       for mentor in row.getElementsByTagName("mentor")]
            projects[id]['mentors'] = mentors
            # The podling template leaves 'FIXME' placeholders in mentor names.
            if 'FIXME' in [mentorName for mentorName, _ in mentors]:
                needMetadata = True
            if needMetadata:
                errorMsg = "{0}: Need to add incubation metadata.".format(id)
                print('ERROR:  ', errorMsg)
                errorMsg += " Please maintain your records in the content/podlings.xml file. See link:#h-hasstatusfile[help]"
                otherIssues.append(errorMsg)
                errorMsg = ["02","Podling metadata missing in content/podlings.xml file. This is ground truth."]
                projects[id]['zFixMeList'].append(errorMsg)
            # accumulate projects for each mentor
            for mentorName, mentorUsername in mentors:
                mentorsProjects.setdefault(mentorUsername, []).append(name)
                mentorsName[mentorUsername] = mentorName

            isGraduating = row.getElementsByTagName("graduating").length > 0
            if isGraduating:
                graduatingOrRetiring.append(id)
                # BUGFIX: XML attribute names are case-sensitive and the file
                # uses 'enddate', so getAttribute("endDate") always returned
                # '' and this warning fired even when the date was recorded.
                if not endDate:
                    errorMsg = "{0}: Has graduated, but still needs to follow the graduation steps.".format(id)
                    print('ERROR: ', errorMsg)
                    errorMsg += " See link:#h-graduate[help]."
                    otherIssues.append(errorMsg)
                    errorMsg = ["03","Graduated, but some graduation steps are incomplete"]
                    projects[id]['zFixMeList'].append(errorMsg)

            isRetiring = row.getElementsByTagName("retiring").length > 0
            if isRetiring:
                graduatingOrRetiring.append(id)
                # BUGFIX: as above, test the real 'enddate' attribute value.
                if not endDate:
                    errorMsg = "{0}: Has retired, but still needs to follow the retirement steps.".format(id)
                    print('ERROR: ', errorMsg)
                    errorMsg += " See link:#h-retire[help]."
                    otherIssues.append(errorMsg)
                    errorMsg = ["04","Retired, but some retirement steps are incomplete"]
                    projects[id]['zFixMeList'].append(errorMsg)

            # Is it reporting monthly?
            reporting = row.getElementsByTagName("reporting")
            if reporting.length != 1:
                projects[id]['hasReportingGroup'] = False
                if not isGraduating:
                    errorMsg = "SEVERE: {0}: expecting a singleton report group".format(id)
                    print(errorMsg)
                    otherIssues.append(errorMsg)
                    errorMsg = ["05","Reporting group is either missing or a multiple"]
                    projects[id]['zFixMeList'].append(errorMsg)
            else:
                if reporting[0].getAttribute("monthly").lower() == 'true':
                    projects[id]['reportingMonthly'] = True
                    projects[id]['reportingComments'] = getText(reporting)
                    projects[id]['hasEntryIssues'] = True
                group = reporting[0].getAttribute("group")
                # BUGFIX: getAttribute() returns '' (never None) for a missing
                # attribute, so the old 'group == None' check was unreachable.
                if not group:
                    projects[id]['hasReportingGroup'] = False
                    errorMsg = "SEVERE: {0}: missing group attribute".format(id)
                    print(errorMsg)
                    otherIssues.append(errorMsg)
                    errorMsg = ["06","Reporting group is missing"]
                    projects[id]['zFixMeList'].append(errorMsg)
                else:
                    projects[id]['reportingGroup'] = 'group-' + group

dom.unlink()

# Verify that podlings which left incubation record the transition in
# their status file.
for statusLabel, podlingMap in (('graduated', graduatedProjects),
                                ('retired', retiredProjects)):
    for key in sorted(podlingMap):
        checkStatus(key, podlingMap, statusLabel)

# Process the incubation table data, detect some potential issues. ---

print("Gather details from project status files ...")


def _mailListHint(value):
    """Normalise the text of a status-page mail cell (typically
    'list  at  podling.apache.org  Subscribe  Unsubscribe  Archive') and
    return a mod_mbox style hint 'podling-list', or '' when no
    ...@...apache.org address can be recognised."""
    value = value.replace('  at  ', '@')
    value = value.replace('  Subscribe  Unsubscribe', '')
    value = value.replace('  Archive', '')
    value = value.replace(' ', '@', 1)
    value = value.replace(' ', '')
    value = value.replace('@@', '@')
    matchMail = re.search(mailListRE, value)
    if matchMail:
        return "{0}-{1}".format(matchMail.group(2), matchMail.group(1))
    return ""


projectNames = list(projects.keys())
for k in sorted(projectNames, key=str.lower):
    if optionVerbose:
        print("DEBUG: Processing status file for {0}".format(k))

    # Append more potential alternate names for a project
    if projects[k]['statusFileName'] not in projects[k]['resourceNames']:
        projects[k]['resourceNames'].append(projects[k]['statusFileName'])
    if optionVerbose and len(projects[k]['resourceNames']) > 1:
        print("DEBUG: Will try alternate names: {0}".format(
            projects[k]['resourceNames']))

    # parse their project status file to extract specific information
    statusFile = SITE_CONTENT_DIR + \
        "projects/{0}.xml".format(projects[k]['statusFileName'])
    if os.path.exists(statusFile):
        try:
            dom = xml.dom.minidom.parse(statusFile)
        except (Exception) as e:
            print("ERROR: Exception processing " + statusFile + " : " + str(e))
            raise
        # get the project info hints
        if optionVerbose:
            print("DEBUG: Gather hints from project Status page")
            # BUGFIX: these DOM dumps previously ran unconditionally for
            # every podling; keep them, but only in verbose mode.
            print(dom.nodeName)
            for node in dom.childNodes:
                for elem in node.childNodes:
                    print(elem)
        # The first table of the status page holds the project info rows;
        # the third cell of each row carries an id attribute naming the hint.
        table = dom.getElementsByTagName("table")[0]
        for row in table.getElementsByTagName("tr")[1:]:
            if (len(row.getElementsByTagName("td")) < 3):
                continue
            cell = row.getElementsByTagName("td")[2]
            if 'id' in cell.attributes:
                values = [getText(item.childNodes) for item in cell.childNodes]
                value = " ".join(values).strip()
                if value == "":
                    value = getText(cell.childNodes).strip()
                if optionVerbose:
                    print("DEBUG: Hint: {0}={1}".format(
                        cell.getAttribute('id'), value))
                if cell.getAttribute('id') == "mail-dev":
                    hint = _mailListHint(value)
                    if hint:
                        projects[k]['hintMailListDev'] = hint
                    continue
                if cell.getAttribute('id') == "mail-commits":
                    hint = _mailListHint(value)
                    if hint:
                        projects[k]['hintMailListCommits'] = hint
                    continue
                # Get hints for various url-based resources; when the cell
                # text is not itself a URL, fall back to its first link href.
                matchUrl = re.search(urlHttpRE, value)
                if not matchUrl:
                    for item in cell.getElementsByTagName('a'):
                        if 'href' in item.attributes:
                            value = item.getAttribute('href')
                            break
                hasUrl = re.search(urlHttpRE, value)
                if cell.getAttribute('id') == "svn" and hasUrl:
                    if value.endswith(".git"):
                        # a git URL placed in the svn cell is not an svn repo
                        value = ""
                    elif value.endswith('/'):
                        value = value[0:-1]
                    projects[k]['urlSvn'] = value
                    continue
                if cell.getAttribute('id') == "git" and hasUrl:
                    projects[k]['urlGit'] = value
                    continue
                if cell.getAttribute('id') == "tracker" and hasUrl:
                    projects[k]['urlTracker'] = value
                    if optionVerbose:
                        print("{0}: urlTracker={1}".format(k, value))
                    continue
                if cell.getAttribute('id') == "www" and hasUrl:
                    # a bare scheme or similarly truncated value is useless
                    if len(value) < 10:
                        value = ""
                    projects[k]['urlWww'] = value
                    if optionVerbose:
                        print("{0}: urlWww={1}".format(k, value))
                    continue
                if cell.getAttribute('id') == "wiki" and hasUrl:
                    projects[k]['urlWiki'] = value
                    if optionVerbose:
                        print("{0}: urlWiki={1}".format(k, value))
                    continue
        # Scan the project News section and count new commiters.
        for section in dom.getElementsByTagName("section"):
            if 'id' in section.attributes and section.getAttribute('id') == "News":
                for line in section.toxml().splitlines():
                    if '<!--' in line:
                        continue
                    if not '<li>' in line:
                        continue
                    ll = line.split('<li>')
                    ll = ll[1].split('</li>')
                    projects[k]['news'].append(ll[0])
                    matchNewCommitter = re.search(newCommitterRE, line)
                    if matchNewCommitter:
                        # 'New committers (3)' adds 3; a bare mention adds 1.
                        if matchNewCommitter.group(1):
                            projects[k]['numberCommittersNew'] += int(matchNewCommitter.group(1))
                        else:
                            projects[k]['numberCommittersNew'] += 1
        dom.unlink()
    # end of if status file exists

# end of processing incubation table data

# Gather incubator group mail list data ---

print("Gather incubator group mail list data ...")


class IncubatorMailListNamesParser(HTMLParser):
    """Extract Incubator mailing-list names from the mail-list index page.

    After feed()/close():
    - 'names' holds every Incubator list name found in anchor hrefs, with
      the 'incubator-' prefix and any slashes stripped;
    - 'newStyle' holds project names taken from <option> values that use
      the '<project>.incubator' style, with the suffix stripped.
    """

    def __init__(self):
        # Let HTMLParser initialise itself (it calls reset() and records
        # convert_charrefs) instead of poking at its internals directly.
        super().__init__(convert_charrefs=False)
        self.strict = True  # retained for compatibility; ignored by modern HTMLParser
        self.names = []
        self.newStyle = []

    def handle_starttag(self, tag, attrs):
        # Get the newStyle projects: <option value="foo.incubator">
        if tag == "option":
            for key, value in attrs:
                if (key == "value" and ".incubator" in value):
                    value = value.replace('.incubator', '')
                    self.newStyle.append(value)

        # Get all Incubator lists: <a href="incubator-NAME/">
        if tag == "a":
            for key, value in attrs:
                if (key == "href" and "incubator" in value):
                    value = value.replace('incubator-', '')
                    value = value.replace('/', '')
                    self.names.append(value)
                    break

# Fetch the mail-list index page and parse out Incubator list names.
mailLists = IncubatorMailListNamesParser()
mailLists.feed(getUrl(MAIL_LIST_URL).read().decode('utf-8'))
mailLists.close()
if optionVerbose:
    pprint.pprint(mailLists.names)
    pprint.pprint(mailLists.newStyle)

# Build projectMailLists: podling-name -> {list-type: full-list-name}.
projectMailLists = {}
# "podling-type" splits on the LAST hyphen: group(1)=podling, group(2)=type.
mailListNamesRE = re.compile("(.*)-([^-]+)")
mailListNamesUsualRE = re.compile(
    "announce|commits|cvs|dev|issues|notifications|reviews|user|users|spec")
for listName in mailLists.names:
    # Skip the incubator-wide lists; they are not podling lists.
    if listName in ["announce", "cvs", "general", "projects", "dev", "commits", "user"]:
        continue
    if optionVerbose:
        print("DEBUG: listName=" + listName)
    if ('-' in listName):
        matchList = re.search(mailListNamesRE, listName)
        podlingName = matchList.group(1)
        listType = matchList.group(2)
        listName = listName.replace('/', '')
        # setdefault avoids the double-lookup try/except KeyError dance.
        projectMailLists.setdefault(podlingName, {})[listType] = listName
        if optionVerbose:
            print("DEBUG: Found list: {0} {1}".format(
                podlingName, listType))
            if (podlingName not in mailLists.newStyle):
                print("DEBUG: Uses oldStyle list set-up")
        # FIXME: We assume that mail lists are always named like this
        # with "-dev" or "-commits" etc.
        matchListUsual = re.search(mailListNamesUsualRE, listType)
        if optionVerbose and not matchListUsual:
            errorMsg = "WARN: Unusual mail list name '{0}'".format(listName)
            print(errorMsg)
            otherIssues.append(errorMsg)
    else:
        # No hyphen at all: record it as the podling's dev list and warn.
        listName = listName.replace('/', '')
        podlingLists = projectMailLists.setdefault(listName, {})
        podlingLists['dev'] = listName
        errorMsg = "WARN: {0}: unusual mail list name '{1}', assuming it is their dev list".format(
            listName, podlingLists['dev'])
        print(errorMsg)
        otherIssues.append(errorMsg)
#        projects[projectMailLists[listName]]['zFixMeList'].append(errorMsg)

if optionVerbose:
    print("DEBUG: projectMailLists")
    pprint.pprint(projectMailLists)

# Gather incubator PGP keys data ---

print("Gather incubator PGP keys data and releases ...")

keysList = {}         # podling -> URL of its KEYS file on the dist area
releases = {}         # podlings with a release named with incubating/incubator
releasesBadName = {}  # podlings with a release missing that naming marker
releasesListing = {}  # dist path -> {'user','revision','dtm','size'} from 'svn ls -Rv'
distributions = {}    # podling -> list of paths of signed (.asc/.sig) artifacts
distribHash = {}      # podling -> list of checksum-file paths for those artifacts
distareas = {}  # podlings with dist areas

# Walk the recursive listing of the Incubator release dist area and record,
# per podling: KEYS files, signed distributions, and checksum files.
with osPopen(['svn', 'ls', '-Rv', 'https://dist.apache.org/repos/dist/release/incubator']) as s:
    for line in s.stdout:
        line = line.strip()
        if line[-1:] == '/':
            # skip directories
            continue
        listing = line.split(' ')
        revision = "r{0}".format(listing[0])
        user = listing[1]
        # 'svn ls -v' prints a time instead of the year for recent entries,
        # which leaves an empty field at listing[-6]; older entries carry
        # "Mon DD YYYY" directly.
        if listing[-6] == '':
            dtm1 = datetime.datetime.strptime(" ".join(listing[-4:-2]) + " " + str(gatherYear), "%b %d %Y")
            if dtm1 > gatherDate:
                # A month/day in the future must belong to the previous year.
                dtm1 = datetime.datetime.strptime(" ".join(listing[-4:-2]) + " " + str(gatherYear - 1), "%b %d %Y")
            fsize = listing[-5]
        else:
            dtm1 = datetime.datetime.strptime(" ".join(listing[-5:-1]), "%b %d %Y")
            fsize = listing[-6]
        dtm = dtm1.strftime("%m/%d/%Y")
        line = listing[-1]
        releasesListing[line] = {
            'user': user,
            'revision': revision,
            'dtm': dtm,
            'size': fsize
            }
        fields = line.split('/')
        podling = fields[0]
        distareas[podling] = True
        file = fields[-1]
        if file:
            # Raw strings keep the regex escapes valid (no DeprecationWarning).
            if re.search(r'KEYS(\.txt)?$', file):
                keysList[podling] = "{0}/{1}".format("https://downloads.apache.org/incubator", line)
            if re.search(r'\.(asc|sig)$', file, flags=re.IGNORECASE):
                path = "/".join(fields[1:])
                if optionVerbose:
                    print("DEBUG: {0} - {1}".format(podling, path))
                # setdefault replaces the old bare try/except and also appends
                # correctly even if the list were empty.
                distributions.setdefault(podling, []).append(path)
                if re.search('incubat(ing|or)', file, flags=re.IGNORECASE):
                    releases[podling] = True
                else:
                    releasesBadName[podling] = True
                    if podling in projects and podling != "netbeans":
                        # netbeans has over 600 release parts and had legacy naming requirements
                        errorMsg = "WARN: {0}: Release is missing incubator/incubating in file name {1}".format(podling,file)
                        print(errorMsg)
                        errorMsg = ["42","Distribution is missing incubator/incubating in file name {0}".format(file)]
                        projects[podling]['zFixMeList'].append(errorMsg)
            if re.search(r'\.(sha512|sha1|sha256|sha|md5)$', file, flags=re.IGNORECASE):
                path = "/".join(fields[1:])
                if optionVerbose:
                    print("DEBUG: {0} - {1}".format(podling,path))
                # Some projects have hashed the detached signature itself
                # (e.g. foo.tar.gz.asc.sha512). Identify and ignore those.
                if ".".join(path.split('.')[-2:-1]) == "asc":
                    continue
                hashes = distribHash.setdefault(podling, [])
                if hashes:
                    # sha512 is preferred and is provided last, so a later hash
                    # for the same artifact replaces the earlier one.
                    previous = ".".join(hashes[-1].split('.')[:-1])
                    current = ".".join(path.split('.')[:-1])
                    if previous == current:
                        hashes[-1] = path
                    else:
                        hashes.append(path)
                else:
                    hashes.append(path)

# Flag podlings that left incubation but still have artifacts on the mirrors.
for podling in releases:
    if podling in projects:
        continue
    if podling in graduatedProjects:
        msg = "{0}: Has graduated, but still has remains on Incubator distribution mirrors".format(
            podling)
        print("ERROR: ", msg)
        otherIssues.append(msg + ". See link:#h-graduate[help].")
        continue
    if podling in retiredProjects:
        msg = "{0}: retired project has remains on Incubator mirrors".format(podling)
        print("WARN: ", msg)
        otherIssues.append(msg + ". See link:#h-retire[help].")

# Flag distributions whose filenames lack the required incubating marker.
for podling in releasesBadName:
    msg = '{0}: Has a distribution filename missing the word "incubating/incubator"'.format(podling)
    print('WARN:', msg)
    otherIssues.append(msg + ". See link:#h-hasrelease[help].")

# Processing the gathered data ---

print("Processing ...")
# Process the reporting schedule data, correlate and ensure each exists in the
# incubation projects summary table, add more details to the data store.
projectNames = list(projects.keys())
for k in sorted(projectNames, key=str.lower):
    print(k)

    # Every podling must have a status page under SITE_CONTENT/projects/.
    statusFile = SITE_CONTENT_DIR + \
        "projects/{0}.xml".format(projects[k]['statusFileName'])
    if not os.path.exists(statusFile):
        errorMsg = "{0}: Missing status file".format(k)
        print('ERROR:  ', errorMsg)
        errorMsg += ". See link:#h-hasstatusfile[help]."
        otherIssues.append(errorMsg)
        projects[k]['hasStatusEntry'] = False
        errorMsg = ["07","Status file is missing"]
        projects[k]['zFixMeList'].append(errorMsg)
        continue

    # Parse the incubation start date (year, month, optional day).
    startDate = projects[k]['startDate']
    match = re.search(startDateRE, startDate)
    if match:
        if match.group(3) != None:
            entryDateDay = int(match.group(3))
        else:
            # Day missing: assume the first of the month.
            entryDateDay = 1
        try:
            entryDate = datetime.datetime(
                int(match.group(1)), int(match.group(2)), entryDateDay)
        except ValueError:
            errorMsg = "ERROR:  {0}: ValueError with date".format(k)
            print(errorMsg)
            errorMsg = ["08","Start date is not a valid date"]
            projects[k]['zFixMeList'].append(errorMsg)
        else:
            projects[k]['entryDate'] = entryDate

    # Gather recent updates to their status page.
    inputFile = SITE_CONTENT_DIR + \
        "projects/{0}.xml".format(projects[k]['statusFileName'])
    if optionVerbose:
        print("DEBUG: Parsing svn log for {0} ...".format(inputFile))
    outputString = osExec(['svn', 'log', '--xml', inputFile])
    dom = xml.dom.minidom.parseString(outputString)
    rowCounter = 0
    count1 = 0  # commits since statusTallyDate1
    count2 = 0  # commits since statusTallyDate2
    count3 = 0  # commits since statusTallyDate3
    # NOTE(review): if the start date failed to parse or was invalid above,
    # 'entryDate' here still holds the value from a previous loop iteration
    # (or is undefined on the very first one) — confirm this is intended.
    lastDate = entryDate
    for row in dom.getElementsByTagName("logentry"):
        # Skip counting various commits which were to standardise the status
        # files.
        matchSvnSkip = re.search(
            svnRevisionSkipRE, row.getAttribute('revision'))
        if matchSvnSkip:
            continue
        rowCounter += 1
        date = getText(row.getElementsByTagName("date")[0].childNodes)
        matchSvn = re.search(statusLogRE, date)
        thisDate = datetime.datetime(
            int(matchSvn.group(1)), int(matchSvn.group(2)), int(matchSvn.group(3)))
        if thisDate > lastDate:
            lastDate = thisDate
        if rowCounter == 1:
            # 'svn log' lists newest entries first, so the first counted row
            # is the most recent status-page edit.
            projects[k]['statusLastUpdated'] = "{0:4d}-{1:02d}-{2:02d}".format(
                int(matchSvn.group(1)), int(matchSvn.group(2)), int(matchSvn.group(3)))
        if thisDate >= statusTallyDate1:
            count1 += 1
        if thisDate >= statusTallyDate2:
            count2 += 1
        if thisDate >= statusTallyDate3:
            count3 += 1
    # Podlings younger than a tally window get "-" instead of a count.
    # NOTE(review): assumes projects[k]['entryDate'] exists for every podling
    # at this point — KeyError otherwise; confirm it is pre-seeded upstream.
    if projects[k]['entryDate'] >= statusTallyDate1:
        count2 = "-"
    if projects[k]['entryDate'] >= statusTallyDate2:
        count3 = "-"
    projects[k]['statusUpdateCounts'] = "{0},{1},{2}".format(
        count1, count2, count3)
    # Days since the status page was last touched (or since entry, if never).
    projects[k]['statusAge'] = (gatherDate-lastDate).days

    dom.unlink()

# end of processing

# Collect SVN directory names ---

print("Collect SVN directory names")
incubatorSvnDirs = {}  # top-level SVN incubator dirs
outputString = osExec(
    ['svn', 'ls', '--xml', 'http://svn.apache.org/repos/asf/incubator/'])
dom = xml.dom.minidom.parseString(outputString)
"""
Sample output
<lists>
  <list path="http://svn.apache.org/repos/asf/incubator">
    <entry kind="file">
    <name>REPO-ORGANISATION.txt</name>
    ...
    </entry>
    <entry kind="dir">
    <name>accumulo</name>
    ...
"""
# Record one URL per top-level directory, skipping non-podling entries.
for dirEntry in dom.getElementsByTagName("entry"):
    if dirEntry.getAttribute("kind") != 'dir':
        continue
    dirName = dirEntry.getElementsByTagName("name")[0].firstChild.data
    if dirName in ('trunk', 'public'):  # skip non-podling entries
        continue
    dirUrl = "http://svn.apache.org/repos/asf/incubator/{0}/".format(dirName)
    incubatorSvnDirs[dirUrl] = True

# An empty result means the repository query failed; nothing else is reliable.
if not incubatorSvnDirs:
    print("ABORT:  Incubator http://svn.apache.org/repos/asf/incubator is not available")
    sys.exit(5)

# Detect certain resources ---

print("Detect certain resources ...")
# For each podling, probe its issue tracker, wiki, website, distribution
# area, KEYS file, release hashes, and mailing lists; missing resources are
# printed and recorded in projects[k]['zFixMeList'].
for k in sorted(projectNames, key=str.lower):
    print(k)

    # Add the number of committers
    # Sometimes the committer SVN group name contains the sponsor TLP,
    # e.g. portals-wsrp4j
#    svnGroups = projects[k]['resourceNames'][:]
#    sponsor = projects[k]['sponsor'].lower()
#    if '?' in sponsor:
#        sponsor = "incubator"
#    if not 'incubator' in sponsor:
#        tlpSvn = "{0}-{1}".format(sponsor, k)
#        svnGroups.append(tlpSvn)
#    for svnGroup in svnGroups:
#        if optionVerbose:
#            print("DEBUG: Trying committers group '{0}'".format(svnGroup))
#        if svnGroup in committers_projects:
#            projects[k]['numberCommitters'] = len(
#                committers_projects[svnGroup])
#            projects[k]['committersSvn'] = svnGroup
#            break
#        else:
#            continue
#    if projects[k]['committersSvn'] == None and optionInfo:
#        print("INFO: {0}: Does not yet have committers accounts".format(k))

    # Detect if they have Tracker yet.
    # First, try the url from their status page
    # then, try a standard url.(BAD - removed)
    urlTrackerDefault = "" # no default
    #urlTrackerDefault = "https://issues.apache.org/jira/browse/{0}".format(
    #        projects[k]['statusFileName'].upper())
    for url in [projects[k]['urlTracker'], urlTrackerDefault]:
        if url == "":
            continue
        if optionVerbose:
            print("DEBUG: Trying Tracker URL: " + url)
        # A fetch that raises IOError means the URL is unusable; clear it.
        try:
            if url[-1] == '/':
                url = url[0:-1]
            getUrl(url)
            projects[k]['urlTracker'] = url
            break
        except IOError:
            projects[k]['urlTracker'] = ""
    if not projects[k]['urlTracker']:
        errorMsg = "INFO: {0}: Has not yet provided an Issue Tracker link".format(k)
        print(errorMsg)
        errorMsg = ["10","Has not provided an issue tracker link"]
        projects[k]['zFixMeList'].append(errorMsg)

    # Detect if they have Wiki yet.
    # First, try the url from their status page
    # then, possibly, try a standard url.
    urlWikiDefault = "" # no default
    for url in [projects[k]['urlWiki'], urlWikiDefault]:
        if url == "":
            continue
        if optionVerbose:
            print("DEBUG: Trying Wiki URL: " + url)
        try:
            if url[-1] == '/':
                url = url[0:-1]
            getUrl(url)
            projects[k]['urlWiki'] = url
            break
        except IOError:
            projects[k]['urlWiki'] = ""
    if not projects[k]['urlWiki']:
        errorMsg = "INFO: {0}: Has not yet provided a Wiki link".format(k)
        print(errorMsg)
        errorMsg = ["18","Has not provided a wiki link"]
        projects[k]['zFixMeList'].append(errorMsg)

    # Detect if they have a website yet.
    # First, try the url from their status page
    # then, try a standard url.
    urlWwwDefault = "http://{0}.incubator.apache.org/".format(
        projects[k]['statusFileName'])
    urlWwwDefault2 = "http://incubator.apache.org/{0}/".format(
        projects[k]['statusFileName'])
    # Blank out defaults that duplicate the status-page URL to avoid
    # fetching the same address twice.
    if urlWwwDefault == projects[k]['urlWww']:
        urlWwwDefault = ""
    if urlWwwDefault2 == projects[k]['urlWww']:
        urlWwwDefault2 = ""
    for url in [projects[k]['urlWww'], urlWwwDefault, urlWwwDefault2]:
        if url == "":
            continue
        try:
            if url[-1] == '/':
                url = url[0:-1]
            getUrl(url)
            projects[k]['urlWww'] = url
            break
        except IOError:
            projects[k]['urlWww'] = ""
    if not projects[k]['urlWww']:
        errorMsg = "INFO: {0}: Does not yet have a website".format(k)
        print(errorMsg)
        errorMsg = ["11","Website is missing"]
        projects[k]['zFixMeList'].append(errorMsg)
    elif "apache.org" not in projects[k]['urlWww']:
        errorMsg = "INFO: {0}: Does not yet have an Apache website. Instead {1}".format(k,projects[k]['urlWww'])
        print(errorMsg)
        errorMsg = ["09","Apache website is missing. Instead: {0}".format(projects[k]['urlWww'])]
        projects[k]['zFixMeList'].append(errorMsg)

    if projects[k]['urlWww']:
        # if the website exists then look into site scan for possible issues.
        # podlingSiteScan appears to be keyed by the podling's 'resource' name.
        j = projects[k]['resource']
        try:
            if podlingSiteScan[j]:
                if not podlingSiteScan[j]['foundation']:
                    errorMsg = "INFO: {0}: Does not yet have website foundation link".format(k)
                    print(errorMsg)
                    errorMsg = ["31","ASF Foundation link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['events']:
                    errorMsg = "INFO: {0}: Does not yet have website events link".format(k)
                    print(errorMsg)
                    errorMsg = ["32","ASF Events link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['license']:
                    errorMsg = "INFO: {0}: Does not yet have website license link".format(k)
                    print(errorMsg)
                    errorMsg = ["33","Apache License link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['thanks']:
                    errorMsg = "INFO: {0}: Does not yet have website thanks link".format(k)
                    print(errorMsg)
                    errorMsg = ["34","Apache Thanks link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['security']:
                    errorMsg = "INFO: {0}: Does not yet have website security link".format(k)
                    print(errorMsg)
                    errorMsg = ["35","Apache Security link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['sponsorship']:
                    errorMsg = "INFO: {0}: Does not yet have website sponsorship link".format(k)
                    print(errorMsg)
                    errorMsg = ["36","Apache Sponsorship link is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['trademarks']:
                    errorMsg = "INFO: {0}: Does not yet have website trademarks".format(k)
                    print(errorMsg)
                    errorMsg = ["37","Trademarks are missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                if not podlingSiteScan[j]['copyright']:
                    errorMsg = "INFO: {0}: Does not yet have website copyright".format(k)
                    print(errorMsg)
                    errorMsg = ["38","Copyright is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
                # The 'disclaimer' key may be absent from the scan data;
                # treat a missing key the same as a missing disclaimer.
                flag = False
                try:
                    if not podlingSiteScan[j]['disclaimer']:
                        flag = True
                except (Exception) as e:
                    flag = True
                if flag:
                    errorMsg = "INFO: {0}: Does not yet have website disclaimer".format(k)
                    print(errorMsg)
                    errorMsg = ["39","Disclaimer is missing from website"]
                    projects[k]['zFixMeList'].append(errorMsg)
            else:
                print('missing from site scan')
        except (Exception) as e:
            # Defensive catch-all: the site-scan data may be incomplete.
            print("ERROR: {0}: Exception processing: ".format(k) + str(e))

    # See if they have a distribution area yet.
    # Use the first resourceName that matches a dist area found earlier.
    for nameDist in projects[k]['resourceNames']:
        urlDist = "https://downloads.apache.org/incubator/{0}/".format(nameDist)
        urlMirror = "https://www.apache.org/dyn/closer.lua/incubator/{0}/".format(nameDist)
        urlDistSVN = "https://dist.apache.org/repos/dist/release/incubator/{0}".format(nameDist)
        if nameDist in distareas:
            projects[k]['urlDist'] = urlMirror
            projects[k]['urlDistSVN'] = urlDistSVN
            break
    if not projects[k]['urlDist']:
        errorMsg = "INFO: {0}: Does not yet have a distribution area".format(k)
        print(errorMsg)
        errorMsg = ["12","Release distribution area is not setup"]
        projects[k]['zFixMeList'].append(errorMsg)

    elif optionVerbose:
        print("DEBUG: dist=" + projects[k]['urlDist'])

    # Detect if they have a PGP KEYS file
    if projects[k]['urlDist']:
        match = re.search("/incubator/([^/]+)/", projects[k]['urlDist'])
        if match:
            nameDistArea = match.group(1)
            if nameDistArea in keysList:
                projects[k]['urlKeys'] = keysList[nameDistArea]
                try:
                    projects[k]['distributions'] = distributions[nameDistArea]
                except KeyError:
                    errorMsg = "INFO: {0}: Has a PGP KEYS file, but no signed distribution".format(k)
                    print(errorMsg)
                    errorMsg = ["41","PGP KEYS file available without a distribution"]
                    projects[k]['zFixMeList'].append(errorMsg)
                try:
                    projects[k]['distribHash'] = distribHash[nameDistArea]
                except KeyError:
                    dontcare = True
                try:
                    # Cross-check signed artifacts against checksum files.
                    l1 = len(projects[k]['distributions'])
                    l2 = len(projects[k]['distribHash'])
                    ld = l1-l2
                    if l1 > 8:
                        errorMsg = "WARN: {0}: Has {1} release distributions. Non-current releases should be removed".format(k,l1)
                        print(errorMsg)
                        errorMsg = ["45","{0} is a large number of release distributions. Non-current releases should be removed".format(l1)]
                        projects[k]['zFixMeList'].append(errorMsg)
                    if ld > 0:
                        errorMsg = "WARN: {0}: Has {1} signed release distributions without a hash".format(k,ld)
                        print(errorMsg)
                        errorMsg = ["43","Signed release distributions without a hash = {0}".format(ld)]
                        projects[k]['zFixMeList'].append(errorMsg)
                        pprint.pprint(projects[k]['distributions'])
                        pprint.pprint(projects[k]['distribHash'])
                    elif ld < 0:
                        ld = l2-l1
                        errorMsg = "WARN: {0}: Has {1} unsigned release distributions".format(k,ld)
                        print(errorMsg)
                        errorMsg = ["44","Unsigned release distributions = {0}".format(ld)]
                        projects[k]['zFixMeList'].append(errorMsg)
                        pprint.pprint(projects[k]['distributions'])
                        pprint.pprint(projects[k]['distribHash'])
                    else:
                        # create a dictionary of the podling releases.
                        url1 = projects[k]['urlKeys']
                        url1 = "/".join(url1.split("/")[:-1])+"/"
                        url2 = projects[k]['urlDist']
                        releasesDist = {}
                        for i in range(l1):
                            l = projects[k]['distributions'][i]
                            kk = projects[k]['distribHash'][i]
                            checksum = "".join(kk.split(".")[-1:])
                            r = ".".join(l.split(".")[:-1])
                            rr = "".join(r.split("/")[-1:])
                            folder = "/".join(r.split("/")[:-1])
                            # NOTE(review): releasesListing is keyed by the
                            # dist directory name; this assumes it equals the
                            # project key k — confirm for podlings whose dist
                            # area name differs (a mismatch raises KeyError,
                            # silently swallowed by the bare except below).
                            rMeta = releasesListing[k+"/"+r]
                            release = {
                                'folder': folder,
                                'hash': checksum, 
                                'user': rMeta['user'],
                                'revision': rMeta['revision'],
                                'dtm': rMeta['dtm'],
                                'size': rMeta['size']
                                }
                            releasesDist[rr] = release
                        projects[k]['releases'] = releasesDist
                except:
                    # NOTE(review): bare except makes this whole tally
                    # best-effort; any error above is silently ignored.
                    dontcare = True
            else:
                errorMsg = "INFO: {0}: Does not yet have a PGP KEYS file".format(k)
                print(errorMsg)
                errorMsg = ["13","Release signing PGP KEYS file is missing"]
                projects[k]['zFixMeList'].append(errorMsg)
                try:
                    if len(distributions[nameDistArea]) > 0 or len(distribHash[nameDistArea]) > 0:
                        errorMsg = "{0}: Apache Releases cannot be validated without a KEYS file. Please add required KEYS!".format(k)
                        print("SEVERE: ",errorMsg)
                        otherIssues.append(errorMsg)
                        errorMsg = ["46","Apache Releases cannot be validated without a KEYS file. Please add required KEYS!"]
                        projects[k]['zFixMeList'].append(errorMsg)
                except:
                    # No distributions/hashes recorded for this area: nothing to flag.
                    dontcare = True

    if optionVerbose:
        print("DEBUG: KEYS=" + projects[k]['urlKeys'])

    # Detect mail lists established:
    # For each alternate resourceName:
    # First, try the list names from their status page
    # then, try a standard list name under incubator.
    # To reduce network queries, if it is an incubator-hosted list then look up in
    # the list of mail-lists already gathered, otherwise it is a TLP-hosted list,
    # so try getting the archives URL.
    foundMailLists = False
    for projectName in projects[k]['resourceNames']:
        for listType in ['dev', 'commits']:
            if listType == "dev":
                mailListHintKey = "hintMailListDev"
                mailListKey = "hasMailListDev"
            else:
                mailListHintKey = "hintMailListCommits"
                mailListKey = "hasMailListCommits"
            if optionVerbose:
                print("DEBUG: Looking for mailList: " +
                      projects[k][mailListHintKey])
            matchMail = re.search(mailListNameRE, projects[k][mailListHintKey])
            if matchMail:
                mailListGroup = matchMail.group(1)
                mailListNameHint = matchMail.group(2)
            else:
                mailListGroup = "incubator"
                mailListNameHint = ""
            if optionVerbose:
                print("DEBUG: Trying mailListGroup={0} mailListNameHint={1}".format(
                    mailListGroup, mailListNameHint))
            if mailListGroup == "incubator":
                mailListNameDefault = "{0}-{1}-{2}".format(mailListGroup, projectName, listType)
                if mailListNameDefault == mailListNameHint:
                    mailListNameDefault = ""
                for listName in [mailListNameHint, mailListNameDefault]:
                    if listName == "":
                        continue
                    if optionVerbose:
                        print("DEBUG: Trying listName=" + listName)
                    if not projectName in projectMailLists:
                        if optionVerbose:
                            print("DEBUG: {0}: No incubator group mail lists using '{1}'".format(k, projectName))
                        break
                    if listType in projectMailLists[projectName]:
                        leader = 'incubator'
                        projects[k][mailListKey] = MAIL_LIST_URL + \
                            "{0}-{1}".format(projectName,listType)
                        if optionVerbose:
                            print("DEBUG: Successful Incubator mail url: " +
                                  projects[k][mailListKey])
                        foundMailLists = True
                        break
#                    else:
#                        errorMsg = "INFO: {0}: Does not yet have hinted incubator mail list '{1}-{2}'".format(k, projectName, listType)
#                        print(errorMsg)
#                        errorMsg = ["14","Expected mailing list (incubator-{0}-{1}) is missing".format(projectName, listType)]
#                        projects[k]['zFixMeList'].append(errorMsg)
#                        projects[k][mailListKey] = ""
            # End of processing incubator group mail list.
            else:
                # TLP-hosted list: verify the archive URL actually exists.
                listName = projects[k][mailListHintKey]
                url = "http://mail-archives.apache.org/mod_mbox/{0}".format(
                    listName)
                if optionVerbose:
                    print("DEBUG: Trying mail url: " + url)
                try:
                    getUrl(url)
                except IOError:
                    projects[k][mailListKey] = ""
                else:
                    projects[k][mailListKey] = url
                    if optionVerbose:
                        print("DEBUG: Successful TLP mail url: " + url)
                    foundMailLists = True
        if foundMailLists:
            break
    # End of processing project mail lists.
    if not projects[k]['hasMailListDev']:
        errorMsg = "ERROR:  {0}: Does not yet have 'dev' mail list".format(k)
        print(errorMsg)
        errorMsg = ["15","Project does not have 'dev' mailing list"]
        projects[k]['zFixMeList'].append(errorMsg)
    if not projects[k]['hasMailListCommits']:
        errorMsg = "INFO: {0}: Does not yet have 'commits' mail list".format(k)
        print(errorMsg)
        errorMsg = ["16","Project does not have 'commits' mailing list"]
        projects[k]['zFixMeList'].append(errorMsg)

# end of processing each podling to detect resource availability

# Report graduated podlings whose top-level incubator SVN directory remains.
for svnDir in sorted(incubatorSvnDirs):
    if incubatorSvnDirs[svnDir] and svnDir in graduatedProjects:
        print("INFO: graduated project has SVN directory " + svnDir)

# Output data files ---

print("Output the data ...")
# Reporting-group -> months in which those podlings report to the board.
reportingGroups = {'month': 'Monthly',
                   'group-1': 'January, April, July, October',
                   'group-2': 'February, May, August, November',
                   'group-3': 'March, June, September, December'}
monthsLong = 'January February March April May June July August September October November December'.split()
# Wiki page name/URL for the current month's board report.
nameCurrentReport = "{0}{1}".format(
    monthsLong[gatherDate.month - 1], gatherDate.year)
urlCurrentReport = "".join(
    ["http://wiki.apache.org/incubator/", nameCurrentReport])

# Use a context manager so the file is closed even if the write fails.
with open(CLUTCH_CONTENT_DIR + '_includes/clutchmy.ad', encoding='utf-8', mode='w') as fileXmlMY:
    fileXmlMY.write(
        '{0}[{1}]\n'.format(urlCurrentReport, nameCurrentReport))

# clutch.txt accumulates one line per podling; it is written to by later
# sections, so it must stay open here (closed further down the script).
fileList = open(CLUTCH_CONTENT_DIR + 'clutch.txt', 'w')

# Write the "Other Issues" include, de-duplicated on the "podling:" prefix.
with open(CLUTCH_CONTENT_DIR + '_includes/clutcho1.ad', encoding='utf-8', mode='w') as fileXmlo1:
    if len(otherIssues):
        otherXml = """==== Other Issues\n\nSometimes other issues are found which are link:#other[listed] below for:\n"""
        otherIssuesRE = re.compile("^([^:]+):.*$")
        deduplicateOthers = {}
        for issue in sorted(otherIssues, key=str.lower):
            print("issue="+issue)
            matchOtherIssues = re.search(otherIssuesRE, issue)
            # List each podling only once, no matter how many issues it has.
            prefix = matchOtherIssues.group(1)
            if prefix in deduplicateOthers:
                continue
            deduplicateOthers[prefix] = True
            otherXml += '\n* [.care]#{0}# '.format(prefix)
        otherXml += "\n"
        fileXmlo1.write(otherXml)

# Write the table-top include: gather timestamp and podling count.
# Context manager guarantees the file is closed after the write.
with open(CLUTCH_CONTENT_DIR + '_includes/clutcht.ad', encoding='utf-8', mode='w') as fileXmlt:
    tableTopXml = """
Clutch last gathered: {0} UTC.

Number of podlings in incubation: {1}
""".format(gatherDateString, len(projects))
    fileXmlt.write(tableTopXml)

# clutch.txt: CSV-style listing (identifier, name, sponsor) — header row.
fileList.write('#identifier,name,sponsor\n')
# Accumulators for the three "report due" mailing lists written later.
reportList1 = ""
reportList2 = ""
reportList3 = ""
tableRowCount = 0
# clutchr.ad: one AsciiDoc table row per podling.  This loop also fills
# the 'persist' dictionary that is pickled at the end of the script.
fileXml = open(CLUTCH_CONTENT_DIR + '_includes/clutchr.ad', encoding='utf-8', mode='w')
for k in sorted(projectNames, key=str.lower):
    tableRowCount += 1
    fileXml.write('\n|')
#    if k in graduatingOrRetiring:
#        fileXml.write('[="grad"')
    # Podling name column, linked to its detail page.
    fileXml.write('link:{0}.html[{1}]\n'.format(k,projects[k]['fullName']))
    persist[k] = {}
    persist[k]['podlingName'] = projects[k]['name']
    persist[k]['resource'] = projects[k]['resource']
    persist[k]['fullName'] = projects[k]['fullName']

    # Sponsor column: a '?' in the sponsor marks it as an issue.
    if '?' in projects[k]['sponsor']:
        fileXml.write(
            '|[.issue]#{0}#\n'.format(projects[k]['sponsor']))
    else:
        fileXml.write(
            '|{0}\n'.format(projects[k]['sponsor']))
    persist[k]['sponsor'] = projects[k]['sponsor']
    persist[k]['description'] = projects[k]['description']
    persist[k]['mentors'] = projects[k]['mentors']

    # Start date column, and days in incubation derived from it.
    fileXml.write('|{0}\n'.format(projects[k]['startDate']))
    persist[k]['startDate'] = projects[k]['startDate']
    startDate = datetime.datetime.strptime(projects[k]['startDate'],"%Y-%m-%d")
    daysIncubating = (gatherDate-startDate).days

    # elapsedDays column
    # Color-code the elapsed days: <3 months, <1 year, <18 months,
    # <2 years, and beyond.
    # NOTE(review): the '">' inside these link format strings looks like
    # leftover HTML attribute syntax; confirm the generated AsciiDoc
    # renders as intended.
    if daysIncubating < 92:
        fileXml.write('|link:/projects/{0}.html">[[.cool3]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    elif daysIncubating < 365:
        fileXml.write('|link:/projects/{0}.html">[[.cool2]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    elif daysIncubating < 552:
        fileXml.write('|link:/projects/{0}.html">[[.cool1]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    elif daysIncubating < 730:
        fileXml.write('|link:/projects/{0}.html">[[.cool3]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    else:
        fileXml.write('|link:/projects/{0}.html">[[.cool4]#{1}#]\n'.format(
            projects[k]['statusFileName'], daysIncubating))
    persist[k]['statusAge'] = projects[k]['statusAge']

    # Monthly-reporting column: monthly reporters are flagged with [.care].
    if not projects[k]['reportingMonthly']:
        fileXml.write(
            '|{0}\n'.format(projects[k]['reportingMonthly']))
    else:
        fileXml.write(
            '|[.care]#{0}#\n'.format(projects[k]['reportingMonthly']))
    persist[k]['reportingMonthly'] = projects[k]['reportingMonthly']

    # Reporting group column: write only the trailing group digit.
    fileXml.write(
        '|{0}\n'.format(projects[k]['reportingGroup'][-1]))
    # save the simple group number for programs that have their own ideas.
    persist[k]['rawReportingGroup'] = projects[k]['reportingGroup']
    persist[k]['reportingGroup'] = reportingGroups[
        projects[k]['reportingGroup']]
    # Build the "Name <address>" entry for the report-due mailing lists.
    reportDevList = '"{0} Developers"'.format(projects[k]['fullName'])
    if projects[k]['hasMailListDev']:
        # assume that if we have identified one they are all now the standard pattern.
        reportDevList += " <dev@{0}.incubator.apache.org>".format(
            projects[k]['resource'])
    else:
        reportDevList += " <general@incubator.apache.org>"
    if optionVerbose:
        print("DEBUG: {0}: reportDevList={1}".format(k, reportDevList))
    reportDevList += "\n"
    # Monthly reporters appear in all three lists; otherwise only in the
    # list matching their reporting group.
    if projects[k]['reportingMonthly']:
        reportList1 += reportDevList
        reportList2 += reportDevList
        reportList3 += reportDevList
    else:
        if (projects[k]['reportingGroup'] == "group-1"):
            reportList1 += reportDevList
        elif (projects[k]['reportingGroup'] == "group-2"):
            reportList2 += reportDevList
        elif (projects[k]['reportingGroup'] == "group-3"):
            reportList3 += reportDevList

    # Status-entry column: linked True, or flagged False.
    if projects[k]['hasStatusEntry']:
        fileXml.write('|link:/projects/{0}.html">[[.cool1]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['hasStatusEntry']))
    else:
        fileXml.write(
            '|[.issue]#{0}#\n'.format(projects[k]['hasStatusEntry']))

    fileXml.write(
        '|{0}\n'.format(projects[k]['statusLastUpdated']))
    persist[k]['statusLastUpdated'] = projects[k]['statusLastUpdated']

    # statusAge column
    # Color-code the days since the status page was last updated.
    if projects[k]['statusAge'] < 61:
        fileXml.write('|link:/projects/{0}.html">[[.cool1]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['statusAge']))
    elif projects[k]['statusAge'] < 122:
        fileXml.write('|link:/projects/{0}.html">[[.cool2]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['statusAge']))
    elif projects[k]['statusAge'] < 175:
        fileXml.write('|link:/projects/{0}.html">[[.cool3]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['statusAge']))
    else:
        fileXml.write('|link:/projects/{0}.html">[[.cool4]#{1}#]\n'.format(
            projects[k]['statusFileName'], projects[k]['statusAge']))
    persist[k]['statusAge'] = projects[k]['statusAge']

    fileXml.write(
        '|{0}\n'.format(projects[k]['statusUpdateCounts']))

    # Committers column: linked to the phonebook; fewer than 3 committers
    # is flagged with [.care].
    if projects[k]['numberCommitters'] > 0:
        if projects[k]['numberCommitters'] > 2:
            fileXml.write('|https://people.apache.org/phonebook.html?podling={0}[[.cool1]#{1}#]\n'.format(
                projects[k]['resource'], projects[k]['numberCommitters']))
        else:
            fileXml.write('|https://people.apache.org/phonebook.html?podling={0}[[.care]#{1}#]\n'.format(
                projects[k]['resource'], projects[k]['numberCommitters']))
    else:
        fileXml.write('|[.care]#-#\n')

    # New-committers column.
    if projects[k]['numberCommittersNew'] > 0:
        if projects[k]['numberCommittersNew'] > 1:
            fileXml.write(
                '|[.cool1]#{0}#\n'.format(projects[k]['numberCommittersNew']))
        else:
            fileXml.write(
                '|[.cool2]#{0}#\n'.format(projects[k]['numberCommittersNew']))
    else:
        fileXml.write('|[.care]#0#\n')

    # individual podling analysis pages will use these as opposed to old style clutch page using announced new committers (flawed...)
    resource = projects[k]['resource']
    # Podlings absent from projectLDAP get an empty ldap record.
    try:
        persist[k]['ldap'] = {
            'members': projectLDAP[resource]["members"],
            'owners': projectLDAP[resource]["owners"],
            'numberCommitters': projects[k]['numberCommitters'],
            'numberCommittersNew': projects[k]['numberCommittersNew'],
            'numberPMCMembers': projects[k]['numberPMCMembers'],
            'numberPMCEquals': projects[k]['numberPMCEquals']
            }
    except:
        persist[k]['ldap'] = {}

    # Source repository column: prefer SVN, fall back to the first gitbox
    # repository, else flag False.
    if projects[k]['urlSvn']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlSvn']))
    else:
        if projects[k]['repositories']:
            # we have the dictionary of gitbox repositories and we can extract the first git repos from the sorted list.
            # NOTE(review): this None assignment is immediately
            # overwritten on the next line — dead code.
            projects[k]['urlGit'] = None
            projects[k]['urlGit'] = 'https://gitbox.apache.org/repos/asf?p={0}.git;a=tree;hb=HEAD'.format(projects[k]['repositories'][0][0])
            fileXml.write(
                '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlGit']))
        else:
            fileXml.write('|[.care]#False#\n')

    persist[k]['repositories'] = projects[k]['repositories']
    if not projects[k]['urlGit'] and not projects[k]['urlSvn']:
        errorMsg = "INFO: {0}: Does not yet have a source code repository".format(k)
        print(errorMsg)
        errorMsg = ["17","No source code repository"]
        projects[k]['zFixMeList'].append(errorMsg)
    # NOTE(review): len() here assumes urlGit is always a string; a None
    # value would raise TypeError — confirm upstream default.
    if len(projects[k]['urlGit']) > 0 and len(projects[k]['repositories']) == 0:
        errorMsg = "INFO: {0}: Does not yet have a source code repository".format(k)
        print(errorMsg)
        errorMsg = ["17","No source code repository. Declared but nonexistent"]
        projects[k]['zFixMeList'].append(errorMsg)

    # Issue-tracker column.
    if projects[k]['urlTracker']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlTracker']))
    else:
        fileXml.write('|[.care]#False#\n')

    # Dev mailing-list column: True only when the field is a URL.
    hasUrl = re.search(urlHttpRE, projects[k]['hasMailListDev'])
    if hasUrl:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['hasMailListDev']))
    else:
        fileXml.write('|[.care]#False#\n')
    persist[k]['hasMailListDev'] = projects[k]['hasMailListDev']

    # Commits mailing-list column.
    hasUrl = re.search(urlHttpRE, projects[k]['hasMailListCommits'])
    if hasUrl:
        fileXml.write('|{0}[[.cool1]#True#]\n'.format(
            projects[k]['hasMailListCommits']))
    else:
        fileXml.write('|[.care]#False#\n')
    persist[k]['hasMailListCommits'] = projects[k]['hasMailListCommits']

    # Website column.
    if projects[k]['urlWww']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlWww']))
    else:
        fileXml.write('|[.care]#False#\n')

    # Distribution-area (dist SVN) column.
    if projects[k]['urlDistSVN']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlDistSVN']))
    else:
        fileXml.write('|[.care]#False#\n')

    # Release-signing KEYS column.
    if projects[k]['urlKeys']:
        fileXml.write(
            '|{0}[[.cool1]#True#]\n'.format(projects[k]['urlKeys']))
    else:
        fileXml.write('|[.care]#False#\n')

    persist[k]['urls'] = {
        'www': projects[k]['urlWww'],
        'keys': projects[k]['urlKeys'],
        'tracker': projects[k]['urlTracker'],
        'wiki': projects[k]['urlWiki'],
        'svn': projects[k]['urlSvn'],
        'git': projects[k]['urlGit']
        }
    persist[k]['releases'] = projects[k]['releases']
    persist[k]['zFixMeList'] = projects[k]['zFixMeList']
    persist[k]['news'] = projects[k]['news']

    # Releases column: linked to the podling page's releases section.
    if len(projects[k]['releases']) > 0:
        fileXml.write('|link:{0}.html#releases[[.cool1]#True#]\n'.format(k))
    else:
        fileXml.write('|[.care]#False#\n')

    fileXml.write('\n')

    # One CSV row per podling in clutch.txt.
    fileList.write('{0},"{1}","{2}"\n'.format(
        k, projects[k]['name'], projects[k]['sponsor']))

fileXml.close()
# End of rows

# Other issues
# clutcho2.ad: the full listing of every issue, one bullet per issue.
fileXmlo2 = open(CLUTCH_CONTENT_DIR + '_includes/clutcho2.ad', encoding='utf-8', mode='w')
if len(otherIssues):
    # Fix: a redundant in-place otherIssues.sort() preceded this loop;
    # sorted() below establishes the ordering on its own.
    for issue in sorted(otherIssues, key=str.lower):
        fileXmlo2.write("* {0}\n".format(issue))
else:
    fileXmlo2.write("* No known issues.\n")
fileXmlo2.close()

# Build a display-name -> [display name, availid, projects] map, then
# emit the mentor roster ordered by display name.
mentorsList = {}
for availid in mentorsName:
    displayName = mentorsName[availid]
    mentorsList[displayName] = [displayName, availid, mentorsProjects[availid]]
mentors = sorted(mentorsList)

fileXmlm = open(CLUTCH_CONTENT_DIR + '_includes/clutchm.ad', encoding='utf-8', mode='w')
for displayName in mentors:
    record = mentorsList[displayName]
    fileXmlm.write(". {0} ({1}): {2}\n".format(
        displayName, record[1], ', '.join(record[2])))
fileXmlm.close()

fileList.close()

# Write one "report due" recipient list per reporting group.
for groupNumber, listContent in ((1, reportList1),
                                 (2, reportList2),
                                 (3, reportList3)):
    with open(CLUTCH_CONTENT_DIR + 'report_due_{0}.txt'.format(groupNumber), 'w') as reportFile:
        reportFile.write(listContent)


# Record graduated and retired podlings with their incubation date ranges.
fileReportG = open(CLUTCH_CONTENT_DIR + 'podlings_graduated.txt', 'w')
for podling in sorted(graduatedProjects, key=str.lower):
    dates = graduatedProjects[podling]
    fileReportG.write("{0},{1},{2}\n".format(podling, dates['startdate'], dates['enddate']))
fileReportG.close()

fileReportR = open(CLUTCH_CONTENT_DIR + 'podlings_retired.txt', 'w')
for podling in sorted(retiredProjects, key=str.lower):
    dates = retiredProjects[podling]
    fileReportR.write("{0},{1},{2}\n".format(podling, dates['startdate'], dates['enddate']))
fileReportR.close()

# Create the persistent data file.
# Pickle the assembled per-podling metadata for downstream scripts.
with open('clutch2.pkl', 'wb') as outputFile:
    pickle.dump(persist, outputFile, protocol=3)

# Final summary for the operator.
print("Done. Generated clutch*.ent files.")
print("Now you need to re-build the site, as usual.")
print("FINAL: current: {0}".format(len(projects)))
print("FINAL: date: {0}".format(gatherDateString))

