#!/usr/local/bin/python
import os
from os.path import join as pjoin
import sys
import subprocess
def get_output(cmd):
s = subprocess.Popen(cmd, stdout=subprocess.PIPE)
out = s.communicate()[0]
s.wait()
return out.strip()
# you could use os.path.walk to calculate this... or you could use du(1).
def duhack(path):
cmd = ['du', '-k', path]
out = get_output(cmd).split()
return int(out[0]) * 1024
BASEPATH=sys.argv[1]
ROOT="/x1/mail-archives/mod_mbox"
HOSTNAME="http://mail-archives.apache.org/mod_mbox/"
PARITION_SIZE=100 * 1024 * 1024
tlps={}
for files in os.listdir(ROOT):
path = files
tlp = path[0:path.find('-')]
list = path[path.find('-')+1:]
# print "%s - %s %s" % (tlp, list, path)
if not os.access("%s/%s/listinfo.db" % (ROOT, path), os.F_OK):
continue
if tlp == "www":
tlp = "asf"
if not tlps.has_key(tlp):
tlps[tlp] = {}
tlps[tlp][list] = [path, duhack(pjoin(ROOT, path))]
keys = tlps.keys()
keys.sort()
count = 0
fcount = 0
def write_sitemap_header(fp):
fp.write("""\n\n""")
def write_sitemap_footer(fp):
fp.write("\n")
fp = open(BASEPATH % (fcount), 'w')
write_sitemap_header(fp)
for tlp in keys:
klist = tlps[tlp].keys()
klist.sort()
for list in klist:
name = tlps[tlp][list][0]
size = tlps[tlp][list][1]
if size < PARITION_SIZE:
count += 1
fp.write("%s%s/?format=sitemap\n" % (HOSTNAME, name))
else:
part = (size / PARITION_SIZE) + 1
for i in range(0, part):
count += 1
fp.write("%s%s/?format=sitemap&pmax=%d&part=%d\n" % (HOSTNAME, name, part, i))
if count > 500:
write_sitemap_footer(fp)
fp.close()
count = 0
fcount += 1
fp = open(BASEPATH % (fcount), 'w')
write_sitemap_header(fp)
write_sitemap_footer(fp)