# ==================================================================== # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # # Author: Erik Hatcher # # to index all man pages on $MANPATH or /usr/share/man: # python manindex.py pages # ==================================================================== import os, re, sys, lucene from subprocess import * from java.nio.file import Paths from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.index import IndexWriter, IndexWriterConfig from org.apache.lucene.document import Document, Field, StringField, TextField from org.apache.lucene.store import SimpleFSDirectory def indexDirectory(dir): for name in os.listdir(dir): path = os.path.join(dir, name) if os.path.isfile(path): indexFile(dir, name) def indexFile(dir, filename): path = os.path.join(dir, filename) print " File: ", filename if filename.endswith('.gz'): child = Popen('gunzip -c ' + path + ' | groff -t -e -E -mandoc -Tascii | col -bx', shell=True, stdout=PIPE, cwd=os.path.dirname(dir)).stdout command, section = re.search('^(.*)\.(.*)\.gz$', filename).groups() else: child = Popen('groff -t -e -E -mandoc -Tascii ' + path + ' | col -bx', shell=True, stdout=PIPE, cwd=os.path.dirname(dir)).stdout command, section = re.search('^(.*)\.(.*)$', filename).groups() data = child.read() err = child.close() if err: raise RuntimeError, '%s failed with exit code %d' %(command, err) matches = re.search('^NAME$(.*?)^\S', data, re.MULTILINE | re.DOTALL) name = matches and matches.group(1) or '' matches = re.search('^(?:SYNOPSIS|SYNOPSYS)$(.*?)^\S', data, re.MULTILINE | re.DOTALL) synopsis = matches and matches.group(1) or '' matches = re.search('^(?:DESCRIPTION|OVERVIEW)$(.*?)', data, re.MULTILINE | re.DOTALL) description = matches and matches.group(1) or '' doc = Document() doc.add(Field("command", command, StringField.TYPE_STORED)) doc.add(Field("section", section, StringField.TYPE_STORED)) doc.add(Field("name", name.strip(), TextField.TYPE_STORED)) doc.add(Field("synopsis", synopsis.strip(), TextField.TYPE_STORED)) doc.add(Field("keywords", ' '.join((command, name, synopsis, description)), TextField.TYPE_NOT_STORED)) doc.add(Field("filename", os.path.abspath(path), StringField.TYPE_STORED)) writer.addDocument(doc) if __name__ == '__main__': if len(sys.argv) != 2: print "Usage: python manindex.py " else: lucene.initVM(vmargs=['-Djava.awt.headless=true']) directory = SimpleFSDirectory(Paths.get(sys.argv[1])) analyzer = StandardAnalyzer() analyzer = LimitTokenCountAnalyzer(analyzer, 10000) config = IndexWriterConfig(analyzer) writer = IndexWriter(directory, config) manpath = os.environ.get('MANPATH', '/usr/share/man').split(os.pathsep) for dir in manpath: print "Crawling", dir for name in os.listdir(dir): path = os.path.join(dir, name) if os.path.isdir(path): indexDirectory(path) writer.commit() writer.close()