# ==================================================================== # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # # Author: Thomas Koch # # FacetExample.py - a simple Facet example for PyLucene # (based on the Java counterpart from # package org.apache.lucene.facet.example.simple) # ==================================================================== usage = """ usage: python FacetExample.py [index | simple | drilldown] where 'index' => create index for faceted search 'simple' => run simple faceted search 'drilldown' => run faceted search with drilldown """ INDEX_DIR = "FacetExample.Index" TAXONOMY_DIR = "FacetExample.Taxonomy" import os, sys, lucene from java.io import File from java.lang import System from java.text import DecimalFormat from java.util import Arrays from org.apache.lucene.util import Version from org.apache.lucene.analysis.core import WhitespaceAnalyzer from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.search import Query, TermQuery, Sort, \ TopScoreDocCollector, MultiCollector, IndexSearcher,SortField, \ TopFieldCollector, MatchAllDocsQuery, BooleanQuery, BooleanClause from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.store import FSDirectory, SimpleFSDirectory from org.apache.lucene.index import IndexWriter, IndexReader, \ IndexWriterConfig, DirectoryReader, Term from org.apache.lucene.document import Document, Field, TextField from org.apache.lucene.facet.index import FacetFields from org.apache.lucene.facet.taxonomy import CategoryPath from org.apache.lucene.facet.taxonomy.directory import \ DirectoryTaxonomyWriter, DirectoryTaxonomyReader from org.apache.lucene.facet.params import FacetIndexingParams, \ FacetSearchParams from org.apache.lucene.facet.search import FacetRequest, \ FacetResult, FacetResultNode, CountFacetRequest, \ DrillDownQuery, FacetsCollector # ----------------------------------------------------------------------------- # SimpleUtils: # Documents title field TITLE = "title" TEXT = "text"; docTexts = [ "the white car is the one I want.", "the white dog does not belong to anyone.", ] # sample documents titles (for the title field). docTitles = [ "white car", # doc nr.0 "white dog", # doc nr.1 ] # Categories: categories[D][N] == category-path no. N for document no. D. categories = [ [["root","a","f1"], ["root","a","f2"]], # doc nr.0 [["root","a","f1"], ["root","a","f3"]] # doc nr.1 ] # ----------------------------------------------------------------------------- # port of org.apache.lucene.facet.example.simple from java to python # Sample indexer creates an index, and adds to it sample documents and facets. class SimpleIndexer(object): def index (cls, indexDir, taxoDir): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer from org.apache.lucene.util import Version config = IndexWriterConfig(Version.LUCENE_42, WhitespaceAnalyzer(Version.LUCENE_42)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # FacetFields is a utility class for adding facet fields to a document: facet_fields = FacetFields(taxo) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # obtain the sample facets for current document facets = categories[docNum] facetList = [CategoryPath(f) for f in facets] # NOTE: setCategoryPaths() requires an Iterable, so need to convert the # Python list in order to to pass a proper argument to setCategoryPaths. # We use java.util.Arrays (via JCC) to create a Java List: facetList = Arrays.asList(facetList) # NOTE: we could use lucene.collections here as well in order to convert our # Python list to a Java based list using the JavaList class (JavaList implements # java.util.List around a Python list instance it wraps): # from lucene.collections import JavaList # facetList = JavaList(facetList) # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES)) doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO)) # use the FacetFields utility class for adding facet fields (i.e. the categories) # to the document (and, as required, to the taxonomy index) facet_fields.addFields(doc, facetList) # finally add the document to the index iw.addDocument(doc) nDocsAdded +=1 nFacetsAdded += facetList.size() # end for # commit changes. # we commit changes to the taxonomy index prior to committing them to the search index. # this is important, so that all facets referred to by documents in the search index # will indeed exist in the taxonomy index. taxo.commit() iw.commit() # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. taxo.close() iw.close() print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded) index = classmethod(index) # ----------------------------------------------------------------------------- # port of org.apache.lucene.facet.example.simple from java to python # SimpleSearcer searches index with facets. class SimpleSearcher(object): def searchWithFacets(cls, indexReader, taxoReader): """ Search an index with facets. returns a List """ facetRequest = CountFacetRequest(CategoryPath(["root","a"]), 10) return cls.searchWithRequest(indexReader, taxoReader, None, facetRequest) def searchWithRequest(cls, indexReader, taxoReader, indexingParams, facetRequest): """ Search an index with facets for given facet requests. returns a List """ query = TermQuery(Term(TEXT, "white")) return cls.searchWithRequestAndQuery(query, indexReader, taxoReader, indexingParams, facetRequest) def searchWithRequestAndQuery(cls, query, indexReader, taxoReader, indexingParams, facetRequest): """ Search an index with facets for given query and facet requests. returns a List """ # prepare searcher to search against searcher = IndexSearcher(indexReader) # collect matching documents into a collector topDocsCollector = TopScoreDocCollector.create(10, True) if not indexingParams: indexingParams = FacetIndexingParams.DEFAULT # Faceted search parameters indicate which facets are we interested in facetRequests = [facetRequest,] facetRequests = Arrays.asList(facetRequests) # Add the facet request of interest to the search params: facetSearchParams = FacetSearchParams(indexingParams, facetRequests) # and create a FacetsCollector to use in our facetted search: facetsCollector = FacetsCollector.create(facetSearchParams, indexReader, taxoReader) # perform documents search and facets accumulation searcher.search(query, MultiCollector.wrap([topDocsCollector, facetsCollector])) print "\nFound %d Documents for query=%s" % (topDocsCollector.totalHits, query.toString().encode('utf-8')) # Obtain facets results and print them res = facetsCollector.getFacetResults() i = 0 for facetResult in res: print "Result #%d has %d descendants" % (i, facetResult.getNumValidDescendants()) print "Result #%d : %s" % (i, facetResult) i += 1 return res def searchWithDrillDown(cls, indexReader, taxoReader, indexingParams=None): """ Search an index with facets drill-down. returns a List """ # base query the user is interested in baseQuery = TermQuery(Term(TEXT, "white")) if not indexingParams: indexingParams = FacetIndexingParams.DEFAULT # facet of interest facetRequest = CountFacetRequest(CategoryPath(["root","a"]), 10) # initial search - all docs matching the base query will contribute to the accumulation res1 = cls.searchWithRequest(indexReader, taxoReader, None, facetRequest) # a single result (because there was a single request) fres = res1.get(0) # assume the user is interested in the second sub-result # (just take the second sub-result returned by the iterator - we know there are 3 results!) subResults = fres.getFacetResultNode().subResults resultNode = FacetResultNode.cast_(subResults.get(1)) # 2nd subresult categoryOfInterest = resultNode.label # turn the base query into a drill-down query for the category of interest # first create a new DrillDownQuery over the given base query. query2 = DrillDownQuery(indexingParams, baseQuery) # next the categories of interest are added to the DrillDownQuery query2.add([categoryOfInterest,]) # that's it - search with the new query and we're done! # only documents both matching the base query AND containing the # category of interest will contribute to the new accumulation return cls.searchWithRequestAndQuery(query2, indexReader, taxoReader, indexingParams, facetRequest) searchWithFacets = classmethod(searchWithFacets) searchWithRequest = classmethod(searchWithRequest) searchWithRequestAndQuery = classmethod(searchWithRequestAndQuery) searchWithDrillDown = classmethod(searchWithDrillDown) # ----------------------------------------------------------------------------- class FacetExample(object): def __init__(self, directory): self.directory = directory # create Directories for the search index and for the taxonomy index # in RAM or on Disc #indexDir = RAMDirectory() #taxoDir = RAMDirectory() self.indexDir = FSDirectory.open(File(os.path.join(self.directory, INDEX_DIR))) self.taxoDir = FSDirectory.open(File(os.path.join(self.directory, TAXONOMY_DIR))) def createIndex(self): # index the sample documents SimpleIndexer.index(self.indexDir, self.taxoDir) def runSimple(self): # open readers taxo = DirectoryTaxonomyReader(self.taxoDir) indexReader = DirectoryReader.open(self.indexDir) # returns List facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo) # close readers taxo.close() indexReader.close() # return result return facetRes def runDrillDown(self): # open readers taxo = DirectoryTaxonomyReader(self.taxoDir) indexReader = DirectoryReader.open(self.indexDir) facetRes = SimpleSearcher.searchWithDrillDown(indexReader, taxo) # close readers taxo.close() indexReader.close() # return result return facetRes def main(cls, argv): baseDir = os.path.dirname(os.path.abspath(argv[0])) if len(argv) > 1: index = simple = drilldown = False for arg in argv[1:]: if arg == "index": index = True elif arg == "simple": simple = True elif arg == "drilldown": drilldown = True else: sys.exit(usage+"\nunknown argument: %s" % arg) else: index = simple = True drilldown = False example = FacetExample(baseDir) if index: example.createIndex() if simple: example.runSimple() if drilldown: example.runDrillDown() main = classmethod(main) if __name__ == '__main__': lucene.initVM(vmargs=['-Djava.awt.headless=true']) FacetExample.main(sys.argv)