# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================
#
# Author: Thomas Koch
#
# FacetExample.py - a simple Facet example for PyLucene
#   (originally based on the Java counterpart from
#    package org.apache.lucene.facet.example.simple
#    later updated to new Facet API)
# ====================================================================

usage = """
  usage: python FacetExample.py [index | simple | drilldown]
  where
    'index' => create index for faceted search
    'simple' => run simple faceted search
    'drilldown' => run faceted search with drilldown
"""

INDEX_DIR = "FacetExample.Index"
TAXONOMY_DIR = "FacetExample.Taxonomy"

# NOTE: 'lucene' is imported before the java.* / org.apache.* packages,
# exactly as in the original file; keep this order.
import os, sys, lucene

from java.nio.file import Paths
from java.lang import System
from java.text import DecimalFormat
from java.util import Arrays

from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.search import IndexSearcher, TermQuery, MatchAllDocsQuery
from org.apache.lucene.store import FSDirectory, SimpleFSDirectory
from org.apache.lucene.index import (IndexWriter, IndexReader,
                                     DirectoryReader, Term,
                                     IndexWriterConfig)
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.facet import DrillSideways, DrillDownQuery
from org.apache.lucene.facet import (Facets, FacetField, FacetResult,
                                     FacetsConfig, FacetsCollector)
from org.apache.lucene.facet.taxonomy import FastTaxonomyFacetCounts
from org.apache.lucene.facet.taxonomy.directory import (DirectoryTaxonomyWriter,
                                                        DirectoryTaxonomyReader)

# -----------------------------------------------------------------------------
# SimpleUtils: sample data shared by the indexer and the searcher.

# Names of the regular (non-facet) document fields
TITLE = "title"
TEXT = "text"

# sample documents text (for the text field).
docTexts = [
    "The white car is the one I want.",           # doc nr.0
    "The white dog does not belong to anyone."    # doc nr.1
]

# sample documents titles (for the title field).
docTitles = [
    "white car",  # doc nr.0
    "white dog",  # doc nr.1
]

# Authors: authors[n] == author of the n-th document
# example for simple, single-value facet
authors = [
    "Bob",   # doc nr.0
    "Lisa"   # doc nr.1
]

# Categories: categories[D][N] == category-path no. N for document no. D.
# example for hierarchical multi-value facet
categories = [
    [["root", "a", "f1"], ["root", "a", "f2"]],  # doc nr.0
    [["root", "a", "f1"], ["root", "a", "f3"]]   # doc nr.1
]

# samples for (drilldown) search
searchValues = ['white', 'car']
drilldownCategories = [["root", "a", "f1"], ["root", "a", "f2"]]


# -----------------------------------------------------------------------------
# Sample indexer creates an index, and adds to it sample documents and facets.

class SimpleIndexer(object):
    """Builds the sample search index together with its taxonomy index."""

    @classmethod
    def index(cls, indexDir, taxoDir, facets_config):
        """Create an index, and add to it the sample documents and facets.

        indexDir       Directory in which the index should be created.
        taxoDir        Directory in which the taxonomy index should be created.
        facets_config  FacetsConfig used to translate the FacetFields of each
                       document into index fields and taxonomy entries.

        Both directories are opened in CREATE mode, i.e. any existing
        content is replaced. Returns None; prints the number of documents
        that were indexed.
        """
        # create and open an index writer
        config = IndexWriterConfig(WhitespaceAnalyzer())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        try:
            # create and open a taxonomy writer
            taxo = DirectoryTaxonomyWriter(taxoDir,
                                           IndexWriterConfig.OpenMode.CREATE)
            try:
                nDocsAdded = 0
                # loop over sample documents (the sample lists are parallel)
                for title, text, author, facets in zip(docTitles, docTexts,
                                                       authors, categories):
                    # create a plain Lucene document and add some regular
                    # Lucene fields to it
                    doc = Document()
                    doc.add(TextField(TITLE, title, Field.Store.YES))
                    doc.add(TextField(TEXT, text, Field.Store.NO))
                    # ... and use the FacetField class for adding facet
                    # fields to the Lucene document (and via FacetsConfig
                    # to the taxonomy index)
                    doc.add(FacetField("Author", author))
                    for f in facets:
                        doc.add(FacetField("Categories", f))
                    # finally add the document to the index
                    iw.addDocument(facets_config.build(taxo, doc))
                    nDocsAdded += 1
            finally:
                # always release the taxonomy writer, even when indexing
                # failed half-way through
                taxo.close()
        finally:
            # always release the index writer (and its write lock)
            iw.close()
        # both writers are closed - all modifications are now safely in the
        # provided directories: indexDir and taxoDir.
        print("Indexed %d documents with facets." % nDocsAdded)


# -----------------------------------------------------------------------------
# SimpleSearcher searches index with facets.

class SimpleSearcher(object):
    """Runs the sample faceted searches and prints the facet counts."""

    @classmethod
    def _printFacetResult(cls, label, facet_result):
        """Print the child count and all label/value pairs of a FacetResult."""
        # two blanks after the colon reproduce the output of the former
        # Python 2 statement:  print "<label>: ", childCount
        print("%s:  %s" % (label, facet_result.childCount))
        for lv in facet_result.labelValues:
            print(" '%s' (%s)" % (lv.label, lv.value))

    @classmethod
    def searchWithFacets(cls, indexReader, taxoReader, facets_config):
        """
        Search an index with facets over all documents ("browsing").
        return a list of FacetResult instances
        """
        # MatchAllDocsQuery is for "browsing" (counts facets for all
        # non-deleted docs in the index)
        return cls.searchWithQuery(MatchAllDocsQuery(),
                                   indexReader, taxoReader, facets_config)

    @classmethod
    def searchWithTerm(cls, query, indexReader, taxoReader, facets_config):
        """
        Search an index with facets by using a simple term query.
        'query' is the term text that is looked up in the TEXT field.
        return a list of FacetResult instances
        """
        term_query = TermQuery(Term(TEXT, query))
        return cls.searchWithQuery(term_query,
                                   indexReader, taxoReader, facets_config)

    @classmethod
    def searchWithQuery(cls, query, indexReader, taxoReader, facets_config):
        """
        Search an index with facets for a given query.
        Prints the top children of the "Categories" dimension, of the
        "Categories" path root/a, and of the "Author" dimension.
        return a list of FacetResult instances (only the non-empty ones)
        """
        # prepare searcher to search against
        searcher = IndexSearcher(indexReader)
        # create a FacetsCollector to use in our faceted search:
        facets_collector = FacetsCollector()
        FacetsCollector.search(searcher, query, 10, facets_collector)
        # Count both "Categories" and "Author" dimensions
        facets = FastTaxonomyFacetCounts(taxoReader, facets_config,
                                         facets_collector)

        # (printed label, (dimension, path components...))
        requests = (
            ("Categories", ("Categories",)),
            ("Root-a-Categories", ("Categories", "root", "a")),
            ("Author", ("Author",)),
        )
        results = []
        for label, dim_and_path in requests:
            facet_result = facets.getTopChildren(10, *dim_and_path)
            # nothing is returned when there is nothing to count for
            # this dimension/path - skip it
            if facet_result is not None:
                results.append(facet_result)
                cls._printFacetResult(label, facet_result)
        return results

    @classmethod
    def searchWithDrillDown(cls, drilldownCategory, indexReader, taxoReader,
                            facets_config):
        """
        Search an index with facets drill-down.
        drilldownCategory is the "Categories" path (list of path
        components) to drill down on.
        return the FacetResult of the "Author" dimension, or None when
        the drill-down yields no "Author" counts
        """
        # User drills down on 'Categories' (e.g. "root/a/f1") and we
        # return facets for 'Author'
        searcher = IndexSearcher(indexReader)
        # Passing no baseQuery means we drill down on all documents
        # ("browse only"):
        query = DrillDownQuery(facets_config)
        # Now user drills down on the given 'Categories' path:
        query.add("Categories", drilldownCategory)
        facets_collector = FacetsCollector()
        FacetsCollector.search(searcher, query, 10, facets_collector)
        # Retrieve results
        facets = FastTaxonomyFacetCounts(taxoReader, facets_config,
                                         facets_collector)
        facet_result = facets.getTopChildren(10, "Author")
        # same guard as in searchWithQuery: the result may be missing
        if facet_result is not None:
            cls._printFacetResult("Author", facet_result)
        return facet_result


# -----------------------------------------------------------------------------

class FacetExample(object):
    """Command line driver: creates the sample index and runs the searches."""

    def __init__(self, directory):
        """Open the index and taxonomy directories below 'directory'
        and set up the facets configuration."""
        self.directory = directory
        # create Directories for the search index and for the taxonomy
        # index on disk (an in-memory Directory implementation could be
        # substituted here)
        self.indexDir = FSDirectory.open(
            Paths.get(os.path.join(self.directory, INDEX_DIR)))
        self.taxoDir = FSDirectory.open(
            Paths.get(os.path.join(self.directory, TAXONOMY_DIR)))
        # FacetsConfig: "Categories" holds several hierarchical paths
        # per document
        self.facets_config = FacetsConfig()
        self.facets_config.setHierarchical("Categories", True)
        self.facets_config.setMultiValued("Categories", True)

    def createIndex(self):
        """Index the sample documents."""
        SimpleIndexer.index(self.indexDir, self.taxoDir, self.facets_config)

    def runSimple(self):
        """Run a term search for every entry of searchValues, then a
        search over all documents.

        Returns the list of FacetResult instances of the final
        (all documents) search.
        """
        # open readers; the try/finally blocks make sure that they are
        # closed again even if a search fails
        taxo = DirectoryTaxonomyReader(self.taxoDir)
        try:
            indexReader = DirectoryReader.open(self.indexDir)
            try:
                for term in searchValues:
                    print("\nsearch by term '%s' ..." % term)
                    SimpleSearcher.searchWithTerm(term, indexReader, taxo,
                                                  self.facets_config)
                print("\nsearch all documents ...")
                facetRes = SimpleSearcher.searchWithFacets(
                    indexReader, taxo, self.facets_config)
            finally:
                indexReader.close()
        finally:
            taxo.close()
        # return result
        return facetRes

    def runDrillDown(self):
        """Run a drill-down search for every entry of drilldownCategories.

        Returns the FacetResult of the last drill-down (None when there
        was no drill-down category or the last one produced no result).
        """
        # stays None when drilldownCategories is empty
        facetRes = None
        # open readers; the try/finally blocks make sure that they are
        # closed again even if a search fails
        taxo = DirectoryTaxonomyReader(self.taxoDir)
        try:
            indexReader = DirectoryReader.open(self.indexDir)
            try:
                for drilldown in drilldownCategories:
                    print("search with drilldown: %s" % '/'.join(drilldown))
                    facetRes = SimpleSearcher.searchWithDrillDown(
                        drilldown, indexReader, taxo, self.facets_config)
            finally:
                indexReader.close()
        finally:
            taxo.close()
        # return result
        return facetRes

    @classmethod
    def main(cls, argv):
        """Parse the command line and run the requested steps.

        argv[0] is the script path (the indexes are created next to it),
        the remaining arguments select the steps: 'index', 'simple'
        and/or 'drilldown'. Without arguments, 'index' and 'simple' are
        run. An unknown argument terminates the program with the usage
        message.
        """
        baseDir = os.path.dirname(os.path.abspath(argv[0]))
        known = ("index", "simple", "drilldown")
        requested = set()
        for arg in argv[1:]:
            if arg not in known:
                sys.exit(usage + "\nunknown argument: %s" % arg)
            requested.add(arg)
        if len(argv) <= 1:
            # default: build the index and run the simple search
            requested.update(("index", "simple"))

        example = FacetExample(baseDir)
        if "index" in requested:
            example.createIndex()
        if "simple" in requested:
            example.runSimple()
        if "drilldown" in requested:
            example.runDrillDown()


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    FacetExample.main(sys.argv)