You are viewing a plain text version of this content. The canonical link for it is here.
Posted to pylucene-commits@lucene.apache.org by va...@apache.org on 2014/06/03 00:37:47 UTC

svn commit: r1599384 - in /lucene/pylucene/trunk: CHANGES Makefile samples/FacetExample.py

Author: vajda
Date: Mon Jun  2 22:37:46 2014
New Revision: 1599384

URL: http://svn.apache.org/r1599384
Log:
 - FacetExample.py fixed to work with Lucene 4.8 facets API (Thomas Koch)

Modified:
    lucene/pylucene/trunk/CHANGES
    lucene/pylucene/trunk/Makefile
    lucene/pylucene/trunk/samples/FacetExample.py

Modified: lucene/pylucene/trunk/CHANGES
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/CHANGES?rev=1599384&r1=1599383&r2=1599384&view=diff
==============================================================================
--- lucene/pylucene/trunk/CHANGES (original)
+++ lucene/pylucene/trunk/CHANGES Mon Jun  2 22:37:46 2014
@@ -1,3 +1,19 @@
+Version 4.8.0 ->
+----------------------
+ - FacetExample.py fixed to work with Lucene 4.8 facets API (Thomas Koch)
+ - 
+
+Version 4.7.2 -> 4.8.0
+----------------------
+ - using Lucene 4.8.0 sources
+ - PyLucene built with JCC 2.19
+ - Lucene now requires Java 7 at the minimum, Java 6 is no longer supported
+
+Version 4.6.1 -> 4.7.2
+----------------------
+ - using Lucene 4.7.2 sources
+ - PyLucene built with JCC 2.19
+
 Version 4.5.1 -> 4.6.1
 ----------------------
  - using Lucene 4.6.1 sources

Modified: lucene/pylucene/trunk/Makefile
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/Makefile?rev=1599384&r1=1599383&r2=1599384&view=diff
==============================================================================
--- lucene/pylucene/trunk/Makefile (original)
+++ lucene/pylucene/trunk/Makefile Mon Jun  2 22:37:46 2014
@@ -45,7 +45,7 @@ LUCENE=$(LUCENE_SRC)/lucene
 # limit.
 #
 
-# Mac OS X 10.6 (MacPorts 1.8.0 64-bit Python 2.7, Java 1.6)
+# Mac OS X 10.9 (64-bit Python 2.7, Java 1.8)
 PREFIX_PYTHON=/Users/vajda/apache/pylucene/_install
 ANT=/Users/vajda/tmp/apache-ant-1.9.3/bin/ant
 PYTHON=$(PREFIX_PYTHON)/bin/python

Modified: lucene/pylucene/trunk/samples/FacetExample.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/FacetExample.py?rev=1599384&r1=1599383&r2=1599384&view=diff
==============================================================================
--- lucene/pylucene/trunk/samples/FacetExample.py (original)
+++ lucene/pylucene/trunk/samples/FacetExample.py Mon Jun  2 22:37:46 2014
@@ -15,8 +15,9 @@
 # Author: Thomas Koch
 #
 # FacetExample.py - a simple Facet example for PyLucene
-#   (based on the Java counterpart from
-#    package org.apache.lucene.facet.example.simple)
+#   (originally based on the Java counterpart from
+#    package org.apache.lucene.facet.example.simple
+#    later updated to new Facet API)
 # ====================================================================
 
 usage = """
@@ -39,213 +40,183 @@ from java.util import Arrays
 
 from org.apache.lucene.util import Version
 from org.apache.lucene.analysis.core import WhitespaceAnalyzer
-from org.apache.lucene.analysis.standard import StandardAnalyzer
-from org.apache.lucene.search import Query, TermQuery, Sort, \
-    TopScoreDocCollector, MultiCollector, IndexSearcher,SortField, \
-    TopFieldCollector, MatchAllDocsQuery, BooleanQuery, BooleanClause
-from org.apache.lucene.queryparser.classic import QueryParser
+from org.apache.lucene.search import IndexSearcher, TermQuery, MatchAllDocsQuery
 from org.apache.lucene.store import FSDirectory, SimpleFSDirectory
-from org.apache.lucene.index import IndexWriter, IndexReader, \
-    IndexWriterConfig, DirectoryReader, Term
+from org.apache.lucene.index import (IndexWriter, IndexReader,
+                                     DirectoryReader, Term,
+                                     IndexWriterConfig)
 from org.apache.lucene.document import Document, Field, TextField
-from org.apache.lucene.facet.index import FacetFields
-from org.apache.lucene.facet.taxonomy import CategoryPath
-from org.apache.lucene.facet.taxonomy.directory import \
-    DirectoryTaxonomyWriter, DirectoryTaxonomyReader
-from org.apache.lucene.facet.params import FacetIndexingParams, \
-    FacetSearchParams
-from org.apache.lucene.facet.search import FacetRequest, \
-    FacetResult, FacetResultNode, CountFacetRequest, \
-    DrillDownQuery, FacetsCollector
-
+from org.apache.lucene.facet import DrillSideways, DrillDownQuery
+from org.apache.lucene.facet import (Facets, FacetField, FacetResult,
+                                     FacetsConfig, FacetsCollector)
+from org.apache.lucene.facet.taxonomy import FastTaxonomyFacetCounts
+from org.apache.lucene.facet.taxonomy.directory import (DirectoryTaxonomyWriter,
+                                                        DirectoryTaxonomyReader)
 
 # -----------------------------------------------------------------------------
 # SimpleUtils:
 # Documents title field
 TITLE = "title"
-TEXT = "text";
-
+TEXT = "text"
 docTexts = [
-    "the white car is the one I want.",
-    "the white dog does not belong to anyone.",
+    "The white car is the one I want.",           # doc nr.0
+    "The white dog does not belong to anyone."    # doc nr.1
 ]
 
 # sample documents titles (for the title field).
-
 docTitles = [
     "white car",  # doc nr.0
     "white dog",  # doc nr.1
 ]
 
-# Categories: categories[D][N] == category-path no. N for document no. D.
+# Authors: author[n] ==  Author of n-th document
+# example for simple, single-value facet
+authors = [
+    "Bob",   # doc nr.0
+    "Lisa"   # doc nr.1
+]
 
+# Categories: categories[D][N] == category-path no. N for document no. D.
+# example for hierarchical multi-value facet
 categories = [
     [["root","a","f1"], ["root","a","f2"]], # doc nr.0
     [["root","a","f1"], ["root","a","f3"]]  # doc nr.1
 ]
 
+# samples for (drilldown) search
+searchValues = ['white', 'car']
+drilldownCategories = [["root","a","f1"],  ["root","a","f2"]]
+
 # -----------------------------------------------------------------------------
-# port of org.apache.lucene.facet.example.simple from java to python
 # Sample indexer creates an index, and adds to it sample documents and facets.
 
 class SimpleIndexer(object):
 
-    def index (cls, indexDir, taxoDir):
+    def index (cls, indexDir, taxoDir, facets_config):
         """Create an index, and adds to it sample documents and facets.
         indexDir Directory in which the index should be created.
         taxoDir Directory in which the taxonomy index should be created.
         """
         # create and open an index writer
-        from org.apache.lucene.util import Version
-        config = IndexWriterConfig(Version.LUCENE_42,
-                                   WhitespaceAnalyzer(Version.LUCENE_42))
+        config = IndexWriterConfig(Version.LUCENE_48,
+                                   WhitespaceAnalyzer(Version.LUCENE_48))
         config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
         iw = IndexWriter(indexDir, config)
         # create and open a taxonomy writer
         taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
-        # FacetFields is a utility class for adding facet fields to a document:
-        facet_fields = FacetFields(taxo)
-
         # loop over sample documents
         nDocsAdded = 0
         nFacetsAdded = 0
         for docNum in range(len(docTexts)):
-            # obtain the sample facets for current document
-            facets = categories[docNum]
-            facetList = [CategoryPath(f) for f in facets]
-            # NOTE: setCategoryPaths() requires an Iterable, so need to convert the
-            #       Python list in order to to pass a proper argument to setCategoryPaths.
-            #       We use java.util.Arrays (via JCC) to create a Java List:
-            facetList = Arrays.asList(facetList)
-
-            # NOTE: we could use lucene.collections here as well in order to convert our
-            # Python list to a Java based list using the JavaList class (JavaList implements
-            # java.util.List around a Python list instance it wraps):
-            #  from lucene.collections import JavaList
-            #  facetList = JavaList(facetList)
-
             # create a plain Lucene document and add some regular Lucene fields to it
             doc = Document()
             doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
             doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
-            # use the FacetFields utility class for adding facet fields (i.e. the categories)
-            # to the document (and, as required, to the taxonomy index)
-            facet_fields.addFields(doc, facetList)
+            # obtain the sample facets for current document
+            facets = categories[docNum]
+            author = authors[docNum]
+            # ... and use the FacetField class for adding facet fields to
+            # the Lucene document (and via FacetsConfig to the taxonomy index)
+            doc.add(FacetField("Author", [author]));
+            for f in facets:
+                doc.add(FacetField("Categories", f))
             # finally add the document to the index
-            iw.addDocument(doc)
-            nDocsAdded +=1
-            nFacetsAdded += facetList.size()
-        # end for
-
-        # commit changes.
-        # we commit changes to the taxonomy index prior to committing them to the search index.
-        # this is important, so that all facets referred to by documents in the search index
-        # will indeed exist in the taxonomy index.
-        taxo.commit()
-        iw.commit()
+            iw.addDocument(facets_config.build(taxo, doc));
+            nDocsAdded += 1
 
         # close the taxonomy index and the index - all modifications are
         # now safely in the provided directories: indexDir and taxoDir.
-        taxo.close()
         iw.close()
-        print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
+        taxo.close()
+        print "Indexed %d documents with facets." % nDocsAdded
 
     index = classmethod(index)
 
 # -----------------------------------------------------------------------------
-# port of org.apache.lucene.facet.example.simple from java to python
 # SimpleSearcer searches index with facets.
 
 class SimpleSearcher(object):
 
-    def searchWithFacets(cls, indexReader, taxoReader):
+    def searchWithFacets(cls, indexReader, taxoReader, facets_config):
         """
         Search an index with facets.
-        returns a List<FacetResult>
+        return a list of FacetResult instances
         """
-        facetRequest = CountFacetRequest(CategoryPath(["root","a"]), 10)
-        return cls.searchWithRequest(indexReader, taxoReader, None, facetRequest)
+        # MatchAllDocsQuery is for "browsing" (counts facets for all non-deleted docs in the index)
+        query = MatchAllDocsQuery()
+        return cls.searchWithQuery(query, indexReader, taxoReader, facets_config)
 
-    def searchWithRequest(cls, indexReader, taxoReader, indexingParams, facetRequest):
+    def searchWithTerm(cls, query, indexReader, taxoReader, facets_config):
         """
-        Search an index with facets for given facet requests.
-        returns a List<FacetResult>
+        Search an index with facets by using simple term query
+        return a list of FacetResult instances
         """
-        query = TermQuery(Term(TEXT, "white"))
-        return cls.searchWithRequestAndQuery(query, indexReader, taxoReader,
-                                         indexingParams, facetRequest)
+        query = TermQuery(Term(TEXT, query))
+        return cls.searchWithQuery(query, indexReader, taxoReader, facets_config)
 
-    def searchWithRequestAndQuery(cls, query, indexReader, taxoReader,
-                                  indexingParams, facetRequest):
+    def searchWithQuery(cls, query, indexReader, taxoReader, facets_config):
         """
-        Search an index with facets for given query and facet requests.
-        returns a List<FacetResult>
+        Search an index with facets for a given query
+        return a list of FacetResult instances
         """
         # prepare searcher to search against
         searcher = IndexSearcher(indexReader)
-        # collect matching documents into a collector
-        topDocsCollector = TopScoreDocCollector.create(10, True)
-        if not indexingParams:
-            indexingParams = FacetIndexingParams.DEFAULT
-
-        # Faceted search parameters indicate which facets are we interested in
-        facetRequests = [facetRequest,]
-        facetRequests = Arrays.asList(facetRequests)
-        # Add the facet request of interest to the search params:
-        facetSearchParams = FacetSearchParams(indexingParams, facetRequests)
-        # and create a FacetsCollector to use in our facetted search:
-        facetsCollector = FacetsCollector.create(facetSearchParams, indexReader, taxoReader)
-        # perform documents search and facets accumulation
-        searcher.search(query, MultiCollector.wrap([topDocsCollector, facetsCollector]))
-        print "\nFound %d Documents for query=%s" % (topDocsCollector.totalHits,
-                                                     query.toString().encode('utf-8'))
-        # Obtain facets results and print them
-        res = facetsCollector.getFacetResults()
-        i = 0
-        for facetResult in res:
-            print "Result #%d has %d descendants" % (i, facetResult.getNumValidDescendants())
-            print "Result #%d : %s" % (i, facetResult)
-            i += 1
-
-        return res
+        # create a FacetsCollector to use in our faceted search:
+        facets_collector = FacetsCollector()
+        FacetsCollector.search(searcher, query, 10, facets_collector)
+        # Count both "Categories" and "Author" dimensions
+        facets = FastTaxonomyFacetCounts(taxoReader, facets_config, facets_collector)
+        results = []
+
+        facet_result = facets.getTopChildren(10, "Categories", [])
+        if facet_result:
+            results.append(facet_result)
+            print  "Categories: ", facet_result.childCount
+            for  lv in facet_result.labelValues:
+                print " '%s' (%s)"  % (lv.label, lv.value)
+
+        facet_result = facets.getTopChildren(10, "Categories", ["root","a"])
+        if facet_result:
+            results.append(facet_result)
+            print  "Root-a-Categories: ", facet_result.childCount
+            for  lv in facet_result.labelValues:
+                print " '%s' (%s)"  % (lv.label, lv.value)
+
+        facet_result = facets.getTopChildren(10, "Author", [])
+        if facet_result:
+            results.append(facet_result)
+            print  "Author: ", facet_result.childCount
+            for  lv in facet_result.labelValues:
+                print " '%s' (%s)"  % (lv.label, lv.value)
 
+        return results
 
-    def searchWithDrillDown(cls, indexReader, taxoReader, indexingParams=None):
+    def searchWithDrillDown(cls, drilldownCategory, indexReader, taxoReader, facets_config):
         """
         Search an index with facets drill-down.
-        returns a List<FacetResult>
+        return a list of FacetResult instances
         """
-        # base query the user is interested in
-        baseQuery = TermQuery(Term(TEXT, "white"))
-        if not indexingParams:
-            indexingParams = FacetIndexingParams.DEFAULT
-
-        # facet of interest
-        facetRequest = CountFacetRequest(CategoryPath(["root","a"]), 10)
-        # initial search - all docs matching the base query will contribute to the accumulation
-        res1 = cls.searchWithRequest(indexReader, taxoReader, None, facetRequest)
-        # a single result (because there was a single request)
-        fres = res1.get(0)
-        # assume the user is interested in the second sub-result
-        # (just take the second sub-result returned by the iterator - we know there are 3 results!)
-        subResults = fres.getFacetResultNode().subResults
-        resultNode = FacetResultNode.cast_(subResults.get(1)) # 2nd subresult
-        categoryOfInterest = resultNode.label
-        # turn the base query into a drill-down query for the category of interest
-        # first create a new  DrillDownQuery over the given base query.
-        query2 = DrillDownQuery(indexingParams, baseQuery)
-        # next the categories of interest are added to the DrillDownQuery
-        query2.add([categoryOfInterest,])
-        # that's it - search with the new query and we're done!
-        # only documents both matching the base query AND containing the
-        # category of interest will contribute to the new accumulation
-        return cls.searchWithRequestAndQuery(query2, indexReader, taxoReader,
-                                             indexingParams, facetRequest)
+        #  User drills down on 'Categories' "root/a/f1" and we return facets for 'Author'
+        searcher = IndexSearcher(indexReader)
+        #  Passing no baseQuery means we drill down on all documents ("browse only"):
+        query =  DrillDownQuery(facets_config);
+        # Now user drills down on the given 'Categories' path:
+        query.add("Categories",  drilldownCategory)
+        facets_collector = FacetsCollector()
+        FacetsCollector.search(searcher, query, 10, facets_collector    )
+        # Retrieve results
+        facets =  FastTaxonomyFacetCounts(taxoReader, facets_config, facets_collector)
+        facet_result = facets.getTopChildren(10, "Author", [])
+        print  "Author: ", facet_result.childCount
+        for  lv in facet_result.labelValues:
+            print " '%s' (%s)"  % (lv.label, lv.value)
+
+        return facet_result
 
 
     searchWithFacets = classmethod(searchWithFacets)
-    searchWithRequest = classmethod(searchWithRequest)
-    searchWithRequestAndQuery = classmethod(searchWithRequestAndQuery)
+    searchWithTerm = classmethod(searchWithTerm)
+    searchWithQuery = classmethod(searchWithQuery)
     searchWithDrillDown = classmethod(searchWithDrillDown)
 
 
@@ -263,17 +234,28 @@ class FacetExample(object):
                                                            INDEX_DIR)))
         self.taxoDir = FSDirectory.open(File(os.path.join(self.directory,
                                                           TAXONOMY_DIR)))
+        # FacetsConfig: declare "Categories" as a hierarchical, multi-valued dimension
+        self.facets_config = FacetsConfig()
+        self.facets_config.setHierarchical("Categories", True)
+        self.facets_config.setMultiValued("Categories", True)
+
 
     def createIndex(self):
         # index the sample documents
-        SimpleIndexer.index(self.indexDir, self.taxoDir)
+        SimpleIndexer.index(self.indexDir, self.taxoDir, self.facets_config)
 
     def runSimple(self):
         # open readers
         taxo = DirectoryTaxonomyReader(self.taxoDir)
         indexReader = DirectoryReader.open(self.indexDir)
-        # returns List<FacetResult>
-        facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo)
+
+        for term in searchValues:
+            print  "\nsearch by term '%s' ..." % term
+            facetRes = SimpleSearcher.searchWithTerm(term, indexReader, taxo,
+                                                       self.facets_config)
+        print  "\nsearch all documents  ..."
+        facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo,
+                                                   self.facets_config)
         # close readers
         taxo.close()
         indexReader.close()
@@ -284,7 +266,11 @@ class FacetExample(object):
         # open readers
         taxo = DirectoryTaxonomyReader(self.taxoDir)
         indexReader = DirectoryReader.open(self.indexDir)
-        facetRes = SimpleSearcher.searchWithDrillDown(indexReader, taxo)
+
+        for drilldown in drilldownCategories:
+            print "search with drilldown: %s" %  '/'.join(drilldown)
+            facetRes = SimpleSearcher.searchWithDrillDown(drilldown, indexReader,
+                                                          taxo, self.facets_config)
         # close readers
         taxo.close()
         indexReader.close()