You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2015/01/15 19:33:01 UTC

svn commit: r1652212 - in /lucene/dev/trunk/lucene/spatial/src: java/org/apache/lucene/spatial/ java/org/apache/lucene/spatial/prefix/ java/org/apache/lucene/spatial/prefix/tree/ test/org/apache/lucene/spatial/prefix/

Author: dsmiley
Date: Thu Jan 15 18:33:00 2015
New Revision: 1652212

URL: http://svn.apache.org/r1652212
Log:
LUCENE-5735: spatial PrefixTreeFacetCounter (for faceting on SpatialPrefixTrees)

Added:
    lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/tree/PrefixTreeFacetCounter.java   (with props)
Removed:
    lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/NumberRangePrefixTreeFacets.java
Modified:
    lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/NumberRangePrefixTreeStrategy.java
    lucene/dev/trunk/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java

Modified: lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/NumberRangePrefixTreeStrategy.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/NumberRangePrefixTreeStrategy.java?rev=1652212&r1=1652211&r2=1652212&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/NumberRangePrefixTreeStrategy.java (original)
+++ lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/NumberRangePrefixTreeStrategy.java Thu Jan 15 18:33:00 2015
@@ -29,10 +29,11 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReaderContext;
 import org.apache.lucene.queries.function.ValueSource;
-import org.apache.lucene.spatial.prefix.NumberRangePrefixTreeFacets;
+import org.apache.lucene.search.Filter;
 import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
+import org.apache.lucene.spatial.prefix.tree.Cell;
 import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree;
-import org.apache.lucene.util.Bits;
+import org.apache.lucene.spatial.prefix.tree.PrefixTreeFacetCounter;
 
 import static org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape;
 
@@ -76,13 +77,13 @@ public class NumberRangePrefixTreeStrate
   /** Calculates facets between {@code start} and {@code end} to a detail level one greater than that provided by the
    * arguments. For example providing March to October of 2014 would return facets to the day level of those months.
    * This is just a convenience method.
-   * @see #calcFacets(IndexReaderContext, Bits, Shape, int)
+   * @see #calcFacets(IndexReaderContext, Filter, Shape, int)
    */
-  public Facets calcFacets(IndexReaderContext context, final Bits acceptDocs, UnitNRShape start, UnitNRShape end)
+  public Facets calcFacets(IndexReaderContext context, Filter filter, UnitNRShape start, UnitNRShape end)
       throws IOException {
-    Shape filter = getGrid().toRangeShape(start, end);
+    Shape facetRange = getGrid().toRangeShape(start, end);
     int detailLevel = Math.max(start.getLevel(), end.getLevel()) + 1;
-    return calcFacets(context, acceptDocs, filter, detailLevel);
+    return calcFacets(context, filter, facetRange, detailLevel);
   }
 
   /**
@@ -92,9 +93,50 @@ public class NumberRangePrefixTreeStrate
    * {@link org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape#getLevel()}.
    * Facet computation is implemented by navigating the underlying indexed terms efficiently.
    */
-  public Facets calcFacets(IndexReaderContext context, final Bits acceptDocs, Shape facetRange, int level)
+  public Facets calcFacets(IndexReaderContext context, Filter filter, Shape facetRange, final int level)
       throws IOException {
-    return NumberRangePrefixTreeFacets.compute(this, context, acceptDocs, facetRange, level);
+    final Facets facets = new Facets(level);
+    PrefixTreeFacetCounter.compute(this, context, filter, facetRange, level,
+        new PrefixTreeFacetCounter.FacetVisitor() {
+          Facets.FacetParentVal parentFacet;
+          UnitNRShape parentShape;
+
+          @Override
+          public void visit(Cell cell, int count) {
+            if (cell.getLevel() < level - 1) {//some ancestor of parent facet level, direct or distant
+              parentFacet = null;//reset
+              parentShape = null;//reset
+              facets.topLeaves += count;
+            } else if (cell.getLevel() == level - 1) {//parent
+              //set up FacetParentVal
+              setupParent((UnitNRShape) cell.getShape());
+              parentFacet.parentLeaves += count;
+            } else {//at facet level
+              UnitNRShape unitShape = (UnitNRShape) cell.getShape();
+              UnitNRShape unitShapeParent = unitShape.getShapeAtLevel(unitShape.getLevel() - 1);
+              if (parentFacet == null || !parentShape.equals(unitShapeParent)) {
+                setupParent(unitShapeParent);
+              }
+              //lazy init childCounts
+              if (parentFacet.childCounts == null) {
+                parentFacet.childCounts = new int[parentFacet.childCountsLen];
+              }
+              parentFacet.childCounts[unitShape.getValAtLevel(cell.getLevel())] += count;
+            }
+          }
+
+          private void setupParent(UnitNRShape unitShape) {
+            parentShape = unitShape.clone();
+            //Look for existing parentFacet (from previous segment), or create anew if needed
+            parentFacet = facets.parents.get(parentShape);
+            if (parentFacet == null) {//didn't find one; make a new one
+              parentFacet = new Facets.FacetParentVal();
+              parentFacet.childCountsLen = getGrid().getNumSubCells(parentShape);
+              facets.parents.put(parentShape, parentFacet);
+            }
+          }
+        });
+    return facets;
   }
 
   /** Facet response information */

Added: lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/tree/PrefixTreeFacetCounter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/tree/PrefixTreeFacetCounter.java?rev=1652212&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/tree/PrefixTreeFacetCounter.java (added)
+++ lucene/dev/trunk/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/tree/PrefixTreeFacetCounter.java Thu Jan 15 18:33:00 2015
@@ -0,0 +1,191 @@
+package org.apache.lucene.spatial.prefix.tree;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import com.spatial4j.core.shape.Shape;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.IndexReaderContext;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.spatial.prefix.AbstractVisitingPrefixTreeFilter;
+import org.apache.lucene.spatial.prefix.PrefixTreeStrategy;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SparseFixedBitSet;
+
+/**
+ * Computes facets on cells for {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy}.
+ *
+ * @lucene.experimental
+ */
+public class PrefixTreeFacetCounter {
+
+  /** A callback/visitor of facet counts. */
+  public static abstract class FacetVisitor {
+    /** Called at the start of the segment, if there is indexed data. */
+    public void startOfSegment() {}
+
+    /** Called for cells with a leaf, or cells at the target facet level.  {@code count} is greater than zero.
+     * When an ancestor cell is given with non-zero count, the count can be considered to be added to all cells
+     * below. You won't necessarily get a cell at level {@code facetLevel} if the indexed data is courser (bigger).
+     */
+    public abstract void visit(Cell cell, int count);
+  }
+
+  private PrefixTreeFacetCounter() {
+  }
+
+  /**
+   * Computes facets using a callback/visitor style design, allowing flexibility for the caller to determine what to do
+   * with each underlying count.
+   *
+   * @param strategy the prefix tree strategy (contains the field reference, grid, max levels)
+   * @param context the IndexReader's context
+   * @param filter a Filter to limit counted docs. For optimal performance, it's
+   *               {@link org.apache.lucene.search.DocIdSet#bits()} should be non-null. If no filter is provided, live
+   *               docs are counted.
+   * @param queryShape the shape to limit the range of facet counts to
+   * @param facetLevel the maximum depth (detail) of faceted cells
+   * @param facetVisitor the visitor/callback to receive the counts
+   */
+  public static void compute(PrefixTreeStrategy strategy, IndexReaderContext context, Filter filter,
+                             Shape queryShape, int facetLevel, FacetVisitor facetVisitor)
+      throws IOException {
+    //We collect per-leaf
+    for (final LeafReaderContext leafCtx : context.leaves()) {
+      //determine leaf acceptDocs Bits
+      Bits leafAcceptDocs;
+      if (filter == null) {
+        leafAcceptDocs = leafCtx.reader().getLiveDocs();//filter deleted
+      } else {
+        final DocIdSet docIdSet = filter.getDocIdSet(leafCtx, leafCtx.reader().getLiveDocs());
+        if (docIdSet == null) {
+          continue;//no docs in filter
+        }
+        leafAcceptDocs = docIdSet.bits();
+        if (leafAcceptDocs == null) {
+          final DocIdSetIterator iterator = docIdSet.iterator();
+          if (iterator == null) {
+            continue;//no docs in filter
+          }
+          //build bits from iterator (abnormal, hopefully, not expecting many docs)
+          SparseFixedBitSet bitSet = new SparseFixedBitSet(leafCtx.reader().maxDoc());
+          bitSet.or(iterator);
+          leafAcceptDocs = bitSet;
+        }
+      }
+
+      compute(strategy, leafCtx, leafAcceptDocs, queryShape, facetLevel, facetVisitor);
+    }
+  }
+
+  /** Lower-level per-leaf segment method. */
+  public static void compute(final PrefixTreeStrategy strategy, final LeafReaderContext context, final Bits acceptDocs,
+                             final Shape queryShape, final int facetLevel, final FacetVisitor facetVisitor)
+      throws IOException {
+    if (acceptDocs != null && acceptDocs.length() != context.reader().maxDoc()) {
+      throw new IllegalArgumentException(
+          "acceptDocs bits length " + acceptDocs.length() +" != leaf maxdoc " + context.reader().maxDoc());
+    }
+    final SpatialPrefixTree tree = strategy.getGrid();
+
+    //scanLevel is an optimization knob of AbstractVisitingPrefixTreeFilter. It's unlikely
+    // another scanLevel would be much faster and it tends to be a risky knob (can help a little, can hurt a ton).
+    // TODO use RPT's configured scan level?  Do we know better here?  Hard to say.
+    final int scanLevel = tree.getMaxLevels();
+
+    //AbstractVisitingPrefixTreeFilter is a Lucene Filter.  We don't need a filter; we use it for its great prefix-tree
+    // traversal code.  TODO consider refactoring if/when it makes sense (more use cases than this)
+    new AbstractVisitingPrefixTreeFilter(queryShape, strategy.getFieldName(), tree, facetLevel, scanLevel) {
+
+      @Override
+      public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException {
+        assert facetLevel == super.detailLevel;//same thing, FYI. (constant)
+
+        final boolean hasIndexedLeaves = !strategy.isPointsOnly();
+
+        return new VisitorTemplate(context, acceptDocs, hasIndexedLeaves) {
+
+          @Override
+          protected void start() throws IOException {
+            facetVisitor.startOfSegment();
+          }
+
+          @Override
+          protected DocIdSet finish() throws IOException {
+            return null;//unused;
+          }
+
+          @Override
+          protected boolean visit(Cell cell) throws IOException {
+            // At facetLevel...
+            if (cell.getLevel() == facetLevel) {
+              // Count docs
+              visitLeaf(cell);//we're not a leaf but we treat it as such at facet level
+              return false;//don't descend further; this is enough detail
+            }
+
+            // We optimize for discriminating filters (reflected in acceptDocs) and short-circuit if no
+            // matching docs. We could do this at all levels or never but the closer we get to the facet level, the
+            // higher the probability this is worthwhile. We do when docFreq == 1 because it's a cheap check, especially
+            // due to "pulsing" in the codec.
+            //TODO this opt should move to VisitorTemplate (which contains an optimization TODO to this effect)
+            if (cell.getLevel() == facetLevel - 1 || termsEnum.docFreq() == 1) {
+              if (!hasDocsAtThisTerm()) {
+                return false;
+              }
+            }
+            return true;
+          }
+
+          @Override
+          protected void visitLeaf(Cell cell) throws IOException {
+            final int count = countDocsAtThisTerm();
+            if (count > 0) {
+              facetVisitor.visit(cell, count);
+            }
+          }
+
+          private int countDocsAtThisTerm() throws IOException {
+            if (acceptDocs == null) {
+              return termsEnum.docFreq();
+            }
+            int count = 0;
+            docsEnum = termsEnum.docs(acceptDocs, docsEnum, DocsEnum.FLAG_NONE);
+            while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+              count++;
+            }
+            return count;
+          }
+
+          private boolean hasDocsAtThisTerm() throws IOException {
+            if (acceptDocs == null) {
+              return true;
+            }
+            docsEnum = termsEnum.docs(acceptDocs, docsEnum, DocsEnum.FLAG_NONE);
+            return (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+          }
+
+        }.getDocIdSet();
+      }
+    }.getDocIdSet(context, acceptDocs);
+  }
+}

Modified: lucene/dev/trunk/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java?rev=1652212&r1=1652211&r2=1652212&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java (original)
+++ lucene/dev/trunk/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java Thu Jan 15 18:33:00 2015
@@ -26,11 +26,8 @@ import java.util.List;
 import com.carrotsearch.randomizedtesting.annotations.Repeat;
 import com.spatial4j.core.shape.Shape;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.queries.TermsFilter;
+import org.apache.lucene.search.Filter;
 import org.apache.lucene.spatial.NumberRangePrefixTreeStrategy;
 import org.apache.lucene.spatial.NumberRangePrefixTreeStrategy.Facets;
 import org.apache.lucene.spatial.StrategyTestCase;
@@ -39,7 +36,6 @@ import org.apache.lucene.spatial.prefix.
 import org.apache.lucene.spatial.prefix.tree.DateRangePrefixTree;
 import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree;
 import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape;
-import org.apache.lucene.util.FixedBitSet;
 import org.junit.Before;
 import org.junit.Test;
 
@@ -64,7 +60,7 @@ public class NumberRangeFacetsTest exten
     randomCalWindowMs = Math.max(2000L, tmpCal.getTimeInMillis());
   }
 
-  @Repeat(iterations = 100)
+  @Repeat(iterations = 10000)
   @Test
   public void test() throws IOException {
     //generate test data
@@ -114,47 +110,40 @@ public class NumberRangeFacetsTest exten
         detailLevel = -1 * detailLevel;
       }
 
-      //Randomly pick a filter as Bits/acceptDocs
-      FixedBitSet acceptDocs = null;//the answer
+      //Randomly pick a filter
+      Filter filter = null;
       List<Integer> acceptFieldIds = new ArrayList<>();
       if (usually()) {
         //get all possible IDs into a list, random shuffle it, then randomly choose how many of the first we use to
         // replace the list.
         for (int i = 0; i < indexedShapes.size(); i++) {
-          if (indexedShapes.get(i) == null) {
+          if (indexedShapes.get(i) == null) { // we deleted this one
             continue;
           }
           acceptFieldIds.add(i);
         }
         Collections.shuffle(acceptFieldIds, random());
         acceptFieldIds = acceptFieldIds.subList(0, randomInt(acceptFieldIds.size()));
-        acceptDocs = new FixedBitSet(indexSearcher.getIndexReader().maxDoc());
-        //query for their Lucene docIds to put into acceptDocs
         if (!acceptFieldIds.isEmpty()) {
-          BooleanQuery acceptQuery = new BooleanQuery();
+          List<Term> terms = new ArrayList<>();
           for (Integer acceptDocId : acceptFieldIds) {
-            acceptQuery.add(new TermQuery(new Term("id", acceptDocId.toString())), BooleanClause.Occur.SHOULD);
+            terms.add(new Term("id", acceptDocId.toString()));
           }
-          final TopDocs topDocs = indexSearcher.search(acceptQuery, numIndexedShapes);
-
-          for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
-            acceptDocs.set(scoreDoc.doc);
-          }
-
+          filter = new TermsFilter(terms);
         }
       }
 
       //Lets do it!
       NumberRangePrefixTree.NRShape facetRange = tree.toRangeShape(tree.toShape(leftCal), tree.toShape(rightCal));
       Facets facets = ((NumberRangePrefixTreeStrategy) strategy)
-          .calcFacets(indexSearcher.getTopReaderContext(), acceptDocs, facetRange, detailLevel);
+          .calcFacets(indexSearcher.getTopReaderContext(), filter, facetRange, detailLevel);
 
       //System.out.println("Q: " + queryIdx + " " + facets);
 
       //Verify results. We do it by looping over indexed shapes and reducing the facet counts.
       Shape facetShapeRounded = facetRange.roundToLevel(detailLevel);
       for (int indexedShapeId = 0; indexedShapeId < indexedShapes.size(); indexedShapeId++) {
-        if (acceptDocs != null && !acceptFieldIds.contains(indexedShapeId)) {
+        if (filter != null && !acceptFieldIds.contains(indexedShapeId)) {
           continue;// this doc was filtered out via acceptDocs
         }
         Shape indexedShape = indexedShapes.get(indexedShapeId);