You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2015/07/07 14:18:44 UTC

svn commit: r1689637 - in /lucene/dev/trunk/lucene: ./ join/src/java/org/apache/lucene/search/join/ join/src/test/org/apache/lucene/search/join/

Author: jpountz
Date: Tue Jul  7 12:18:44 2015
New Revision: 1689637

URL: http://svn.apache.org/r1689637
Log:
LUCENE-6589: Add CheckJoinIndex to validate index structure for joins.

Added:
    lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java   (with props)
    lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java   (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
    lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1689637&r1=1689636&r2=1689637&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Jul  7 12:18:44 2015
@@ -130,6 +130,10 @@ New Features
   more scalable iterator API (FiniteStringsIterator) (Markus Heiden
   via Mike McCandless)
 
+* LUCENE-6589: Add a new org.apache.lucene.search.join.CheckJoinIndex class
+  that can be used to validate that an index has an appropriate structure to
+  run join queries. (Adrien Grand)
+
 API Changes
 
 * LUCENE-6508: Simplify Lock api, there is now just 

Added: lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java?rev=1689637&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java (added)
+++ lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java Tue Jul  7 12:18:44 2015
@@ -0,0 +1,72 @@
+package org.apache.lucene.search.join;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.Bits;
+
+/** Utility class to check a block join index. */
+public class CheckJoinIndex {
+
+  private CheckJoinIndex() {}
+
+  /**
+   * Check that the given index is good to use for block joins: every non-empty segment must
+   * have at least one parent, end with a parent, and have consistent deletes within each block.
+   * @param reader the index to validate
+   * @param parentsFilter producer of the per-segment bit set that marks parent documents
+   * @throws IllegalStateException if the index does not have an appropriate structure
+   */
+  public static void check(IndexReader reader, BitSetProducer parentsFilter) throws IOException {
+    for (LeafReaderContext context : reader.leaves()) {
+      if (context.reader().maxDoc() == 0) {
+        continue;
+      }
+      final BitSet parents = parentsFilter.getBitSet(context);
+      if (parents == null || parents.cardinality() == 0) {
+        throw new IllegalStateException("Every segment should have at least one parent, but " + context.reader() + " does not have any");
+      }
+      if (parents.get(context.reader().maxDoc() - 1) == false) {
+        throw new IllegalStateException("The last document of a segment must always be a parent, but " + context.reader() + " has a child as a last doc");
+      }
+      final Bits liveDocs = context.reader().getLiveDocs();
+      if (liveDocs != null) {
+        int prevParentDoc = -1;
+        DocIdSetIterator it = new BitSetIterator(parents, 0L);
+        for (int parentDoc = it.nextDoc(); parentDoc != DocIdSetIterator.NO_MORE_DOCS; parentDoc = it.nextDoc()) {
+          final boolean parentIsLive = liveDocs.get(parentDoc);
+          for (int child = prevParentDoc + 1; child != parentDoc; child++) { // children are the docs between the previous parent and this one
+            final boolean childIsLive = liveDocs.get(child);
+            if (parentIsLive != childIsLive) { // states differ: a live child implies a deleted parent, and vice versa
+              final String problem = childIsLive ? " is deleted but has a live child document " : " is live but has a deleted child document ";
+              throw new IllegalStateException("Parent doc " + parentDoc + " of segment " + context.reader() + problem + child);
+            }
+          }
+          prevParentDoc = parentDoc;
+        }
+      }
+    }
+  }
+
+}

Modified: lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java?rev=1689637&r1=1689636&r2=1689637&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java (original)
+++ lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java Tue Jul  7 12:18:44 2015
@@ -126,18 +126,12 @@ public class TestBlockJoin extends Lucen
     docs.add(makeResume("Frank", "United States"));
     w.addDocuments(docs);
     w.commit();
-    int num = atLeast(10); // produce a segment that doesn't have a value in the docType field
-    for (int i = 0; i < num; i++) {
-      docs.clear();
-      docs.add(makeJob("java", 2007));
-      w.addDocuments(docs);
-    }
     
     IndexReader r = DirectoryReader.open(w, random().nextBoolean());
     w.close();
-    assertTrue(r.leaves().size() > 1);
     IndexSearcher s = new IndexSearcher(r);
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+    CheckJoinIndex.check(r, parentsFilter);
 
     BooleanQuery.Builder childQuery = new BooleanQuery.Builder();
     childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
@@ -190,6 +184,7 @@ public class TestBlockJoin extends Lucen
 
     // Create a filter that defines "parent" documents in the index - in this case resumes
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+    CheckJoinIndex.check(r, parentsFilter);
 
     // Define child document criteria (finds an example of relevant work experience)
     BooleanQuery.Builder childQuery = new BooleanQuery.Builder();
@@ -280,6 +275,7 @@ public class TestBlockJoin extends Lucen
     qc.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
 
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+    CheckJoinIndex.check(r, parentsFilter);
 
     int h1 = qc.hashCode();
     Query qw1 = qc.rewrite(r);
@@ -341,6 +337,7 @@ public class TestBlockJoin extends Lucen
 
     // Create a filter that defines "parent" documents in the index - in this case resumes
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+    CheckJoinIndex.check(r, parentsFilter);
 
     // Define child document criteria (finds an example of relevant work experience)
     BooleanQuery.Builder childQuery = new BooleanQuery.Builder();
@@ -650,6 +647,7 @@ public class TestBlockJoin extends Lucen
     final IndexSearcher joinS = new IndexSearcher(joinR);
 
     final BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isParent", "x")));
+    CheckJoinIndex.check(joinS.getIndexReader(), parentsFilter);
 
     final int iters = 200*RANDOM_MULTIPLIER;
 
@@ -1059,6 +1057,7 @@ public class TestBlockJoin extends Lucen
 
     // Create a filter that defines "parent" documents in the index - in this case resumes
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+    CheckJoinIndex.check(s.getIndexReader(), parentsFilter);
 
     // Define child document criteria (finds an example of relevant work experience)
     BooleanQuery.Builder childJobQuery = new BooleanQuery.Builder();
@@ -1140,6 +1139,7 @@ public class TestBlockJoin extends Lucen
     Query tq = new TermQuery(new Term("child", "1"));
     BitSetProducer parentFilter = new QueryBitSetProducer(
                               new TermQuery(new Term("parent", "1")));
+    CheckJoinIndex.check(s.getIndexReader(), parentFilter);
 
     ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ScoreMode.Avg);
     Weight weight = s.createNormalizedWeight(q, true);
@@ -1173,6 +1173,7 @@ public class TestBlockJoin extends Lucen
     Query tq = new TermQuery(new Term("child", "2"));
     BitSetProducer parentFilter = new QueryBitSetProducer(
                               new TermQuery(new Term("isparent", "yes")));
+    CheckJoinIndex.check(s.getIndexReader(), parentFilter);
 
     ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ScoreMode.Avg);
     Weight weight = s.createNormalizedWeight(q, true);
@@ -1205,6 +1206,7 @@ public class TestBlockJoin extends Lucen
 
     // Create a filter that defines "parent" documents in the index - in this case resumes
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+    CheckJoinIndex.check(s.getIndexReader(), parentsFilter);
 
     // Define child document criteria (finds an example of relevant work experience)
     BooleanQuery.Builder childQuery = new BooleanQuery.Builder();
@@ -1311,6 +1313,7 @@ public class TestBlockJoin extends Lucen
     IndexSearcher searcher = new ToParentBlockJoinIndexSearcher(r);
     Query childQuery = new TermQuery(new Term("childText", "text"));
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isParent", "yes")));
+    CheckJoinIndex.check(r, parentsFilter);
     ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
     BooleanQuery.Builder parentQuery = new BooleanQuery.Builder();
     parentQuery.add(childJoinQuery, Occur.SHOULD);
@@ -1381,6 +1384,7 @@ public class TestBlockJoin extends Lucen
     // never matches:
     Query childQuery = new TermQuery(new Term("childText", "bogus"));
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isParent", "yes")));
+    CheckJoinIndex.check(r, parentsFilter);
     ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
     BooleanQuery.Builder parentQuery = new BooleanQuery.Builder();
     parentQuery.add(childJoinQuery, Occur.SHOULD);
@@ -1446,6 +1450,7 @@ public class TestBlockJoin extends Lucen
     // illegally matches parent:
     Query childQuery = new TermQuery(new Term("parentText", "text"));
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isParent", "yes")));
+    CheckJoinIndex.check(r, parentsFilter);
     ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
     BooleanQuery.Builder parentQuery = new BooleanQuery.Builder();
     parentQuery.add(childJoinQuery, Occur.SHOULD);
@@ -1498,6 +1503,7 @@ public class TestBlockJoin extends Lucen
 
     // Create a filter that defines "parent" documents in the index - in this case resumes
     BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isparent", "yes")));
+    CheckJoinIndex.check(r, parentsFilter);
 
     Query parentQuery = new TermQuery(new Term("parent", "2"));
 
@@ -1628,4 +1634,5 @@ public class TestBlockJoin extends Lucen
     r.close();
     dir.close();
   }
+
 }

Modified: lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java?rev=1689637&r1=1689636&r2=1689637&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java (original)
+++ lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java Tue Jul  7 12:18:44 2015
@@ -185,12 +185,6 @@ public class TestBlockJoinSorting extend
     docs.add(document);
     w.addDocuments(docs);
 
-    // This doc will not be included, because it doesn't have nested docs
-    document = new Document();
-    document.add(new StringField("__type", "parent", Field.Store.NO));
-    document.add(new StringField("field1", "h", Field.Store.NO));
-    w.addDocument(document);
-
     docs.clear();
     document = new Document();
     document.add(new StringField("field2", "m", Field.Store.NO));
@@ -214,20 +208,10 @@ public class TestBlockJoinSorting extend
     w.addDocuments(docs);
     w.commit();
 
-    // Some garbage docs, just to check if the NestedFieldComparator can deal with this.
-    document = new Document();
-    document.add(new StringField("fieldXXX", "x", Field.Store.NO));
-    w.addDocument(document);
-    document = new Document();
-    document.add(new StringField("fieldXXX", "x", Field.Store.NO));
-    w.addDocument(document);
-    document = new Document();
-    document.add(new StringField("fieldXXX", "x", Field.Store.NO));
-    w.addDocument(document);
-
     IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(w.w, false));
     w.close();
     BitSetProducer parentFilter = new QueryBitSetProducer(new TermQuery(new Term("__type", "parent")));
+    CheckJoinIndex.check(searcher.getIndexReader(), parentFilter);
     BitSetProducer childFilter = new QueryBitSetProducer(new PrefixQuery(new Term("field2")));
     ToParentBlockJoinQuery query = new ToParentBlockJoinQuery(
         new PrefixQuery(new Term("field2")),
@@ -281,7 +265,7 @@ public class TestBlockJoinSorting extend
     topDocs = searcher.search(query, 5, sort);
     assertEquals(topDocs.totalHits, 7);
     assertEquals(5, topDocs.scoreDocs.length);
-    assertEquals(28, topDocs.scoreDocs[0].doc);
+    assertEquals(27, topDocs.scoreDocs[0].doc);
     assertEquals("o", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[0]).fields[0]).utf8ToString());
     assertEquals(23, topDocs.scoreDocs[1].doc);
     assertEquals("m", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[1]).fields[0]).utf8ToString());
@@ -308,7 +292,7 @@ public class TestBlockJoinSorting extend
     assertEquals(5, topDocs.scoreDocs.length);
     assertEquals(23, topDocs.scoreDocs[0].doc);
     assertEquals("m", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[0]).fields[0]).utf8ToString());
-    assertEquals(28, topDocs.scoreDocs[1].doc);
+    assertEquals(27, topDocs.scoreDocs[1].doc);
     assertEquals("m", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[1]).fields[0]).utf8ToString());
     assertEquals(11, topDocs.scoreDocs[2].doc);
     assertEquals("g", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[2]).fields[0]).utf8ToString());

Added: lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java?rev=1689637&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java (added)
+++ lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java Tue Jul  7 12:18:44 2015
@@ -0,0 +1,143 @@
+package org.apache.lucene.search.join;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.NoMergePolicy;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.MatchNoDocsQuery;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestCheckJoinIndex extends LuceneTestCase {
+
+  /** A segment in which no document matches the parents filter must be rejected. */
+  public void testNoParent() throws IOException {
+    final Directory dir = newDirectory();
+    final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    final int numDocs = TestUtil.nextInt(random(), 1, 3);
+    for (int i = 0; i < numDocs; ++i) {
+      w.addDocument(new Document());
+    }
+    final IndexReader reader = w.getReader();
+    w.close();
+    BitSetProducer parentsFilter = new QueryBitSetProducer(new MatchNoDocsQuery());
+    assertRejected(reader, dir, parentsFilter);
+  }
+
+  /** A block of child documents that has no parent must be rejected. */
+  public void testOrphans() throws IOException {
+    final Directory dir = newDirectory();
+    final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+    {
+      // Add a first valid block
+      List<Document> block = new ArrayList<>();
+      final int numChildren = TestUtil.nextInt(random(), 0, 3);
+      for (int i = 0; i < numChildren; ++i) {
+        block.add(new Document());
+      }
+      Document parent = new Document();
+      parent.add(new StringField("parent", "true", Store.NO));
+      block.add(parent);
+      w.addDocuments(block);
+    }
+
+    {
+      // Then a block with no parent
+      List<Document> block = new ArrayList<>();
+      final int numChildren = TestUtil.nextInt(random(), 1, 3);
+      for (int i = 0; i < numChildren; ++i) {
+        block.add(new Document());
+      }
+      w.addDocuments(block);
+    }
+
+    final IndexReader reader = w.getReader();
+    w.close();
+    BitSetProducer parentsFilter = new QueryBitSetProducer(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
+    assertRejected(reader, dir, parentsFilter);
+  }
+
+  /**
+   * A block in which only the parent, or only one of the children, was deleted must be rejected.
+   */
+  public void testInconsistentDeletes() throws IOException {
+    final Directory dir = newDirectory();
+    final IndexWriterConfig iwc = newIndexWriterConfig();
+    iwc.setMergePolicy(NoMergePolicy.INSTANCE); // so that deletions don't trigger merges
+    final RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    List<Document> block = new ArrayList<>();
+    // At least one child is required: random().nextInt(0) below would throw, and a
+    // childless block cannot have deletes that disagree between parent and children.
+    final int numChildren = TestUtil.nextInt(random(), 1, 3);
+    for (int i = 0; i < numChildren; ++i) {
+      Document doc = new Document();
+      doc.add(new StringField("child", Integer.toString(i), Store.NO));
+      block.add(doc);
+    }
+    Document parent = new Document();
+    parent.add(new StringField("parent", "true", Store.NO));
+    block.add(parent);
+    w.addDocuments(block);
+
+    if (random().nextBoolean()) {
+      w.deleteDocuments(new Term("parent", "true"));
+    } else {
+      // delete any of the children
+      w.deleteDocuments(new Term("child", Integer.toString(random().nextInt(numChildren))));
+    }
+
+    final IndexReader reader = w.getReader();
+    w.close();
+
+    BitSetProducer parentsFilter = new QueryBitSetProducer(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
+    assertRejected(reader, dir, parentsFilter);
+  }
+
+  /**
+   * Asserts that {@link CheckJoinIndex#check} throws an {@link IllegalStateException} for the
+   * given reader and parents filter, then closes the reader and the directory in all cases.
+   */
+  private static void assertRejected(IndexReader reader, Directory dir, BitSetProducer parentsFilter) throws IOException {
+    try {
+      CheckJoinIndex.check(reader, parentsFilter);
+      // reaching this line means the invalid index was accepted
+      fail("Invalid index");
+    } catch (IllegalStateException e) {
+      // expected
+    } finally {
+      reader.close();
+      dir.close();
+    }
+  }
+
+}