You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2015/07/07 14:18:44 UTC
svn commit: r1689637 - in /lucene/dev/trunk/lucene: ./
join/src/java/org/apache/lucene/search/join/
join/src/test/org/apache/lucene/search/join/
Author: jpountz
Date: Tue Jul 7 12:18:44 2015
New Revision: 1689637
URL: http://svn.apache.org/r1689637
Log:
LUCENE-6589: Add CheckJoinIndex to validate index structure for joins.
Added:
lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java (with props)
lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1689637&r1=1689636&r2=1689637&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Jul 7 12:18:44 2015
@@ -130,6 +130,10 @@ New Features
more scalable iterator API (FiniteStringsIterator) (Markus Heiden
via Mike McCandless)
+* LUCENE-6589: Add a new org.apache.lucene.search.join.CheckJoinIndex class
+ that can be used to validate that an index has an appropriate structure to
+ run join queries. (Adrien Grand)
+
API Changes
* LUCENE-6508: Simplify Lock api, there is now just
Added: lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java?rev=1689637&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java (added)
+++ lucene/dev/trunk/lucene/join/src/java/org/apache/lucene/search/join/CheckJoinIndex.java Tue Jul 7 12:18:44 2015
@@ -0,0 +1,72 @@
+package org.apache.lucene.search.join;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.Bits;
+
+/**
+ * Utility class to check a block join index.
+ *
+ * <p>All checks are performed independently on every segment (leaf) of the
+ * reader that is passed to {@link #check}.
+ */
+public class CheckJoinIndex {
+
+  // Static utility class: not meant to be instantiated.
+  private CheckJoinIndex() {}
+
+  /**
+   * Check that the given index is good to use for block joins.
+   *
+   * <p>For every non-empty segment this verifies that:
+   * <ul>
+   *   <li>the segment contains at least one parent document,</li>
+   *   <li>the last document of the segment is a parent, and</li>
+   *   <li>when the segment has deletions, every parent has the same liveness
+   *       (live or deleted) as each of the documents located between it and
+   *       the previous parent, i.e. its children.</li>
+   * </ul>
+   *
+   * @param reader the index to check
+   * @param parentsFilter produces, per segment, the bit set that marks parent documents
+   * @throws IllegalStateException if the index does not have an appropriate structure
+   * @throws IOException if an I/O error is raised while the segments are inspected
+   */
+  public static void check(IndexReader reader, BitSetProducer parentsFilter) throws IOException {
+    for (LeafReaderContext context : reader.leaves()) {
+      if (context.reader().maxDoc() == 0) {
+        // Nothing to validate in an empty segment.
+        continue;
+      }
+      final BitSet parents = parentsFilter.getBitSet(context);
+      if (parents == null || parents.cardinality() == 0) {
+        throw new IllegalStateException("Every segment should have at least one parent, but " + context.reader() + " does not have any");
+      }
+      // A trailing non-parent document would be a child without any parent after it.
+      if (parents.get(context.reader().maxDoc() - 1) == false) {
+        throw new IllegalStateException("The last document of a segment must always be a parent, but " + context.reader() + " has a child as a last doc");
+      }
+      final Bits liveDocs = context.reader().getLiveDocs();
+      if (liveDocs != null) {
+        // The segment has deletions: a block must be deleted as a whole or not at all.
+        int prevParentDoc = -1;
+        DocIdSetIterator it = new BitSetIterator(parents, 0L);
+        for (int parentDoc = it.nextDoc(); parentDoc != DocIdSetIterator.NO_MORE_DOCS; parentDoc = it.nextDoc()) {
+          final boolean parentIsLive = liveDocs.get(parentDoc);
+          // Children of parentDoc are the documents in (prevParentDoc, parentDoc).
+          for (int child = prevParentDoc + 1; child != parentDoc; child++) {
+            final boolean childIsLive = liveDocs.get(child);
+            if (parentIsLive != childIsLive) {
+              // Branch on the parent's state so that the message describes what was actually found.
+              if (parentIsLive) {
+                throw new IllegalStateException("Parent doc " + parentDoc + " of segment " + context.reader() + " is live but has a deleted child document " + child);
+              } else {
+                throw new IllegalStateException("Parent doc " + parentDoc + " of segment " + context.reader() + " is deleted but has a live child document " + child);
+              }
+            }
+          }
+          prevParentDoc = parentDoc;
+        }
+      }
+    }
+  }
+
+}
Modified: lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java?rev=1689637&r1=1689636&r2=1689637&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java (original)
+++ lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java Tue Jul 7 12:18:44 2015
@@ -126,18 +126,12 @@ public class TestBlockJoin extends Lucen
docs.add(makeResume("Frank", "United States"));
w.addDocuments(docs);
w.commit();
- int num = atLeast(10); // produce a segment that doesn't have a value in the docType field
- for (int i = 0; i < num; i++) {
- docs.clear();
- docs.add(makeJob("java", 2007));
- w.addDocuments(docs);
- }
IndexReader r = DirectoryReader.open(w, random().nextBoolean());
w.close();
- assertTrue(r.leaves().size() > 1);
IndexSearcher s = new IndexSearcher(r);
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+ CheckJoinIndex.check(r, parentsFilter);
BooleanQuery.Builder childQuery = new BooleanQuery.Builder();
childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
@@ -190,6 +184,7 @@ public class TestBlockJoin extends Lucen
// Create a filter that defines "parent" documents in the index - in this case resumes
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+ CheckJoinIndex.check(r, parentsFilter);
// Define child document criteria (finds an example of relevant work experience)
BooleanQuery.Builder childQuery = new BooleanQuery.Builder();
@@ -280,6 +275,7 @@ public class TestBlockJoin extends Lucen
qc.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+ CheckJoinIndex.check(r, parentsFilter);
int h1 = qc.hashCode();
Query qw1 = qc.rewrite(r);
@@ -341,6 +337,7 @@ public class TestBlockJoin extends Lucen
// Create a filter that defines "parent" documents in the index - in this case resumes
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+ CheckJoinIndex.check(r, parentsFilter);
// Define child document criteria (finds an example of relevant work experience)
BooleanQuery.Builder childQuery = new BooleanQuery.Builder();
@@ -650,6 +647,7 @@ public class TestBlockJoin extends Lucen
final IndexSearcher joinS = new IndexSearcher(joinR);
final BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isParent", "x")));
+ CheckJoinIndex.check(joinS.getIndexReader(), parentsFilter);
final int iters = 200*RANDOM_MULTIPLIER;
@@ -1059,6 +1057,7 @@ public class TestBlockJoin extends Lucen
// Create a filter that defines "parent" documents in the index - in this case resumes
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+ CheckJoinIndex.check(s.getIndexReader(), parentsFilter);
// Define child document criteria (finds an example of relevant work experience)
BooleanQuery.Builder childJobQuery = new BooleanQuery.Builder();
@@ -1140,6 +1139,7 @@ public class TestBlockJoin extends Lucen
Query tq = new TermQuery(new Term("child", "1"));
BitSetProducer parentFilter = new QueryBitSetProducer(
new TermQuery(new Term("parent", "1")));
+ CheckJoinIndex.check(s.getIndexReader(), parentFilter);
ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ScoreMode.Avg);
Weight weight = s.createNormalizedWeight(q, true);
@@ -1173,6 +1173,7 @@ public class TestBlockJoin extends Lucen
Query tq = new TermQuery(new Term("child", "2"));
BitSetProducer parentFilter = new QueryBitSetProducer(
new TermQuery(new Term("isparent", "yes")));
+ CheckJoinIndex.check(s.getIndexReader(), parentFilter);
ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ScoreMode.Avg);
Weight weight = s.createNormalizedWeight(q, true);
@@ -1205,6 +1206,7 @@ public class TestBlockJoin extends Lucen
// Create a filter that defines "parent" documents in the index - in this case resumes
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "resume")));
+ CheckJoinIndex.check(s.getIndexReader(), parentsFilter);
// Define child document criteria (finds an example of relevant work experience)
BooleanQuery.Builder childQuery = new BooleanQuery.Builder();
@@ -1311,6 +1313,7 @@ public class TestBlockJoin extends Lucen
IndexSearcher searcher = new ToParentBlockJoinIndexSearcher(r);
Query childQuery = new TermQuery(new Term("childText", "text"));
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isParent", "yes")));
+ CheckJoinIndex.check(r, parentsFilter);
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
BooleanQuery.Builder parentQuery = new BooleanQuery.Builder();
parentQuery.add(childJoinQuery, Occur.SHOULD);
@@ -1381,6 +1384,7 @@ public class TestBlockJoin extends Lucen
// never matches:
Query childQuery = new TermQuery(new Term("childText", "bogus"));
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isParent", "yes")));
+ CheckJoinIndex.check(r, parentsFilter);
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
BooleanQuery.Builder parentQuery = new BooleanQuery.Builder();
parentQuery.add(childJoinQuery, Occur.SHOULD);
@@ -1446,6 +1450,7 @@ public class TestBlockJoin extends Lucen
// illegally matches parent:
Query childQuery = new TermQuery(new Term("parentText", "text"));
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isParent", "yes")));
+ CheckJoinIndex.check(r, parentsFilter);
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
BooleanQuery.Builder parentQuery = new BooleanQuery.Builder();
parentQuery.add(childJoinQuery, Occur.SHOULD);
@@ -1498,6 +1503,7 @@ public class TestBlockJoin extends Lucen
// Create a filter that defines "parent" documents in the index - in this case resumes
BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("isparent", "yes")));
+ CheckJoinIndex.check(r, parentsFilter);
Query parentQuery = new TermQuery(new Term("parent", "2"));
@@ -1628,4 +1634,5 @@ public class TestBlockJoin extends Lucen
r.close();
dir.close();
}
+
}
Modified: lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java?rev=1689637&r1=1689636&r2=1689637&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java (original)
+++ lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinSorting.java Tue Jul 7 12:18:44 2015
@@ -185,12 +185,6 @@ public class TestBlockJoinSorting extend
docs.add(document);
w.addDocuments(docs);
- // This doc will not be included, because it doesn't have nested docs
- document = new Document();
- document.add(new StringField("__type", "parent", Field.Store.NO));
- document.add(new StringField("field1", "h", Field.Store.NO));
- w.addDocument(document);
-
docs.clear();
document = new Document();
document.add(new StringField("field2", "m", Field.Store.NO));
@@ -214,20 +208,10 @@ public class TestBlockJoinSorting extend
w.addDocuments(docs);
w.commit();
- // Some garbage docs, just to check if the NestedFieldComparator can deal with this.
- document = new Document();
- document.add(new StringField("fieldXXX", "x", Field.Store.NO));
- w.addDocument(document);
- document = new Document();
- document.add(new StringField("fieldXXX", "x", Field.Store.NO));
- w.addDocument(document);
- document = new Document();
- document.add(new StringField("fieldXXX", "x", Field.Store.NO));
- w.addDocument(document);
-
IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(w.w, false));
w.close();
BitSetProducer parentFilter = new QueryBitSetProducer(new TermQuery(new Term("__type", "parent")));
+ CheckJoinIndex.check(searcher.getIndexReader(), parentFilter);
BitSetProducer childFilter = new QueryBitSetProducer(new PrefixQuery(new Term("field2")));
ToParentBlockJoinQuery query = new ToParentBlockJoinQuery(
new PrefixQuery(new Term("field2")),
@@ -281,7 +265,7 @@ public class TestBlockJoinSorting extend
topDocs = searcher.search(query, 5, sort);
assertEquals(topDocs.totalHits, 7);
assertEquals(5, topDocs.scoreDocs.length);
- assertEquals(28, topDocs.scoreDocs[0].doc);
+ assertEquals(27, topDocs.scoreDocs[0].doc);
assertEquals("o", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[0]).fields[0]).utf8ToString());
assertEquals(23, topDocs.scoreDocs[1].doc);
assertEquals("m", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[1]).fields[0]).utf8ToString());
@@ -308,7 +292,7 @@ public class TestBlockJoinSorting extend
assertEquals(5, topDocs.scoreDocs.length);
assertEquals(23, topDocs.scoreDocs[0].doc);
assertEquals("m", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[0]).fields[0]).utf8ToString());
- assertEquals(28, topDocs.scoreDocs[1].doc);
+ assertEquals(27, topDocs.scoreDocs[1].doc);
assertEquals("m", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[1]).fields[0]).utf8ToString());
assertEquals(11, topDocs.scoreDocs[2].doc);
assertEquals("g", ((BytesRef) ((FieldDoc) topDocs.scoreDocs[2]).fields[0]).utf8ToString());
Added: lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java?rev=1689637&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java (added)
+++ lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestCheckJoinIndex.java Tue Jul 7 12:18:44 2015
@@ -0,0 +1,143 @@
+package org.apache.lucene.search.join;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.NoMergePolicy;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.MatchNoDocsQuery;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestCheckJoinIndex extends LuceneTestCase {
+
+ public void testNoParent() throws IOException {
+ final Directory dir = newDirectory();
+ final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ final int numDocs = TestUtil.nextInt(random(), 1, 3);
+ for (int i = 0; i < numDocs; ++i) {
+ w.addDocument(new Document());
+ }
+ final IndexReader reader = w.getReader();
+ w.close();
+ BitSetProducer parentsFilter = new QueryBitSetProducer(new MatchNoDocsQuery());
+ try {
+ CheckJoinIndex.check(reader, parentsFilter);
+ fail("Invalid index");
+ } catch (IllegalStateException e) {
+ // expected
+ } finally {
+ reader.close();
+ dir.close();
+ }
+ }
+
+ public void testOrphans() throws IOException {
+ final Directory dir = newDirectory();
+ final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+ {
+ // Add a first valid block
+ List<Document> block = new ArrayList<>();
+ final int numChildren = TestUtil.nextInt(random(), 0, 3);
+ for (int i = 0; i < numChildren; ++i) {
+ block.add(new Document());
+ }
+ Document parent = new Document();
+ parent.add(new StringField("parent", "true", Store.NO));
+ block.add(parent);
+ w.addDocuments(block);
+ }
+
+ {
+ // Then a block with no parent
+ List<Document> block = new ArrayList<>();
+ final int numChildren = TestUtil.nextInt(random(), 1, 3);
+ for (int i = 0; i < numChildren; ++i) {
+ block.add(new Document());
+ }
+ w.addDocuments(block);
+ }
+
+ final IndexReader reader = w.getReader();
+ w.close();
+ BitSetProducer parentsFilter = new QueryBitSetProducer(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
+ try {
+ CheckJoinIndex.check(reader, parentsFilter);
+ fail("Invalid index");
+ } catch (IllegalStateException e) {
+ // expected
+ } finally {
+ reader.close();
+ dir.close();
+ }
+ }
+
+ public void testInconsistentDeletes() throws IOException {
+ final Directory dir = newDirectory();
+ final IndexWriterConfig iwc = newIndexWriterConfig();
+ iwc.setMergePolicy(NoMergePolicy.INSTANCE); // so that deletions don't trigger merges
+ final RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+ List<Document> block = new ArrayList<>();
+ final int numChildren = TestUtil.nextInt(random(), 0, 3);
+ for (int i = 0; i < numChildren; ++i) {
+ Document doc = new Document();
+ doc.add(new StringField("child", Integer.toString(i), Store.NO));
+ block.add(doc);
+ }
+ Document parent = new Document();
+ parent.add(new StringField("parent", "true", Store.NO));
+ block.add(parent);
+ w.addDocuments(block);
+
+ if (random().nextBoolean()) {
+ w.deleteDocuments(new Term("parent", "true"));
+ } else {
+ // delete any of the children
+ w.deleteDocuments(new Term("child", Integer.toString(random().nextInt(numChildren))));
+ }
+
+ final IndexReader reader = w.getReader();
+ w.close();
+
+ BitSetProducer parentsFilter = new QueryBitSetProducer(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
+ try {
+ CheckJoinIndex.check(reader, parentsFilter);
+ fail("Invalid index");
+ } catch (IllegalStateException e) {
+ // expected
+ } finally {
+ reader.close();
+ dir.close();
+ }
+ }
+
+}