You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mv...@apache.org on 2012/08/27 16:23:50 UTC
svn commit: r1377692 - in /lucene/dev/branches/branch_4x/lucene: CHANGES.txt
join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
join/src/test/org/apache/lucene/search/join/TestJoinUtil.java
Author: mvg
Date: Mon Aug 27 14:23:48 2012
New Revision: 1377692
URL: http://svn.apache.org/viewvc?rev=1377692&view=rev
Log:
LUCENE-4224: Add in-order scorer to query time joining and the out-of-order scorer throws an UOE
Modified:
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
lucene/dev/branches/branch_4x/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1377692&r1=1377691&r2=1377692&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Aug 27 14:23:48 2012
@@ -110,6 +110,9 @@ Bug Fixes
containing non-BMP Unicode characters. (Dawid Weiss, Robert Muir,
Mike McCandless)
+* LUCENE-4224: Add in-order scorer to query time joining; advance() on the
+ out-of-order scorer now throws an UnsupportedOperationException (UOE). (Martijn van Groningen, Robert Muir)
+
Optimizations
* LUCENE-4317: Improve reuse of internal TokenStreams and StringReader
Modified: lucene/dev/branches/branch_4x/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java?rev=1377692&r1=1377691&r2=1377692&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java (original)
+++ lucene/dev/branches/branch_4x/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java Mon Aug 27 14:23:48 2012
@@ -99,9 +99,9 @@ class TermsIncludingScoreQuery extends Q
private TermsEnum segmentTermsEnum;
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
- SVInnerScorer scorer = (SVInnerScorer) scorer(context, true, false, context.reader().getLiveDocs());
+ SVInnerScorer scorer = (SVInnerScorer) scorer(context, false, false, context.reader().getLiveDocs());
if (scorer != null) {
- if (scorer.advance(doc) == doc) {
+ if (scorer.advanceForExplainOnly(doc) == doc) {
return scorer.explain();
}
}
@@ -127,7 +127,13 @@ class TermsIncludingScoreQuery extends Q
}
segmentTermsEnum = terms.iterator(segmentTermsEnum);
- if (multipleValuesPerDocument) {
+ if (scoreDocsInOrder) {
+ if (multipleValuesPerDocument) {
+ return new MVInOrderScorer(this, acceptDocs, segmentTermsEnum, context.reader().maxDoc());
+ } else {
+ return new SVInOrderScorer(this, acceptDocs, segmentTermsEnum, context.reader().maxDoc());
+ }
+ } else if (multipleValuesPerDocument) {
return new MVInnerScorer(this, acceptDocs, segmentTermsEnum, context.reader().maxDoc());
} else {
return new SVInnerScorer(this, acceptDocs, segmentTermsEnum);
@@ -182,8 +188,7 @@ class TermsIncludingScoreQuery extends Q
}
scoreUpto = upto;
- TermsEnum.SeekStatus status = termsEnum.seekCeil(terms.get(ords[upto++], spare), true);
- if (status == TermsEnum.SeekStatus.FOUND) {
+ if (termsEnum.seekExact(terms.get(ords[upto++], spare), true)) {
docsEnum = reuse = termsEnum.docs(acceptDocs, reuse, 0);
}
} while (docsEnum == null);
@@ -192,6 +197,10 @@ class TermsIncludingScoreQuery extends Q
}
public int advance(int target) throws IOException {
+ throw new UnsupportedOperationException("advance() isn't supported because doc ids are emitted out of order");
+ }
+
+ private int advanceForExplainOnly(int target) throws IOException {
int docId;
do {
docId = nextDoc();
@@ -251,8 +260,7 @@ class TermsIncludingScoreQuery extends Q
}
scoreUpto = upto;
- TermsEnum.SeekStatus status = termsEnum.seekCeil(terms.get(ords[upto++], spare), true);
- if (status == TermsEnum.SeekStatus.FOUND) {
+ if (termsEnum.seekExact(terms.get(ords[upto++], spare), true)) {
docsEnum = reuse = termsEnum.docs(acceptDocs, reuse, 0);
}
} while (docsEnum == null);
@@ -274,4 +282,89 @@ class TermsIncludingScoreQuery extends Q
}
}
+ class SVInOrderScorer extends Scorer {
+
+ final DocIdSetIterator matchingDocsIterator;
+ final float[] scores;
+
+ int currentDoc = -1;
+
+ SVInOrderScorer(Weight weight, Bits acceptDocs, TermsEnum termsEnum, int maxDoc) throws IOException {
+ super(weight);
+ FixedBitSet matchingDocs = new FixedBitSet(maxDoc);
+ this.scores = new float[maxDoc];
+ fillDocsAndScores(matchingDocs, acceptDocs, termsEnum);
+ this.matchingDocsIterator = matchingDocs.iterator();
+ }
+
+ protected void fillDocsAndScores(FixedBitSet matchingDocs, Bits acceptDocs, TermsEnum termsEnum) throws IOException {
+ BytesRef spare = new BytesRef();
+ DocsEnum docsEnum = null;
+ for (int i = 0; i < terms.size(); i++) {
+ if (termsEnum.seekExact(terms.get(ords[i], spare), true)) {
+ docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0);
+ float score = TermsIncludingScoreQuery.this.scores[ords[i]];
+ for (int doc = docsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.nextDoc()) {
+ matchingDocs.set(doc);
+ // In the case the same doc is also related to another doc, a score might be overwritten. I think this
+ // can only happen in a many-to-many relation
+ scores[doc] = score;
+ }
+ }
+ }
+ }
+
+ public float score() throws IOException {
+ return scores[currentDoc];
+ }
+
+ public float freq() throws IOException {
+ return 1;
+ }
+
+ public int docID() {
+ return currentDoc;
+ }
+
+ public int nextDoc() throws IOException {
+ return currentDoc = matchingDocsIterator.nextDoc();
+ }
+
+ public int advance(int target) throws IOException {
+ return currentDoc = matchingDocsIterator.advance(target);
+ }
+ }
+
+ // This scorer deals with the fact that a document can have more than one score from multiple related documents.
+ class MVInOrderScorer extends SVInOrderScorer {
+
+ MVInOrderScorer(Weight weight, Bits acceptDocs, TermsEnum termsEnum, int maxDoc) throws IOException {
+ super(weight, acceptDocs, termsEnum, maxDoc);
+ }
+
+ @Override
+ protected void fillDocsAndScores(FixedBitSet matchingDocs, Bits acceptDocs, TermsEnum termsEnum) throws IOException {
+ BytesRef spare = new BytesRef();
+ DocsEnum docsEnum = null;
+ for (int i = 0; i < terms.size(); i++) {
+ if (termsEnum.seekExact(terms.get(ords[i], spare), true)) {
+ docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0);
+ float score = TermsIncludingScoreQuery.this.scores[ords[i]];
+ for (int doc = docsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.nextDoc()) {
+ // I prefer this:
+ /*if (scores[doc] < score) {
+ scores[doc] = score;
+ matchingDocs.set(doc);
+ }*/
+ // But this behaves the same as MVInnerScorer and only then the tests will pass:
+ if (!matchingDocs.get(doc)) {
+ scores[doc] = score;
+ matchingDocs.set(doc);
+ }
+ }
+ }
+ }
+ }
+ }
+
}
Modified: lucene/dev/branches/branch_4x/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java?rev=1377692&r1=1377691&r2=1377692&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java (original)
+++ lucene/dev/branches/branch_4x/lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java Mon Aug 27 14:23:48 2012
@@ -22,13 +22,17 @@ import org.apache.lucene.analysis.MockTo
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
@@ -53,7 +57,6 @@ import org.junit.Test;
import java.io.IOException;
import java.util.*;
-@Slow
public class TestJoinUtil extends LuceneTestCase {
public void testSimple() throws Exception {
@@ -229,6 +232,7 @@ public class TestJoinUtil extends Lucene
}
@Test
+ @Slow
public void testSingleValueRandomJoin() throws Exception {
int maxIndexIter = _TestUtil.nextInt(random(), 6, 12);
int maxSearchIter = _TestUtil.nextInt(random(), 13, 26);
@@ -236,6 +240,7 @@ public class TestJoinUtil extends Lucene
}
@Test
+ @Slow
// This test really takes more time, that is why the number of iterations are smaller.
public void testMultiValueRandomJoin() throws Exception {
int maxIndexIter = _TestUtil.nextInt(random(), 3, 6);
@@ -254,7 +259,8 @@ public class TestJoinUtil extends Lucene
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy())
);
- IndexIterationContext context = createContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument);
+ final boolean scoreDocsInOrder = TestJoinUtil.random().nextBoolean();
+ IndexIterationContext context = createContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument, scoreDocsInOrder);
IndexReader topLevelReader = w.getReader();
w.close();
@@ -310,7 +316,7 @@ public class TestJoinUtil extends Lucene
}
public boolean acceptsDocsOutOfOrder() {
- return topScoreDocCollector.acceptsDocsOutOfOrder();
+ return scoreDocsInOrder;
}
});
// Asserting bit set...
@@ -354,11 +360,11 @@ public class TestJoinUtil extends Lucene
}
}
- private IndexIterationContext createContext(int nDocs, RandomIndexWriter writer, boolean multipleValuesPerDocument) throws IOException {
- return createContext(nDocs, writer, writer, multipleValuesPerDocument);
+ private IndexIterationContext createContext(int nDocs, RandomIndexWriter writer, boolean multipleValuesPerDocument, boolean scoreDocsInOrder) throws IOException {
+ return createContext(nDocs, writer, writer, multipleValuesPerDocument, scoreDocsInOrder);
}
- private IndexIterationContext createContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, boolean multipleValuesPerDocument) throws IOException {
+ private IndexIterationContext createContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, boolean multipleValuesPerDocument, boolean scoreDocsInOrder) throws IOException {
IndexIterationContext context = new IndexIterationContext();
int numRandomValues = nDocs / 2;
context.randomUniqueValues = new String[numRandomValues];
@@ -541,55 +547,81 @@ public class TestJoinUtil extends Lucene
final Map<Integer, JoinScore> docToJoinScore = new HashMap<Integer, JoinScore>();
if (multipleValuesPerDocument) {
- toSearcher.search(new MatchAllDocsQuery(), new Collector() {
-
- private DocTermOrds docTermOrds;
- private TermsEnum docTermsEnum;
- private DocTermOrds.TermOrdsIterator reuse;
- private int docBase;
+ if (scoreDocsInOrder) {
+ AtomicReader slowCompositeReader = SlowCompositeReaderWrapper.wrap(toSearcher.getIndexReader());
+ Terms terms = slowCompositeReader.terms(toField);
+ if (terms != null) {
+ DocsEnum docsEnum = null;
+ TermsEnum termsEnum = null;
+ SortedSet<BytesRef> joinValues = new TreeSet<BytesRef>(BytesRef.getUTF8SortedAsUnicodeComparator());
+ joinValues.addAll(joinValueToJoinScores.keySet());
+ for (BytesRef joinValue : joinValues) {
+ termsEnum = terms.iterator(termsEnum);
+ if (termsEnum.seekExact(joinValue, true)) {
+ docsEnum = termsEnum.docs(slowCompositeReader.getLiveDocs(), docsEnum, 0);
+ JoinScore joinScore = joinValueToJoinScores.get(joinValue);
- public void collect(int doc) throws IOException {
- if (docTermOrds.isEmpty()) {
- return;
+ for (int doc = docsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.nextDoc()) {
+ // First encountered join value determines the score.
+ // Something to keep in mind for many-to-many relations.
+ if (!docToJoinScore.containsKey(doc)) {
+ docToJoinScore.put(doc, joinScore);
+ }
+ }
+ }
}
+ }
+ } else {
+ toSearcher.search(new MatchAllDocsQuery(), new Collector() {
- reuse = docTermOrds.lookup(doc, reuse);
- int[] buffer = new int[5];
+ private DocTermOrds docTermOrds;
+ private TermsEnum docTermsEnum;
+ private DocTermOrds.TermOrdsIterator reuse;
+ private int docBase;
- int chunk;
- do {
- chunk = reuse.read(buffer);
- if (chunk == 0) {
+ public void collect(int doc) throws IOException {
+ if (docTermOrds.isEmpty()) {
return;
}
- for (int idx = 0; idx < chunk; idx++) {
- int key = buffer[idx];
- docTermsEnum.seekExact((long) key);
- JoinScore joinScore = joinValueToJoinScores.get(docTermsEnum.term());
- if (joinScore == null) {
- continue;
+ reuse = docTermOrds.lookup(doc, reuse);
+ int[] buffer = new int[5];
+
+ int chunk;
+ do {
+ chunk = reuse.read(buffer);
+ if (chunk == 0) {
+ return;
}
- Integer basedDoc = docBase + doc;
- // First encountered join value determines the score.
- // Something to keep in mind for many-to-many relations.
- if (!docToJoinScore.containsKey(basedDoc)) {
- docToJoinScore.put(basedDoc, joinScore);
+
+ for (int idx = 0; idx < chunk; idx++) {
+ int key = buffer[idx];
+ docTermsEnum.seekExact((long) key);
+ JoinScore joinScore = joinValueToJoinScores.get(docTermsEnum.term());
+ if (joinScore == null) {
+ continue;
+ }
+ Integer basedDoc = docBase + doc;
+ // First encountered join value determines the score.
+ // Something to keep in mind for many-to-many relations.
+ if (!docToJoinScore.containsKey(basedDoc)) {
+ docToJoinScore.put(basedDoc, joinScore);
+ }
}
- }
- } while (chunk >= buffer.length);
- }
+ } while (chunk >= buffer.length);
+ }
- public void setNextReader(AtomicReaderContext context) throws IOException {
- docBase = context.docBase;
- docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), toField);
- docTermsEnum = docTermOrds.getOrdTermsEnum(context.reader());
- reuse = null;
- }
+ public void setNextReader(AtomicReaderContext context) throws IOException {
+ docBase = context.docBase;
+ docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), toField);
+ docTermsEnum = docTermOrds.getOrdTermsEnum(context.reader());
+ reuse = null;
+ }
- public boolean acceptsDocsOutOfOrder() {return false;}
- public void setScorer(Scorer scorer) {}
- });
+ public boolean acceptsDocsOutOfOrder() {return false;}
+ public void setScorer(Scorer scorer) {}
+ });
+ }
} else {
toSearcher.search(new MatchAllDocsQuery(), new Collector() {