You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2015/04/15 00:26:21 UTC
svn commit: r1673572 - in /lucene/dev/trunk/lucene: CHANGES.txt
core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
core/src/java/org/apache/lucene/search/PhraseQuery.java
Author: rmuir
Date: Tue Apr 14 22:26:21 2015
New Revision: 1673572
URL: http://svn.apache.org/r1673572
Log:
LUCENE-6421: defer reading of positions in MultiPhraseQuery until they are needed
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1673572&r1=1673571&r2=1673572&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Apr 14 22:26:21 2015
@@ -77,6 +77,9 @@ Optimizations
* LUCENE-6388: Optimize SpanNearQuery when payloads are not present.
(Robert Muir)
+* LUCENE-6421: Defer reading of positions in MultiPhraseQuery until
+ they are needed. (Robert Muir)
+
Bug Fixes
* LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java?rev=1673572&r1=1673571&r2=1673572&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java Tue Apr 14 22:26:21 2015
@@ -200,45 +200,30 @@ public class MultiPhraseQuery extends Qu
for (int pos=0; pos<postingsFreqs.length; pos++) {
Term[] terms = termArrays.get(pos);
-
- final PostingsEnum postingsEnum;
- int docFreq;
-
- if (terms.length > 1) {
- postingsEnum = new UnionPostingsEnum(liveDocs, context, terms, termContexts, termsEnum);
-
- // coarse -- this overcounts since a given doc can
- // have more than one term:
- docFreq = 0;
- for(int termIdx=0;termIdx<terms.length;termIdx++) {
- final Term term = terms[termIdx];
- TermState termState = termContexts.get(term).get(context.ord);
- if (termState == null) {
- // Term not in reader
- continue;
- }
- termsEnum.seekExact(term.bytes(), termState);
- docFreq += termsEnum.docFreq();
- }
-
- if (docFreq == 0) {
- // None of the terms are in this reader
- return null;
- }
- } else {
- final Term term = terms[0];
+ List<PostingsEnum> postings = new ArrayList<>();
+
+ for (Term term : terms) {
TermState termState = termContexts.get(term).get(context.ord);
if (termState == null) {
// Term not in reader
return null;
}
termsEnum.seekExact(term.bytes(), termState);
- postingsEnum = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
-
- docFreq = termsEnum.docFreq();
+ postings.add(termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS));
+ }
+
+ if (postings.isEmpty()) {
+ return null;
+ }
+
+ final PostingsEnum postingsEnum;
+ if (postings.size() == 1) {
+ postingsEnum = postings.get(0);
+ } else {
+ postingsEnum = new UnionPostingsEnum(postings);
}
- postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms);
+ postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, positions.get(pos).intValue(), terms);
}
// sort by increasing docFreq order
@@ -398,175 +383,164 @@ public class MultiPhraseQuery extends Qu
}
return true;
}
-}
+
+ /**
+ * Takes the logical union of multiple PostingsEnum iterators.
+ * <p>
+ * Note: positions are merged during freq()
+ */
+ static class UnionPostingsEnum extends PostingsEnum {
+ /** queue ordered by docid */
+ final DocsQueue docsQueue;
+ /** cost of this enum: sum of its subs */
+ final long cost;
+
+ /** queue ordered by position for current doc */
+ final PositionsQueue posQueue = new PositionsQueue();
+ /** current doc posQueue is working */
+ int posQueueDoc = -2;
+ /** list of subs (unordered) */
+ final PostingsEnum[] subs;
+
+ UnionPostingsEnum(Collection<PostingsEnum> subs) {
+ docsQueue = new DocsQueue(subs.size());
+ long cost = 0;
+ for (PostingsEnum sub : subs) {
+ docsQueue.add(sub);
+ cost += sub.cost();
+ }
+ this.cost = cost;
+ this.subs = subs.toArray(new PostingsEnum[subs.size()]);
+ }
-/**
- * Takes the logical union of multiple DocsEnum iterators.
- */
-
-// TODO: if ever we allow subclassing of the *PhraseScorer
-class UnionPostingsEnum extends PostingsEnum {
-
- private static final class DocsQueue extends PriorityQueue<PostingsEnum> {
- DocsQueue(List<PostingsEnum> postingsEnums) throws IOException {
- super(postingsEnums.size());
-
- Iterator<PostingsEnum> i = postingsEnums.iterator();
- while (i.hasNext()) {
- PostingsEnum postings = i.next();
- if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
- add(postings);
+ @Override
+ public int freq() throws IOException {
+ int doc = docID();
+ if (doc != posQueueDoc) {
+ posQueue.clear();
+ for (PostingsEnum sub : subs) {
+ if (sub.docID() == doc) {
+ int freq = sub.freq();
+ for (int i = 0; i < freq; i++) {
+ posQueue.add(sub.nextPosition());
+ }
+ }
}
+ posQueue.sort();
+ posQueueDoc = doc;
}
+ return posQueue.size();
}
@Override
- public final boolean lessThan(PostingsEnum a, PostingsEnum b) {
- return a.docID() < b.docID();
+ public int nextPosition() throws IOException {
+ return posQueue.next();
}
- }
- private static final class IntQueue {
- private int _arraySize = 16;
- private int _index = 0;
- private int _lastIndex = 0;
- private int[] _array = new int[_arraySize];
-
- final void add(int i) {
- if (_lastIndex == _arraySize)
- growArray();
-
- _array[_lastIndex++] = i;
+ @Override
+ public int docID() {
+ return docsQueue.top().docID();
}
- final int next() {
- return _array[_index++];
- }
+ @Override
+ public int nextDoc() throws IOException {
+ PostingsEnum top = docsQueue.top();
+ int doc = top.docID();
+
+ do {
+ top.nextDoc();
+ top = docsQueue.updateTop();
+ } while (top.docID() == doc);
- final void sort() {
- Arrays.sort(_array, _index, _lastIndex);
+ return top.docID();
}
- final void clear() {
- _index = 0;
- _lastIndex = 0;
- }
+ @Override
+ public int advance(int target) throws IOException {
+ PostingsEnum top = docsQueue.top();
+
+ do {
+ top.advance(target);
+ top = docsQueue.updateTop();
+ } while (top.docID() < target);
- final int size() {
- return (_lastIndex - _index);
+ return top.docID();
}
- private void growArray() {
- int[] newArray = new int[_arraySize * 2];
- System.arraycopy(_array, 0, newArray, 0, _arraySize);
- _array = newArray;
- _arraySize *= 2;
+ @Override
+ public long cost() {
+ return cost;
}
- }
-
- private int _doc = -1;
- private int _freq;
- private DocsQueue _queue;
- private IntQueue _posList;
- private long cost;
-
- public UnionPostingsEnum(Bits liveDocs, LeafReaderContext context, Term[] terms, Map<Term, TermContext> termContexts, TermsEnum termsEnum) throws IOException {
- List<PostingsEnum> postingsEnums = new LinkedList<>();
- for (int i = 0; i < terms.length; i++) {
- final Term term = terms[i];
- TermState termState = termContexts.get(term).get(context.ord);
- if (termState == null) {
- // Term doesn't exist in reader
- continue;
- }
- termsEnum.seekExact(term.bytes(), termState);
- PostingsEnum postings = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
- cost += postings.cost();
- postingsEnums.add(postings);
+
+ @Override
+ public int startOffset() throws IOException {
+ return -1; // offsets are unsupported
}
- _queue = new DocsQueue(postingsEnums);
- _posList = new IntQueue();
- }
-
- @Override
- public final int nextDoc() throws IOException {
- if (_queue.size() == 0) {
- return _doc = NO_MORE_DOCS;
+ @Override
+ public int endOffset() throws IOException {
+ return -1; // offsets are unsupported
}
- // TODO: move this init into positions(): if the search
- // doesn't need the positions for this doc then don't
- // waste CPU merging them:
- _posList.clear();
- _doc = _queue.top().docID();
-
- // merge sort all positions together
- PostingsEnum postings;
- do {
- postings = _queue.top();
-
- final int freq = postings.freq();
- for (int i = 0; i < freq; i++) {
- _posList.add(postings.nextPosition());
+ @Override
+ public BytesRef getPayload() throws IOException {
+ return null; // payloads are unsupported
+ }
+
+ /**
+ * disjunction of postings ordered by docid.
+ */
+ static class DocsQueue extends PriorityQueue<PostingsEnum> {
+ DocsQueue(int size) {
+ super(size);
}
- if (postings.nextDoc() != NO_MORE_DOCS) {
- _queue.updateTop();
- } else {
- _queue.pop();
+ @Override
+ public final boolean lessThan(PostingsEnum a, PostingsEnum b) {
+ return a.docID() < b.docID();
}
- } while (_queue.size() > 0 && _queue.top().docID() == _doc);
-
- _posList.sort();
- _freq = _posList.size();
+ }
+
+ /**
+ * queue of terms for a single document. its a sorted array of
+ * all the positions from all the postings
+ */
+ static class PositionsQueue {
+ private int arraySize = 16;
+ private int index = 0;
+ private int size = 0;
+ private int[] array = new int[arraySize];
+
+ void add(int i) {
+ if (size == arraySize)
+ growArray();
- return _doc;
- }
+ array[size++] = i;
+ }
- @Override
- public int nextPosition() {
- return _posList.next();
- }
+ int next() {
+ return array[index++];
+ }
- @Override
- public int startOffset() {
- return -1;
- }
+ void sort() {
+ Arrays.sort(array, index, size);
+ }
- @Override
- public int endOffset() {
- return -1;
- }
+ void clear() {
+ index = 0;
+ size = 0;
+ }
- @Override
- public BytesRef getPayload() {
- return null;
- }
+ int size() {
+ return size;
+ }
- @Override
- public final int advance(int target) throws IOException {
- while (_queue.top() != null && target > _queue.top().docID()) {
- PostingsEnum postings = _queue.pop();
- if (postings.advance(target) != NO_MORE_DOCS) {
- _queue.add(postings);
+ private void growArray() {
+ int[] newArray = new int[arraySize * 2];
+ System.arraycopy(array, 0, newArray, 0, arraySize);
+ array = newArray;
+ arraySize *= 2;
}
}
- return nextDoc();
- }
-
- @Override
- public final int freq() {
- return _freq;
- }
-
- @Override
- public final int docID() {
- return _doc;
- }
-
- @Override
- public long cost() {
- return cost;
}
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java?rev=1673572&r1=1673571&r2=1673572&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java Tue Apr 14 22:26:21 2015
@@ -174,14 +174,12 @@ public class PhraseQuery extends Query {
static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
final PostingsEnum postings;
- final int docFreq;
final int position;
final Term[] terms;
final int nTerms; // for faster comparisons
- public PostingsAndFreq(PostingsEnum postings, int docFreq, int position, Term... terms) {
+ public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) {
this.postings = postings;
- this.docFreq = docFreq;
this.position = position;
nTerms = terms==null ? 0 : terms.length;
if (nTerms>0) {
@@ -200,9 +198,6 @@ public class PhraseQuery extends Query {
@Override
public int compareTo(PostingsAndFreq other) {
- if (docFreq != other.docFreq) {
- return docFreq - other.docFreq;
- }
if (position != other.position) {
return position - other.position;
}
@@ -223,7 +218,6 @@ public class PhraseQuery extends Query {
public int hashCode() {
final int prime = 31;
int result = 1;
- result = prime * result + docFreq;
result = prime * result + position;
for (int i=0; i<nTerms; i++) {
result = prime * result + terms[i].hashCode();
@@ -237,7 +231,6 @@ public class PhraseQuery extends Query {
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
PostingsAndFreq other = (PostingsAndFreq) obj;
- if (docFreq != other.docFreq) return false;
if (position != other.position) return false;
if (terms == null) return other.terms == null;
return Arrays.equals(terms, other.terms);
@@ -313,7 +306,7 @@ public class PhraseQuery extends Query {
}
te.seekExact(t.bytes(), state);
PostingsEnum postingsEnum = te.postings(liveDocs, null, PostingsEnum.POSITIONS);
- postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i), t);
+ postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions.get(i), t);
}
// sort by increasing docFreq order