You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2010/12/15 19:16:24 UTC
svn commit: r1049660 - in
/lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr:
request/SimpleFacets.java request/UnInvertedField.java search/DocSet.java
search/SolrIndexSearcher.java
Author: yonik
Date: Wed Dec 15 18:16:23 2010
New Revision: 1049660
URL: http://svn.apache.org/viewvc?rev=1049660&view=rev
Log:
convert solr to new bulk API
Modified:
lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/SimpleFacets.java
lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/UnInvertedField.java
lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/DocSet.java
lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java
Modified: lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/SimpleFacets.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/SimpleFacets.java?rev=1049660&r1=1049659&r2=1049660&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/SimpleFacets.java (original)
+++ lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/SimpleFacets.java Wed Dec 15 18:16:23 2010
@@ -631,7 +631,6 @@ public class SimpleFacets {
Fields fields = MultiFields.getFields(r);
Terms terms = fields==null ? null : fields.terms(field);
TermsEnum termsEnum = null;
- SolrIndexSearcher.DocsEnumState deState = null;
BytesRef term = null;
if (terms != null) {
termsEnum = terms.iterator();
@@ -652,10 +651,14 @@ public class SimpleFacets {
}
Term template = new Term(field);
- DocsEnum docsEnum = null;
CharArr spare = new CharArr();
if (docs.size() >= mincount) {
+ SolrIndexSearcher.DocsEnumState deState = new SolrIndexSearcher.DocsEnumState();
+ deState.deletedDocs = MultiFields.getDeletedDocs(r);
+ deState.termsEnum = termsEnum;
+ deState.bulkPostings = null;
+
while (term != null) {
if (startTermBytes != null && !term.startsWith(startTermBytes))
@@ -671,29 +674,19 @@ public class SimpleFacets {
if (df >= minDfFilterCache) {
// use the filter cache
- // TODO: need a term query that takes a BytesRef to handle binary terms
- spare.reset();
- ByteUtils.UTF8toUTF16(term, spare);
- Term t = template.createTerm(spare.toString());
-
- if (deState==null) {
- deState = new SolrIndexSearcher.DocsEnumState();
- deState.deletedDocs = MultiFields.getDeletedDocs(r);
- deState.termsEnum = termsEnum;
- deState.reuse = docsEnum;
- }
+ Term t = template.createTerm(new BytesRef(term));
c = searcher.numDocs(new TermQuery(t), docs, deState);
-
- docsEnum = deState.reuse;
} else {
// iterate over TermDocs to calculate the intersection
+ c=0;
+ final BulkPostingsEnum docsEnum = deState.bulkPostings = deState.termsEnum.bulkPostings(deState.bulkPostings, false, false);
+ /*** do per-seg
// TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this?
// TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
// TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet?
docsEnum = termsEnum.docs(null, docsEnum);
- c=0;
if (docsEnum instanceof MultiDocsEnum) {
MultiDocsEnum.EnumWithSlice[] subs = ((MultiDocsEnum)docsEnum).getSubs();
@@ -713,23 +706,32 @@ public class SimpleFacets {
}
}
}
- } else {
-
- // this should be the same bulk result object if sharing of the docsEnum succeeded
- DocsEnum.BulkReadResult bulk = docsEnum.getBulkResult();
+ } else
+ ***/
+ {
+ int docsLeft = df;
+ BulkPostingsEnum.BlockReader docDeltasReader = docsEnum.getDocDeltasReader();
+ int[] deltas = docDeltasReader.getBuffer();
+ int docPointer = docDeltasReader.offset();
+ int docPointerMax = docDeltasReader.end();
+ // assert docPointer < docPointerMax;
+ if (docPointerMax - docPointer > docsLeft) docPointerMax = docPointer + docsLeft;
+ docsLeft -= docPointerMax - docPointer;
+ int doc = 0;
for (;;) {
- int nDocs = docsEnum.read();
- if (nDocs == 0) break;
- int[] docArr = bulk.docs.ints; // this might be movable outside the loop, but perhaps not worth the risk.
- int end = bulk.docs.offset + nDocs;
- for (int i=bulk.docs.offset; i<end; i++) {
- if (fastForRandomSet.exists(docArr[i])) c++;
+ while (docPointer < docPointerMax) {
+ doc += deltas[docPointer++];
+ if (fastForRandomSet.exists(doc)) c++;
}
+
+ if (docsLeft <= 0) break;
+ docPointerMax = Math.min(docDeltasReader.fill(), docsLeft);
+ assert docPointerMax > 0;
+ docsLeft -= docPointerMax;
+ docPointer = 0; // offset() should always be 0 after fill
}
}
-
-
}
if (sortByCount) {
Modified: lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/UnInvertedField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/UnInvertedField.java?rev=1049660&r1=1049659&r2=1049660&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/UnInvertedField.java (original)
+++ lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/request/UnInvertedField.java Wed Dec 15 18:16:23 2010
@@ -17,14 +17,8 @@
package org.apache.solr.request;
+import org.apache.lucene.index.*;
import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.DocsAndPositionsEnum;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.PagedBytes;
@@ -230,7 +224,12 @@ public class UnInvertedField {
// values. This requires going over the field first to find the most
// frequent terms ahead of time.
- SolrIndexSearcher.DocsEnumState deState = null;
+ SolrIndexSearcher.DocsEnumState deState = new SolrIndexSearcher.DocsEnumState();
+ deState.deletedDocs = te.deletedDocs;
+ deState.termsEnum = te.tenum;
+ deState.bulkPostings = null;
+
+ final Bits deletes = deState.deletedDocs;
for (;;) {
BytesRef t = te.term();
@@ -253,13 +252,7 @@ public class UnInvertedField {
topTerm.termNum = termNum;
bigTerms.put(topTerm.termNum, topTerm);
- if (deState == null) {
- deState = new SolrIndexSearcher.DocsEnumState();
- deState.termsEnum = te.tenum;
- deState.reuse = te.docsEnum;
- }
DocSet set = searcher.getDocSet(new TermQuery(new Term(ti.field, topTerm.term)), deState);
- te.docsEnum = deState.reuse;
maxTermCounts[termNum] = set.size();
@@ -269,19 +262,23 @@ public class UnInvertedField {
termsInverted++;
- DocsEnum docsEnum = te.getDocsEnum();
-
- DocsEnum.BulkReadResult bulkResult = docsEnum.getBulkResult();
+ int docsLeft = df;
+ deState.bulkPostings = deState.termsEnum.bulkPostings(deState.bulkPostings, false, false);
+ final BulkPostingsEnum.BlockReader docDeltasReader = deState.bulkPostings.getDocDeltasReader();
+ final int[] deltas = docDeltasReader.getBuffer();
+ int docPointer = docDeltasReader.offset();
+ int docPointerMax = docDeltasReader.end();
+ // assert docPointer < docPointerMax;
+ if (docPointerMax - docPointer > docsLeft) docPointerMax = docPointer + docsLeft;
+ docsLeft -= docPointerMax - docPointer;
+ int doc = 0;
+ int nDocs = 0;
for(;;) {
- int n = docsEnum.read();
- if (n <= 0) break;
-
- maxTermCounts[termNum] += n;
-
- for (int i=0; i<n; i++) {
- termInstances++;
- int doc = bulkResult.docs.ints[i];
+ while (docPointer < docPointerMax) {
+ doc += deltas[docPointer++];
+ if (deletes != null && deletes.get(doc)) continue;
+ nDocs++;
// add 2 to the term number to make room for special reserved values:
// 0 (end term) and 1 (index into byte array follows)
int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
@@ -349,8 +346,15 @@ public class UnInvertedField {
}
+ if (docsLeft <= 0) break;
+ docPointerMax = Math.min(docDeltasReader.fill(), docsLeft);
+ assert docPointerMax > 0;
+ docsLeft -= docPointerMax;
+ docPointer = 0; // offset() should always be 0 after fill
}
+ termInstances += nDocs;
+ maxTermCounts[termNum] = nDocs;
te.next();
}
@@ -968,6 +972,11 @@ class NumberedTermsEnum extends TermsEnu
return tenum.getComparator();
}
+ @Override
+ public BulkPostingsEnum bulkPostings(BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException {
+ return tenum.bulkPostings(reuse, doFreqs, doPositions);
+ }
+
public DocsEnum getDocsEnum() throws IOException {
docsEnum = tenum.docs(deletedDocs, docsEnum);
return docsEnum;
Modified: lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/DocSet.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/DocSet.java?rev=1049660&r1=1049659&r2=1049660&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/DocSet.java (original)
+++ lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/DocSet.java Wed Dec 15 18:16:23 2010
@@ -148,6 +148,8 @@ public interface DocSet /* extends Colle
* methods will be invoked with.
*/
public Filter getTopFilter();
+
+ public static final DocSet EMPTY = new SortedIntDocSet(new int[0], 0);
}
/** A base class that may be usefull for implementing DocSets */
Modified: lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java?rev=1049660&r1=1049659&r2=1049660&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java (original)
+++ lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java Wed Dec 15 18:16:23 2010
@@ -586,8 +586,8 @@ public class SolrIndexSearcher extends I
}
}
- DocSet absAnswer = getDocSetNC(absQ, null, deState);
- DocSet answer = positive ? absAnswer : getPositiveDocSet(matchAllDocsQuery, deState).andNot(absAnswer);
+ DocSet absAnswer = getDocSetNC(deState);
+ DocSet answer = positive ? absAnswer : getPositiveDocSet(matchAllDocsQuery).andNot(absAnswer);
if (filterCache != null) {
// cache negative queries as positive
@@ -616,7 +616,7 @@ public class SolrIndexSearcher extends I
answer = filterCache.get(q);
if (answer!=null) return answer;
}
- answer = getDocSetNC(q,null,deState);
+ answer = getDocSetNC(deState);
if (filterCache != null) filterCache.put(q,answer);
return answer;
}
@@ -752,22 +752,21 @@ public class SolrIndexSearcher extends I
}
// query must be positive
- protected DocSet getDocSetNC(Query query, DocSet filter, DocsEnumState deState) throws IOException {
- if (filter != null) return getDocSetNC(query, filter, null);
-
+ protected DocSet getDocSetNC(DocsEnumState deState) throws IOException {
int smallSetSize = maxDoc()>>6;
int largestPossible = deState.termsEnum.docFreq();
int[] docs = new int[Math.min(smallSetSize, largestPossible)];
- int upto = 0;
- int bitsSet = 0;
+ int upto = 0; // number of docs in the array
+ int nDocs = 0; // number of docs in this set
OpenBitSet obs = null;
- DocsEnum docsEnum = deState.termsEnum.docs(deState.deletedDocs, deState.reuse);
- if (deState.reuse == null) {
- deState.reuse = docsEnum;
- }
+ deState.bulkPostings = deState.termsEnum.bulkPostings(deState.bulkPostings, false, false);
+ final Bits deleted = deState.deletedDocs;
+ int docsLeft = largestPossible;
+
+ /** TODO: do per seg
if (docsEnum instanceof MultiDocsEnum) {
MultiDocsEnum.EnumWithSlice[] subs = ((MultiDocsEnum)docsEnum).getSubs();
int numSubs = ((MultiDocsEnum)docsEnum).getNumSubs();
@@ -795,25 +794,44 @@ public class SolrIndexSearcher extends I
}
}
}
- } else {
- DocsEnum.BulkReadResult bulk = docsEnum.getBulkResult();
- for (;;) {
- int nDocs = docsEnum.read();
- if (nDocs == 0) break;
- int[] docArr = bulk.docs.ints;
- int end = bulk.docs.offset + nDocs;
+ } else
+ **/
+
+ {
+ BulkPostingsEnum.BlockReader docDeltasReader = deState.bulkPostings.getDocDeltasReader();
+ int[] deltas = docDeltasReader.getBuffer();
+ int docPointer = docDeltasReader.offset();
+ int docPointerMax = docDeltasReader.end();
+ // assert docPointer < docPointerMax;
+ if (docPointerMax - docPointer > docsLeft) docPointerMax = docPointer + docsLeft;
+ docsLeft -= docPointerMax - docPointer;
- if (upto + nDocs > docs.length) {
+ int doc = 0;
+
+ for (;;) {
+ // too big to fit in our temporary int array?
+ if (obs != null || nDocs + (docPointerMax - docPointer) > docs.length) {
if (obs == null) obs = new OpenBitSet(maxDoc());
- for (int i=bulk.docs.offset; i<end; i++) {
- obs.fastSet(docArr[i]);
+ while (docPointer < docPointerMax) {
+ doc += deltas[docPointer++];
+ if (deleted != null && deleted.get(doc)) continue;
+ obs.fastSet(doc);
+ nDocs++;
}
- bitsSet += nDocs;
} else {
- for (int i=bulk.docs.offset; i<end; i++) {
- docs[upto++] = docArr[i];
+ while (docPointer < docPointerMax) {
+ doc += deltas[docPointer++];
+ if (deleted != null && deleted.get(doc)) continue;
+ docs[upto++] = doc;
}
+ nDocs = upto;
}
+
+ if (docsLeft <= 0) break;
+ docPointerMax = Math.min(docDeltasReader.fill(), docsLeft);
+ assert docPointerMax > 0;
+ docsLeft -= docPointerMax;
+ docPointer = 0; // offset() should always be 0 after fill
}
}
@@ -821,8 +839,7 @@ public class SolrIndexSearcher extends I
for (int i=0; i<upto; i++) {
obs.fastSet(docs[i]);
}
- bitsSet += upto;
- return new BitDocSet(obs, bitsSet);
+ return new BitDocSet(obs, nDocs);
}
return new SortedIntDocSet(docs, upto);
@@ -830,11 +847,21 @@ public class SolrIndexSearcher extends I
// query must be positive
protected DocSet getDocSetNC(Query query, DocSet filter) throws IOException {
- DocSetCollector collector = new DocSetCollector(maxDoc()>>6, maxDoc());
-
if (filter==null) {
if (query instanceof TermQuery) {
Term t = ((TermQuery)query).getTerm();
+ DocsEnumState deState = new DocsEnumState();
+ Terms terms = MultiFields.getTerms(reader, t.field());
+ if (terms == null) return DocSet.EMPTY;
+ deState.termsEnum = terms.iterator();
+ if (deState.termsEnum.seek(t.bytes()) != TermsEnum.SeekStatus.FOUND) return DocSet.EMPTY;
+ deState.deletedDocs = MultiFields.getDeletedDocs(reader);
+ deState.bulkPostings = null;
+
+ return getDocSetNC(deState);
+
+ /** TODO: do per seg
+ Term t = ((TermQuery)query).getTerm();
SolrIndexReader[] readers = reader.getLeafReaders();
int[] offsets = reader.getLeafOffsets();
@@ -863,12 +890,14 @@ public class SolrIndexSearcher extends I
}
}
}
+ ***/
} else {
+ DocSetCollector collector = new DocSetCollector(maxDoc()>>6, maxDoc());
super.search(query,null,collector);
+ return collector.getDocSet();
}
- return collector.getDocSet();
-
} else {
+ DocSetCollector collector = new DocSetCollector(maxDoc()>>6, maxDoc());
Filter luceneFilter = filter.getTopFilter();
super.search(query, luceneFilter, collector);
return collector.getDocSet();
@@ -1653,7 +1682,7 @@ public class SolrIndexSearcher extends I
public static class DocsEnumState {
public TermsEnum termsEnum;
public Bits deletedDocs;
- public DocsEnum reuse;
+ public BulkPostingsEnum bulkPostings;
}
/**