Posted to commits@lucene.apache.org by rm...@apache.org on 2012/07/07 02:44:14 UTC
svn commit: r1358485 - in
/lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene:
codecs/FieldsConsumer.java codecs/TermsConsumer.java
index/FreqProxTermsWriterPerField.java
Author: rmuir
Date: Sat Jul 7 00:44:13 2012
New Revision: 1358485
URL: http://svn.apache.org/viewvc?rev=1358485&view=rev
Log:
LUCENE-4198: incrementally accumulate stats in FreqProxTermsWriter
Modified:
lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
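
The change below moves the per-field statistics (sumDocFreq, sumTotalTermFreq, and the set of visited docs) out of the flush loop and into the indexing path, so they are accumulated incrementally as each posting is added rather than recounted from the postings stream at flush time. A minimal standalone sketch of that pattern (not Lucene code; class and method names are illustrative):

    import java.util.BitSet;

    class IncrementalFieldStats {
      long sumDocFreq;        // number of (term, doc) pairings seen so far
      long sumTotalTermFreq;  // number of term occurrences seen so far
      final BitSet visitedDocs = new BitSet();  // docs with at least one posting

      // First occurrence of a term in a given doc: a new (term, doc) pairing.
      void newTermInDoc(int docID) {
        sumDocFreq++;
        sumTotalTermFreq++;
        visitedDocs.set(docID);
      }

      // Repeat occurrence of the same term in the same doc.
      void repeatTermInDoc() {
        sumTotalTermFreq++;
      }

      int docCount() {
        return visitedDocs.cardinality();
      }

      public static void main(String[] args) {
        IncrementalFieldStats stats = new IncrementalFieldStats();
        stats.newTermInDoc(0);    // e.g. "lucene" first seen in doc 0
        stats.repeatTermInDoc();  // "lucene" again in doc 0
        stats.newTermInDoc(3);    // "lucene" first seen in doc 3
        // prints: sumDocFreq=2 sumTotalTermFreq=3 docCount=2
        System.out.println("sumDocFreq=" + stats.sumDocFreq
            + " sumTotalTermFreq=" + stats.sumTotalTermFreq
            + " docCount=" + stats.docCount());
      }
    }

At flush time these totals are simply handed to TermsConsumer.finish(), as the last hunk of the FreqProxTermsWriterPerField diff shows.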
Modified: lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java?rev=1358485&r1=1358484&r2=1358485&view=diff
==============================================================================
--- lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java (original)
+++ lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java Sat Jul 7 00:44:13 2012
@@ -62,7 +62,7 @@ public abstract class FieldsConsumer imp
Terms terms = fieldsEnum.terms();
if (terms != null) {
final TermsConsumer termsConsumer = addField(mergeState.fieldInfo);
- termsConsumer.merge(mergeState, terms.iterator(null));
+ termsConsumer.merge(mergeState, terms);
}
}
}
Modified: lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java?rev=1358485&r1=1358484&r2=1358485&view=diff
==============================================================================
--- lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java (original)
+++ lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java Sat Jul 7 00:44:13 2012
@@ -23,6 +23,7 @@ import java.util.Comparator;
import org.apache.lucene.index.FieldInfo; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.MultiDocsEnum;
import org.apache.lucene.index.MultiDocsAndPositionsEnum;
@@ -72,8 +73,8 @@ public abstract class TermsConsumer {
private MappingMultiDocsAndPositionsEnum postingsEnum;
/** Default merge impl */
- public void merge(MergeState mergeState, TermsEnum termsEnum) throws IOException {
-
+ public void merge(MergeState mergeState, Terms terms) throws IOException {
+ TermsEnum termsEnum = terms.iterator(null);
BytesRef term;
assert termsEnum != null;
long sumTotalTermFreq = 0;
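
In the hunk above, merge() now receives the Terms instance rather than a pre-built TermsEnum and obtains the iterator itself; overrides get access to the full Terms, not just an iterator. A generic sketch of this "pass the source, not the iterator" migration (hypothetical types, not Lucene code):

    import java.util.Arrays;
    import java.util.Iterator;

    class MergeExample {
      // before: static void merge(Iterator<String> it) { ... }
      // after: take the source itself; the callee now owns iteration
      static void merge(Iterable<String> source) {
        Iterator<String> it = source.iterator();
        while (it.hasNext()) {
          System.out.println(it.next());
        }
      }

      public static void main(String[] args) {
        merge(Arrays.asList("a", "b"));  // caller no longer builds the iterator
      }
    }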
Modified: lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java?rev=1358485&r1=1358484&r2=1358485&view=diff
==============================================================================
--- lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (original)
+++ lucene/dev/branches/lucene4100/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java Sat Jul 7 00:44:13 2012
@@ -29,7 +29,7 @@ import org.apache.lucene.codecs.TermStat
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.RamUsageEstimator;
// TODO: break into separate freq and prox writers as
@@ -88,6 +88,9 @@ final class FreqProxTermsWriterPerField
// with or without term freqs:
setIndexOptions(fieldInfo.getIndexOptions());
payloadAttribute = null;
+ sumDocFreq = 0;
+ sumTotalTermFreq = 0;
+ visitedDocs = new OpenBitSet();
}
private void setIndexOptions(IndexOptions indexOptions) {
@@ -111,6 +114,10 @@ final class FreqProxTermsWriterPerField
return false;
}
+ long sumDocFreq;
+ long sumTotalTermFreq;
+ OpenBitSet visitedDocs = new OpenBitSet();
+
@Override
void start(IndexableField f) {
if (fieldState.attributeSource.hasAttribute(PayloadAttribute.class)) {
@@ -169,11 +176,16 @@ final class FreqProxTermsWriterPerField
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
postings.lastDocIDs[termID] = docState.docID;
+ postings.docFreqs[termID] = 1;
+ sumDocFreq++;
+ visitedDocs.set(docState.docID);
if (!hasFreq) {
postings.lastDocCodes[termID] = docState.docID;
} else {
postings.lastDocCodes[termID] = docState.docID << 1;
- postings.docFreqs[termID] = 1;
+ postings.freqs[termID] = 1;
+ postings.totalTermFreqs[termID] = 1;
+ sumTotalTermFreq++;
if (hasProx) {
writeProx(termID, fieldState.position);
if (hasOffsets) {
@@ -194,15 +206,18 @@ final class FreqProxTermsWriterPerField
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
- assert !hasFreq || postings.docFreqs[termID] > 0;
+ assert !hasFreq || postings.freqs[termID] > 0;
if (!hasFreq) {
- assert postings.docFreqs == null;
+ assert postings.freqs == null;
if (docState.docID != postings.lastDocIDs[termID]) {
assert docState.docID > postings.lastDocIDs[termID];
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
postings.lastDocIDs[termID] = docState.docID;
+ postings.docFreqs[termID]++;
+ sumDocFreq++;
+ visitedDocs.set(docState.docID);
fieldState.uniqueTermCount++;
}
} else if (docState.docID != postings.lastDocIDs[termID]) {
@@ -210,18 +225,23 @@ final class FreqProxTermsWriterPerField
// Term not yet seen in the current doc but previously
// seen in other doc(s) since the last flush
- // Now that we know doc freq for previous doc,
+ // Now that we know freq for previous doc,
// write it & lastDocCode
- if (1 == postings.docFreqs[termID]) {
+ if (1 == postings.freqs[termID]) {
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
} else {
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
- termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
+ termsHashPerField.writeVInt(0, postings.freqs[termID]);
}
- postings.docFreqs[termID] = 1;
+ postings.docFreqs[termID]++;
+ sumDocFreq++;
+ postings.freqs[termID] = 1;
+ postings.totalTermFreqs[termID]++;
+ sumTotalTermFreq++;
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;
+ visitedDocs.set(docState.docID);
if (hasProx) {
writeProx(termID, fieldState.position);
if (hasOffsets) {
@@ -233,7 +253,9 @@ final class FreqProxTermsWriterPerField
}
fieldState.uniqueTermCount++;
} else {
- fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
+ postings.totalTermFreqs[termID]++;
+ sumTotalTermFreq++;
+ fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.freqs[termID]);
if (hasProx) {
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
}
@@ -252,10 +274,12 @@ final class FreqProxTermsWriterPerField
public FreqProxPostingsArray(int size, boolean writeFreqs, boolean writeProx, boolean writeOffsets) {
super(size);
if (writeFreqs) {
- docFreqs = new int[size];
+ freqs = new int[size];
+ totalTermFreqs = new long[size];
}
lastDocIDs = new int[size];
lastDocCodes = new int[size];
+ docFreqs = new int[size];
if (writeProx) {
lastPositions = new int[size];
if (writeOffsets) {
@@ -267,15 +291,17 @@ final class FreqProxTermsWriterPerField
//System.out.println("PA init freqs=" + writeFreqs + " pos=" + writeProx + " offs=" + writeOffsets);
}
- int docFreqs[]; // # times this term occurs in the current doc
+ int freqs[]; // # times this term occurs in the current doc
int lastDocIDs[]; // Last docID where this term occurred
int lastDocCodes[]; // Code for prior doc
int lastPositions[]; // Last position where this term occurred
int lastOffsets[]; // Last endOffset where this term occurred
+ int docFreqs[]; // current docFreq for the term
+ long totalTermFreqs[]; // current totalTermFreq for the term
@Override
ParallelPostingsArray newInstance(int size) {
- return new FreqProxPostingsArray(size, docFreqs != null, lastPositions != null, lastOffsets != null);
+ return new FreqProxPostingsArray(size, freqs != null, lastPositions != null, lastOffsets != null);
}
@Override
@@ -287,6 +313,7 @@ final class FreqProxTermsWriterPerField
System.arraycopy(lastDocIDs, 0, to.lastDocIDs, 0, numToCopy);
System.arraycopy(lastDocCodes, 0, to.lastDocCodes, 0, numToCopy);
+ System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy);
if (lastPositions != null) {
assert to.lastPositions != null;
System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
@@ -295,23 +322,26 @@ final class FreqProxTermsWriterPerField
assert to.lastOffsets != null;
System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
}
- if (docFreqs != null) {
- assert to.docFreqs != null;
- System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy);
+ if (freqs != null) {
+ assert to.freqs != null;
+ System.arraycopy(freqs, 0, to.freqs, 0, numToCopy);
+ assert to.totalTermFreqs != null;
+ System.arraycopy(totalTermFreqs, 0, to.totalTermFreqs, 0, numToCopy);
}
}
@Override
int bytesPerPosting() {
- int bytes = ParallelPostingsArray.BYTES_PER_POSTING + 2 * RamUsageEstimator.NUM_BYTES_INT;
+ int bytes = ParallelPostingsArray.BYTES_PER_POSTING + 3 * RamUsageEstimator.NUM_BYTES_INT;
if (lastPositions != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
}
if (lastOffsets != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
}
- if (docFreqs != null) {
+ if (freqs != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
+ bytes += RamUsageEstimator.NUM_BYTES_LONG;
}
return bytes;
@@ -377,10 +407,6 @@ final class FreqProxTermsWriterPerField
final ByteSliceReader freq = new ByteSliceReader();
final ByteSliceReader prox = new ByteSliceReader();
- FixedBitSet visitedDocs = new FixedBitSet(state.segmentInfo.getDocCount());
- long sumTotalTermFreq = 0;
- long sumDocFreq = 0;
-
for (int i = 0; i < numTerms; i++) {
final int termID = termIDs[i];
//System.out.println("term=" + termID);
@@ -416,21 +442,21 @@ final class FreqProxTermsWriterPerField
// Now termStates has numToMerge FieldMergeStates
// which all share the same term. Now we must
// interleave the docID streams.
- int numDocs = 0;
- long totTF = 0;
+ final int docFreq = postings.docFreqs[termID];
+ final long totalTermFreq = writeTermFreq ? postings.totalTermFreqs[termID] : -1;
int docID = 0;
while(true) {
//System.out.println(" cycle");
- final int termDocFreq;
+ final int termFreq;
if (freq.eof()) {
if (postings.lastDocCodes[termID] != -1) {
// Return last doc
docID = postings.lastDocIDs[termID];
if (readTermFreq) {
- termDocFreq = postings.docFreqs[termID];
+ termFreq = postings.freqs[termID];
} else {
- termDocFreq = 0;
+ termFreq = 0;
}
postings.lastDocCodes[termID] = -1;
} else {
@@ -441,20 +467,19 @@ final class FreqProxTermsWriterPerField
final int code = freq.readVInt();
if (!readTermFreq) {
docID += code;
- termDocFreq = 0;
+ termFreq = 0;
} else {
docID += code >>> 1;
if ((code & 1) != 0) {
- termDocFreq = 1;
+ termFreq = 1;
} else {
- termDocFreq = freq.readVInt();
+ termFreq = freq.readVInt();
}
}
assert docID != postings.lastDocIDs[termID];
}
- numDocs++;
assert docID < state.segmentInfo.getDocCount(): "doc=" + docID + " maxDoc=" + state.segmentInfo.getDocCount();
// NOTE: we could check here if the docID was
@@ -468,8 +493,7 @@ final class FreqProxTermsWriterPerField
// passes, ie first sweep marks all del docs, and
// 2nd sweep does the real flush, but I suspect
// that'd add too much time to flush.
- visitedDocs.set(docID);
- postingsConsumer.startDoc(docID, termDocFreq);
+ postingsConsumer.startDoc(docID, termFreq);
if (docID < delDocLimit) {
// Mark it deleted. TODO: we could also skip
// writing its postings; this would be
@@ -484,8 +508,6 @@ final class FreqProxTermsWriterPerField
state.liveDocs.clear(docID);
}
}
-
- totTF += termDocFreq;
// Carefully copy over the prox + payload info,
// changing the format to match Lucene's segment
@@ -495,7 +517,7 @@ final class FreqProxTermsWriterPerField
// we did record positions (& maybe payload) and/or offsets
int position = 0;
int offset = 0;
- for(int j=0;j<termDocFreq;j++) {
+ for(int j=0;j<termFreq;j++) {
final BytesRef thisPayload;
if (readPositions) {
@@ -542,11 +564,9 @@ final class FreqProxTermsWriterPerField
}
postingsConsumer.finishDoc();
}
- termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
- sumTotalTermFreq += totTF;
- sumDocFreq += numDocs;
+ termsConsumer.finishTerm(text, new TermStats(docFreq, totalTermFreq));
}
- termsConsumer.finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
+ termsConsumer.finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, (int)visitedDocs.cardinality());
}
}
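
With the counters maintained at index time, the flush loop above no longer tallies numDocs/totTF per term: it reads the precomputed postings.docFreqs/totalTermFreqs for each term's TermStats and hands the already-accumulated field totals straight to finish(). A standalone sketch of that flush shape (illustrative names and values, not Lucene code):

    import java.util.BitSet;

    class FlushSketch {
      // filled incrementally during indexing (values illustrative)
      static int[] docFreqs = {2, 1};
      static long[] totalTermFreqs = {3, 1};
      static long sumDocFreq = 3;
      static long sumTotalTermFreq = 4;
      static BitSet visitedDocs = new BitSet();

      public static void main(String[] args) {
        visitedDocs.set(0);
        visitedDocs.set(3);
        for (int termID = 0; termID < docFreqs.length; termID++) {
          // per-term stats are read, not recounted, at flush time
          System.out.printf("term %d: docFreq=%d totalTermFreq=%d%n",
              termID, docFreqs[termID], totalTermFreqs[termID]);
        }
        // field totals were already accumulated; just pass them along
        System.out.printf("finish: sumTotalTermFreq=%d sumDocFreq=%d docCount=%d%n",
            sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
      }
    }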