You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/10/14 17:55:58 UTC
svn commit: r1531949 [1/3] - in /lucene/dev/trunk/lucene: ./
codecs/src/java/org/apache/lucene/codecs/blockterms/
codecs/src/java/org/apache/lucene/codecs/bloom/
codecs/src/java/org/apache/lucene/codecs/memory/
codecs/src/java/org/apache/lucene/codecs/...
Author: mikemccand
Date: Mon Oct 14 15:55:57 2013
New Revision: 1531949
URL: http://svn.apache.org/r1531949
Log:
LUCENE-5268: cutover all postings formats to FieldsConsumer
Added:
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/Pulsing41PostingsFormat.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java (with props)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (with props)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java (with props)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java (with props)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java (with props)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java (with props)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java (with props)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java (with props)
Removed:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushFieldsConsumer.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPulsing41PostingsFormat.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSepPostingsFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/ramonly/RAMOnlyPostingsFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon Oct 14 15:55:57 2013
@@ -49,6 +49,11 @@ New Features
encode term metadata, and all dictionary implementations can now plug in any
PostingsBaseFormat. (Han Jiang, Mike McCandless)
+* LUCENE-5268: Full cutover of all postings formats to the "pull"
+ FieldsConsumer API, removing PushFieldsConsumer. Added new
+ PushPostingsWriterBase for single-pass push of docs/positions to the
+ postings format. (Mike McCandless)
+
Optimizations
* LUCENE-4848: Use Java 7 NIO2-FileChannel instead of RandomAccessFile
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java Mon Oct 14 15:55:57 2013
@@ -17,26 +17,29 @@ package org.apache.lucene.codecs.blockte
* limitations under the License.
*/
+import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.PushFieldsConsumer;
import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.BlockTermState;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
@@ -52,7 +55,7 @@ import org.apache.lucene.util.RamUsageEs
* @lucene.experimental
*/
-public class BlockTermsWriter extends PushFieldsConsumer {
+public class BlockTermsWriter extends FieldsConsumer implements Closeable {
final static String CODEC_NAME = "BLOCK_TERMS_DICT";
@@ -70,6 +73,7 @@ public class BlockTermsWriter extends Pu
final FieldInfos fieldInfos;
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
+ private final int maxDoc;
private static class FieldMetaData {
public final FieldInfo fieldInfo;
@@ -99,9 +103,9 @@ public class BlockTermsWriter extends Pu
public BlockTermsWriter(TermsIndexWriterBase termsIndexWriter,
SegmentWriteState state, PostingsWriterBase postingsWriter)
throws IOException {
- super(state);
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
this.termsIndexWriter = termsIndexWriter;
+ maxDoc = state.segmentInfo.getDocCount();
out = state.directory.createOutput(termsFileName, state.context);
boolean success = false;
try {
@@ -127,7 +131,43 @@ public class BlockTermsWriter extends Pu
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
+ public void write(Fields fields) throws IOException {
+
+ boolean success = false;
+ try {
+ for(String field : fields) {
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+
+ termsWriter.write(term, termsEnum);
+ }
+
+ termsWriter.finish();
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(this);
+ } else {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
+ }
+
+ private TermsWriter addField(FieldInfo field) throws IOException {
//System.out.println("\nBTW.addField seg=" + segment + " field=" + field.name);
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
@@ -135,7 +175,6 @@ public class BlockTermsWriter extends Pu
return new TermsWriter(fieldIndexWriter, field, postingsWriter);
}
- @Override
public void close() throws IOException {
try {
final long dirStart = out.getFilePointer();
@@ -169,12 +208,13 @@ public class BlockTermsWriter extends Pu
public BlockTermState state;
}
- class TermsWriter extends TermsConsumer {
+ class TermsWriter {
private final FieldInfo fieldInfo;
private final PostingsWriterBase postingsWriter;
private final long termsStartPointer;
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
+ private final FixedBitSet docsSeen;
long sumTotalTermFreq;
long sumDocFreq;
int docCount;
@@ -191,6 +231,7 @@ public class BlockTermsWriter extends Pu
{
this.fieldInfo = fieldInfo;
this.fieldIndexWriter = fieldIndexWriter;
+ this.docsSeen = new FixedBitSet(maxDoc);
pendingTerms = new TermEntry[32];
for(int i=0;i<pendingTerms.length;i++) {
pendingTerms[i] = new TermEntry();
@@ -200,21 +241,22 @@ public class BlockTermsWriter extends Pu
this.longsSize = postingsWriter.setField(fieldInfo);
}
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- //System.out.println("BTW: startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment);
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
private final BytesRef lastPrevTerm = new BytesRef();
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ void write(BytesRef text, TermsEnum termsEnum) throws IOException {
+
+ BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
+ if (state == null) {
+ // No docs for this term:
+ return;
+ }
+ sumDocFreq += state.docFreq;
+ sumTotalTermFreq += state.totalTermFreq;
- assert stats.docFreq > 0;
+ assert state.docFreq > 0;
//System.out.println("BTW: finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq);
+ TermStats stats = new TermStats(state.docFreq, state.totalTermFreq);
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
if (isIndexTerm) {
@@ -238,18 +280,14 @@ public class BlockTermsWriter extends Pu
}
final TermEntry te = pendingTerms[pendingCount];
te.term.copyBytes(text);
- te.state = postingsWriter.newTermState();
- te.state.docFreq = stats.docFreq;
- te.state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(te.state);
+ te.state = state;
pendingCount++;
numTerms++;
}
// Finishes all terms in this field
- @Override
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
+ void finish() throws IOException {
if (pendingCount > 0) {
flushBlock();
}
@@ -264,9 +302,9 @@ public class BlockTermsWriter extends Pu
fields.add(new FieldMetaData(fieldInfo,
numTerms,
termsStartPointer,
- sumTotalTermFreq,
+ fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 ? sumTotalTermFreq : -1,
sumDocFreq,
- docCount,
+ docsSeen.cardinality(),
longsSize));
}
}
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -28,15 +28,12 @@ import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
-import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.PushFieldsConsumer;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.codecs.bloom.FuzzySet.ContainsResult;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@@ -118,9 +115,7 @@ public final class BloomFilteringPosting
* "blm" file. This PostingsFormat delegates to a choice of delegate
* PostingsFormat for encoding all other postings data. This choice of
* constructor defaults to the {@link DefaultBloomFilterFactory} for
- * configuring per-field BloomFilters. Note that the
- * wrapped PostingsFormat must use a {@link PushFieldsConsumer}
- * for writing.
+ * configuring per-field BloomFilters.
*
* @param delegatePostingsFormat
* The PostingsFormat that records all the non-bloom filter data i.e.
@@ -144,11 +139,7 @@ public final class BloomFilteringPosting
+ " has been constructed without a choice of PostingsFormat");
}
FieldsConsumer fieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
- if (!(fieldsConsumer instanceof PushFieldsConsumer)) {
- throw new UnsupportedOperationException("Wrapped PostingsFormat must return a PushFieldsConsumer");
- }
- return new BloomFilteredFieldsConsumer(
- (PushFieldsConsumer) fieldsConsumer, state);
+ return new BloomFilteredFieldsConsumer(fieldsConsumer, state);
}
@Override
@@ -315,7 +306,7 @@ public final class BloomFilteringPosting
this.delegateTermsEnum = null;
}
- private final TermsEnum delegate() throws IOException {
+ private TermsEnum delegate() throws IOException {
if (delegateTermsEnum == null) {
/* pull the iterator only if we really need it -
* this can be a relativly heavy operation depending on the
@@ -327,12 +318,12 @@ public final class BloomFilteringPosting
}
@Override
- public final BytesRef next() throws IOException {
+ public BytesRef next() throws IOException {
return delegate().next();
}
@Override
- public final boolean seekExact(BytesRef text)
+ public boolean seekExact(BytesRef text)
throws IOException {
// The magical fail-fast speed up that is the entire point of all of
// this code - save a disk seek if there is a match on an in-memory
@@ -346,33 +337,33 @@ public final class BloomFilteringPosting
}
@Override
- public final SeekStatus seekCeil(BytesRef text)
+ public SeekStatus seekCeil(BytesRef text)
throws IOException {
return delegate().seekCeil(text);
}
@Override
- public final void seekExact(long ord) throws IOException {
+ public void seekExact(long ord) throws IOException {
delegate().seekExact(ord);
}
@Override
- public final BytesRef term() throws IOException {
+ public BytesRef term() throws IOException {
return delegate().term();
}
@Override
- public final long ord() throws IOException {
+ public long ord() throws IOException {
return delegate().ord();
}
@Override
- public final int docFreq() throws IOException {
+ public int docFreq() throws IOException {
return delegate().docFreq();
}
@Override
- public final long totalTermFreq() throws IOException {
+ public long totalTermFreq() throws IOException {
return delegate().totalTermFreq();
}
@@ -401,35 +392,60 @@ public final class BloomFilteringPosting
}
}
- class BloomFilteredFieldsConsumer extends PushFieldsConsumer {
- private PushFieldsConsumer delegateFieldsConsumer;
+ class BloomFilteredFieldsConsumer extends FieldsConsumer {
+ private FieldsConsumer delegateFieldsConsumer;
private Map<FieldInfo,FuzzySet> bloomFilters = new HashMap<FieldInfo,FuzzySet>();
private SegmentWriteState state;
- public BloomFilteredFieldsConsumer(PushFieldsConsumer fieldsConsumer,
+ public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
SegmentWriteState state) {
- super(state);
this.delegateFieldsConsumer = fieldsConsumer;
this.state = state;
}
-
+
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field);
- if (bloomFilter != null) {
- assert bloomFilters.containsKey(field) == false;
- bloomFilters.put(field, bloomFilter);
- return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field), bloomFilter);
- } else {
- // No, use the unfiltered fieldsConsumer - we are not interested in
- // recording any term Bitsets.
- return delegateFieldsConsumer.addField(field);
+ public void write(Fields fields) throws IOException {
+ try {
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
+ TermsEnum termsEnum = terms.iterator(null);
+
+ FuzzySet bloomFilter = null;
+
+ DocsEnum docsEnum = null;
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ if (bloomFilter == null) {
+ bloomFilter = bloomFilterFactory.getSetForField(state, fieldInfo);
+ if (bloomFilter == null) {
+ // Field not bloom'd
+ break;
+ }
+ assert bloomFilters.containsKey(field) == false;
+ bloomFilters.put(fieldInfo, bloomFilter);
+ }
+ // Make sure there's at least one doc for this term:
+ docsEnum = termsEnum.docs(null, docsEnum, 0);
+ if (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
+ bloomFilter.addValue(term);
+ }
+ }
+ }
+ } finally {
+ close();
}
+
+ delegateFieldsConsumer.write(fields);
}
-
- @Override
+
public void close() throws IOException {
- delegateFieldsConsumer.close();
// Now we are done accumulating values for these fields
List<Entry<FieldInfo,FuzzySet>> nonSaturatedBlooms = new ArrayList<Map.Entry<FieldInfo,FuzzySet>>();
@@ -475,37 +491,5 @@ public final class BloomFilteringPosting
}
rightSizedSet.serialize(bloomOutput);
}
-
- }
-
- class WrappedTermsConsumer extends TermsConsumer {
- private TermsConsumer delegateTermsConsumer;
- private FuzzySet bloomFilter;
-
- public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) {
- this.delegateTermsConsumer = termsConsumer;
- this.bloomFilter = bloomFilter;
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- return delegateTermsConsumer.startTerm(text);
- }
-
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
-
- // Record this term in our BloomFilter
- if (stats.docFreq > 0) {
- bloomFilter.addValue(text);
- }
- delegateTermsConsumer.finishTerm(text, stats);
- }
-
- @Override
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
- throws IOException {
- delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
- }
}
}
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -25,9 +25,8 @@ import org.apache.lucene.codecs.FieldsPr
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPulsing41PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPulsing41PostingsFormat.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPulsing41PostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPulsing41PostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -25,12 +25,9 @@ import org.apache.lucene.codecs.Postings
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
-import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
+import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java Mon Oct 14 15:55:57 2013
@@ -23,20 +23,21 @@ import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.PushFieldsConsumer;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
@@ -142,7 +143,7 @@ import org.apache.lucene.util.fst.Util;
* @lucene.experimental
*/
-public class FSTOrdTermsWriter extends PushFieldsConsumer {
+public class FSTOrdTermsWriter extends FieldsConsumer {
static final String TERMS_INDEX_EXTENSION = "tix";
static final String TERMS_BLOCK_EXTENSION = "tbk";
static final String TERMS_CODEC_NAME = "FST_ORD_TERMS_DICT";
@@ -152,17 +153,18 @@ public class FSTOrdTermsWriter extends P
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
+ final int maxDoc;
final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
IndexOutput blockOut = null;
IndexOutput indexOut = null;
public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
- super(state);
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION);
this.postingsWriter = postingsWriter;
this.fieldInfos = state.fieldInfos;
+ this.maxDoc = state.segmentInfo.getDocCount();
boolean success = false;
try {
@@ -180,11 +182,41 @@ public class FSTOrdTermsWriter extends P
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- return new TermsWriter(field);
+ public void write(Fields fields) throws IOException {
+ try {
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ TermsEnum termsEnum = terms.iterator(null);
+ TermsWriter termsWriter = new TermsWriter(fieldInfo);
+
+ long sumTotalTermFreq = 0;
+ long sumDocFreq = 0;
+ FixedBitSet docsSeen = new FixedBitSet(maxDoc);
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
+ if (termState != null) {
+ termsWriter.finishTerm(term, termState);
+ sumTotalTermFreq += termState.totalTermFreq;
+ sumDocFreq += termState.docFreq;
+ }
+ }
+
+ termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
+ }
+ } finally {
+ close();
+ }
}
- @Override
public void close() throws IOException {
IOException ioe = null;
try {
@@ -247,7 +279,7 @@ public class FSTOrdTermsWriter extends P
public RAMOutputStream metaBytesOut;
}
- final class TermsWriter extends TermsConsumer {
+ final class TermsWriter {
private final Builder<Long> builder;
private final PositiveIntOutputs outputs;
private final FieldInfo fieldInfo;
@@ -284,34 +316,23 @@ public class FSTOrdTermsWriter extends P
this.lastMetaBytesFP = 0;
}
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) {
bufferSkip();
}
// write term meta data into fst
final long longs[] = new long[longsSize];
- final long delta = stats.totalTermFreq - stats.docFreq;
- if (stats.totalTermFreq > 0) {
+ final long delta = state.totalTermFreq - state.docFreq;
+ if (state.totalTermFreq > 0) {
if (delta == 0) {
- statsOut.writeVInt(stats.docFreq<<1|1);
+ statsOut.writeVInt(state.docFreq<<1|1);
} else {
- statsOut.writeVInt(stats.docFreq<<1|0);
- statsOut.writeVLong(stats.totalTermFreq-stats.docFreq);
+ statsOut.writeVInt(state.docFreq<<1|0);
+ statsOut.writeVLong(state.totalTermFreq-state.docFreq);
}
} else {
- statsOut.writeVInt(stats.docFreq);
+ statsOut.writeVInt(state.docFreq);
}
- BlockTermState state = postingsWriter.newTermState();
- state.docFreq = stats.docFreq;
- state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(state);
postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true);
for (int i = 0; i < longsSize; i++) {
metaLongsOut.writeVLong(longs[i] - lastLongs[i]);
@@ -325,7 +346,6 @@ public class FSTOrdTermsWriter extends P
lastMetaBytesFP = metaBytesOut.getFilePointer();
}
- @Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
if (numTerms > 0) {
final FieldMetaData metadata = new FieldMetaData();
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -25,9 +25,8 @@ import org.apache.lucene.codecs.FieldsPr
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -25,12 +25,9 @@ import org.apache.lucene.codecs.Postings
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
-import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
+import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java Mon Oct 14 15:55:57 2013
@@ -25,7 +25,6 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.Outputs;
-import org.apache.lucene.util.LongsRef;
/**
* An FST {@link Outputs} implementation for
@@ -89,6 +88,11 @@ class FSTTermOutputs extends Outputs<FST
}
@Override
+ public String toString() {
+ return "FSTTermOutputs$TermData longs=" + Arrays.toString(longs) + " bytes=" + Arrays.toString(bytes) + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq;
+ }
+
+ @Override
public boolean equals(Object other_) {
if (other_ == this) {
return true;
@@ -221,6 +225,7 @@ class FSTTermOutputs extends Outputs<FST
@Override
public void write(TermData data, DataOutput out) throws IOException {
+ assert hasPos || data.totalTermFreq == -1;
int bit0 = allZero(data.longs) ? 0 : 1;
int bit1 = ((data.bytes == null || data.bytes.length == 0) ? 0 : 1) << 1;
int bit2 = ((data.docFreq == 0) ? 0 : 1) << 2;
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java Mon Oct 14 15:55:57 2013
@@ -23,20 +23,21 @@ import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.PushFieldsConsumer;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
@@ -119,7 +120,7 @@ import org.apache.lucene.util.fst.Util;
* @lucene.experimental
*/
-public class FSTTermsWriter extends PushFieldsConsumer {
+public class FSTTermsWriter extends FieldsConsumer {
static final String TERMS_EXTENSION = "tmp";
static final String TERMS_CODEC_NAME = "FST_TERMS_DICT";
public static final int TERMS_VERSION_START = 0;
@@ -128,15 +129,16 @@ public class FSTTermsWriter extends Push
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
final IndexOutput out;
+ final int maxDoc;
final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
- super(state);
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
this.postingsWriter = postingsWriter;
this.fieldInfos = state.fieldInfos;
this.out = state.directory.createOutput(termsFileName, state.context);
+ this.maxDoc = state.segmentInfo.getDocCount();
boolean success = false;
try {
@@ -149,19 +151,53 @@ public class FSTTermsWriter extends Push
}
}
}
+
private void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT);
}
+
private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
out.writeLong(dirStart);
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- return new TermsWriter(field);
+ public void write(Fields fields) throws IOException {
+ try {
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ TermsEnum termsEnum = terms.iterator(null);
+ TermsWriter termsWriter = termsWriter = new TermsWriter(fieldInfo);
+
+ long sumTotalTermFreq = 0;
+ long sumDocFreq = 0;
+ FixedBitSet docsSeen = new FixedBitSet(maxDoc);
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+
+ BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
+ if (termState != null) {
+ termsWriter.finishTerm(term, termState);
+ sumTotalTermFreq += termState.totalTermFreq;
+ sumDocFreq += termState.docFreq;
+ }
+ }
+
+ termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
+ }
+ } finally {
+ close();
+ }
}
- @Override
public void close() throws IOException {
IOException ioe = null;
try {
@@ -208,7 +244,7 @@ public class FSTTermsWriter extends Push
}
}
- final class TermsWriter extends TermsConsumer {
+ final class TermsWriter {
private final Builder<FSTTermOutputs.TermData> builder;
private final FSTTermOutputs outputs;
private final FieldInfo fieldInfo;
@@ -226,22 +262,13 @@ public class FSTTermsWriter extends Push
this.builder = new Builder<FSTTermOutputs.TermData>(FST.INPUT_TYPE.BYTE1, outputs);
}
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
// write term meta data into fst
- final BlockTermState state = postingsWriter.newTermState();
final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData();
meta.longs = new long[longsSize];
meta.bytes = null;
- meta.docFreq = state.docFreq = stats.docFreq;
- meta.totalTermFreq = state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(state);
+ meta.docFreq = state.docFreq;
+ meta.totalTermFreq = state.totalTermFreq;
postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true);
final int bytesSize = (int)metaWriter.getFilePointer();
if (bytesSize > 0) {
@@ -253,7 +280,6 @@ public class FSTTermsWriter extends Push
numTerms++;
}
- @Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
// save FST dict
if (numTerms > 0) {
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -17,6 +17,7 @@ package org.apache.lucene.codecs.memory;
* limitations under the License.
*/
+import java.io.Closeable;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
@@ -26,16 +27,14 @@ import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
-import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.PushFieldsConsumer;
import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@@ -49,6 +48,8 @@ import org.apache.lucene.store.RAMOutput
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.Builder;
@@ -104,7 +105,7 @@ public final class MemoryPostingsFormat
return "PostingsFormat(name=" + getName() + " doPackFST= " + doPackFST + ")";
}
- private final static class TermsWriter extends TermsConsumer {
+ private final static class TermsWriter {
private final IndexOutput out;
private final FieldInfo field;
private final Builder<BytesRef> builder;
@@ -121,7 +122,7 @@ public final class MemoryPostingsFormat
builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doPackFST, acceptableOverheadRatio, true, 15);
}
- private class PostingsWriter extends PostingsConsumer {
+ private class PostingsWriter {
private int lastDocID;
private int lastPos;
private int lastPayloadLen;
@@ -133,7 +134,6 @@ public final class MemoryPostingsFormat
int lastOffsetLength;
int lastOffset;
- @Override
public void startDoc(int docID, int termDocFreq) throws IOException {
//System.out.println(" startDoc docID=" + docID + " freq=" + termDocFreq);
final int delta = docID - lastDocID;
@@ -155,7 +155,6 @@ public final class MemoryPostingsFormat
lastOffset = 0;
}
- @Override
public void addPosition(int pos, BytesRef payload, int startOffset, int endOffset) throws IOException {
assert payload == null || field.hasPayloads();
@@ -200,10 +199,6 @@ public final class MemoryPostingsFormat
}
}
- @Override
- public void finishDoc() {
- }
-
public PostingsWriter reset() {
assert buffer.getFilePointer() == 0;
lastDocID = 0;
@@ -215,13 +210,7 @@ public final class MemoryPostingsFormat
}
}
- private final PostingsWriter postingsWriter = new PostingsWriter();
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) {
- //System.out.println(" startTerm term=" + text.utf8ToString());
- return postingsWriter.reset();
- }
+ final PostingsWriter postingsWriter = new PostingsWriter();
private final RAMOutputStream buffer2 = new RAMOutputStream();
private final BytesRef spare = new BytesRef();
@@ -229,9 +218,11 @@ public final class MemoryPostingsFormat
private final IntsRef scratchIntsRef = new IntsRef();
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ private void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ if (stats.docFreq == 0) {
+ return;
+ }
assert postingsWriter.docCount == stats.docFreq;
assert buffer2.getFilePointer() == 0;
@@ -263,7 +254,6 @@ public final class MemoryPostingsFormat
termCount++;
}
- @Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
if (termCount > 0) {
out.writeVInt(termCount);
@@ -282,29 +272,146 @@ public final class MemoryPostingsFormat
private static String EXTENSION = "ram";
- @Override
- public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ private class MemoryFieldsConsumer extends FieldsConsumer implements Closeable {
+ private final SegmentWriteState state;
+ private final IndexOutput out;
- final String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION);
- final IndexOutput out = state.directory.createOutput(fileName, state.context);
-
- return new PushFieldsConsumer(state) {
- @Override
- public TermsConsumer addField(FieldInfo field) {
- //System.out.println("\naddField field=" + field.name);
- return new TermsWriter(out, field, doPackFST, acceptableOverheadRatio);
- }
+ private MemoryFieldsConsumer(SegmentWriteState state) throws IOException {
+ final String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION);
+ out = state.directory.createOutput(fileName, state.context);
+ this.state = state;
+ }
- @Override
- public void close() throws IOException {
- // EOF marker:
- try {
- out.writeVInt(0);
- } finally {
- out.close();
+ @Override
+ public void write(Fields fields) throws IOException {
+ boolean success = false;
+ try {
+ for(String field : fields) {
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
+ TermsWriter termsWriter = new TermsWriter(out, fieldInfo,
+ doPackFST, acceptableOverheadRatio);
+
+ FixedBitSet docsSeen = new FixedBitSet(state.segmentInfo.getDocCount());
+ long sumTotalTermFreq = 0;
+ long sumDocFreq = 0;
+ DocsEnum docsEnum = null;
+ DocsAndPositionsEnum posEnum = null;
+ int enumFlags;
+
+ IndexOptions indexOptions = fieldInfo.getIndexOptions();
+ boolean writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ boolean writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ boolean writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ boolean writePayloads = fieldInfo.hasPayloads();
+
+ if (writeFreqs == false) {
+ enumFlags = 0;
+ } else if (writePositions == false) {
+ enumFlags = DocsEnum.FLAG_FREQS;
+ } else if (writeOffsets == false) {
+ if (writePayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+ } else {
+ enumFlags = 0;
+ }
+ } else {
+ if (writePayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS;
+ } else {
+ enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
+ }
+ }
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ termsWriter.postingsWriter.reset();
+
+ if (writePositions) {
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ docsEnum = posEnum;
+ } else {
+ docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+ posEnum = null;
+ }
+
+ int docFreq = 0;
+ long totalTermFreq = 0;
+ while (true) {
+ int docID = docsEnum.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ docsSeen.set(docID);
+ docFreq++;
+
+ int freq;
+ if (writeFreqs) {
+ freq = docsEnum.freq();
+ totalTermFreq += freq;
+ } else {
+ freq = -1;
+ }
+
+ termsWriter.postingsWriter.startDoc(docID, freq);
+ if (writePositions) {
+ for (int i=0;i<freq;i++) {
+ int pos = posEnum.nextPosition();
+ BytesRef payload = writePayloads ? posEnum.getPayload() : null;
+ int startOffset;
+ int endOffset;
+ if (writeOffsets) {
+ startOffset = posEnum.startOffset();
+ endOffset = posEnum.endOffset();
+ } else {
+ startOffset = -1;
+ endOffset = -1;
+ }
+ termsWriter.postingsWriter.addPosition(pos, payload, startOffset, endOffset);
+ }
+ }
+ }
+ termsWriter.finishTerm(term, new TermStats(docFreq, totalTermFreq));
+ sumDocFreq += docFreq;
+ sumTotalTermFreq += totalTermFreq;
+ }
+
+ termsWriter.finish(sumTotalTermFreq, sumDocFreq, docsSeen.cardinality());
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(this);
+ } else {
+ IOUtils.closeWhileHandlingException(this);
}
}
- };
+ }
+
+ @Override
+ public void close() throws IOException {
+ // EOF marker:
+ try {
+ out.writeVInt(0);
+ } finally {
+ out.close();
+ }
+ }
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ return new MemoryFieldsConsumer(state);
}
private final static class FSTDocsEnum extends DocsEnum {
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/Pulsing41PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/Pulsing41PostingsFormat.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/Pulsing41PostingsFormat.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/Pulsing41PostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,45 @@
+package org.apache.lucene.codecs.pulsing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.BlockTreeTermsWriter;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs
+
+/**
+ * Concrete pulsing implementation over {@link Lucene41PostingsFormat}.
+ *
+ * @lucene.experimental
+ */
+public class Pulsing41PostingsFormat extends PulsingPostingsFormat {
+
+ /** Inlines docFreq=1 terms, otherwise uses the normal "Lucene41" format. */
+ public Pulsing41PostingsFormat() {
+ this(1);
+ }
+
+ /** Inlines docFreq=<code>freqCutoff</code> terms, otherwise uses the normal "Lucene41" format. */
+ public Pulsing41PostingsFormat(int freqCutoff) {
+ this(freqCutoff, BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
+ }
+
+ /** Inlines docFreq=<code>freqCutoff</code> terms, otherwise uses the normal "Lucene41" format. */
+ public Pulsing41PostingsFormat(int freqCutoff, int minBlockSize, int maxBlockSize) {
+ super("Pulsing41", new Lucene41PostingsBaseFormat(), freqCutoff, minBlockSize, maxBlockSize);
+ }
+}
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,119 @@
+package org.apache.lucene.codecs.pulsing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.BlockTreeTermsReader;
+import org.apache.lucene.codecs.BlockTreeTermsWriter;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsBaseFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
+
+/** This postings format "inlines" the postings for terms that have
+ * low docFreq. It wraps another postings format, which is used for
+ * writing the non-inlined terms.
+ *
+ * @lucene.experimental */
+
+public abstract class PulsingPostingsFormat extends PostingsFormat {
+
+ private final int freqCutoff;
+ private final int minBlockSize;
+ private final int maxBlockSize;
+ private final PostingsBaseFormat wrappedPostingsBaseFormat;
+
+ public PulsingPostingsFormat(String name, PostingsBaseFormat wrappedPostingsBaseFormat, int freqCutoff) {
+ this(name, wrappedPostingsBaseFormat, freqCutoff, BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
+ }
+
+ /** Terms with freq <= freqCutoff are inlined into terms
+ * dict. */
+ public PulsingPostingsFormat(String name, PostingsBaseFormat wrappedPostingsBaseFormat, int freqCutoff, int minBlockSize, int maxBlockSize) {
+ super(name);
+ this.freqCutoff = freqCutoff;
+ this.minBlockSize = minBlockSize;
+ assert minBlockSize > 1;
+ this.maxBlockSize = maxBlockSize;
+ this.wrappedPostingsBaseFormat = wrappedPostingsBaseFormat;
+ }
+
+ @Override
+ public String toString() {
+ return getName() + "(freqCutoff=" + freqCutoff + " minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")";
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ PostingsWriterBase docsWriter = null;
+
+ // Terms that have <= freqCutoff number of docs are
+ // "pulsed" (inlined):
+ PostingsWriterBase pulsingWriter = null;
+
+ // Terms dict
+ boolean success = false;
+ try {
+ docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
+
+ // Terms that have <= freqCutoff number of docs are
+ // "pulsed" (inlined):
+ pulsingWriter = new PulsingPostingsWriter(state, freqCutoff, docsWriter);
+ FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, minBlockSize, maxBlockSize);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(docsWriter, pulsingWriter);
+ }
+ }
+ }
+
+ @Override
+ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+ PostingsReaderBase docsReader = null;
+ PostingsReaderBase pulsingReader = null;
+
+ boolean success = false;
+ try {
+ docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
+ pulsingReader = new PulsingPostingsReader(state, docsReader);
+ FieldsProducer ret = new BlockTreeTermsReader(
+ state.directory, state.fieldInfos, state.segmentInfo,
+ pulsingReader,
+ state.context,
+ state.segmentSuffix);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(docsReader, pulsingReader);
+ }
+ }
+ }
+
+ public int getFreqCutoff() {
+ return freqCutoff;
+ }
+}
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,378 @@
+package org.apache.lucene.codecs.pulsing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+
+// TODO: we now inline based on total TF of the term,
+// but it might be better to inline by "net bytes used"
+// so that a term that has only 1 posting but a huge
+// payload would not be inlined. Though this is
+// presumably rare in practice...
+
+/**
+ * Writer for the pulsing format.
+ * <p>
+ * Wraps another postings implementation and decides
+ * (based on total number of occurrences), whether a terms
+ * postings should be inlined into the term dictionary,
+ * or passed through to the wrapped writer.
+ *
+ * @lucene.experimental */
+public final class PulsingPostingsWriter extends PostingsWriterBase {
+
+ final static String CODEC = "PulsedPostingsWriter";
+
+ // recording field summary
+ final static String SUMMARY_EXTENSION = "smy";
+
+ // To add a new version, increment from the last one, and
+ // change VERSION_CURRENT to point to your new version:
+ final static int VERSION_START = 0;
+
+ final static int VERSION_META_ARRAY = 1;
+
+ final static int VERSION_CURRENT = VERSION_META_ARRAY;
+
+ private SegmentWriteState segmentState;
+
+ private List<FieldMetaData> fields;
+
+ // Reused by writeTerm:
+ private DocsEnum docsEnum;
+ private DocsAndPositionsEnum posEnum;
+ private int enumFlags;
+
+ private final RAMOutputStream buffer = new RAMOutputStream();
+
+ private IndexOptions indexOptions;
+
+ // information for wrapped PF, in current field
+ private int longsSize;
+ private long[] longs;
+ private boolean fieldHasFreqs;
+ private boolean fieldHasPositions;
+ private boolean fieldHasOffsets;
+ private boolean fieldHasPayloads;
+ boolean absolute;
+
+ private static class PulsingTermState extends BlockTermState {
+ private byte[] bytes;
+ private BlockTermState wrappedState;
+
+ @Override
+ public String toString() {
+ if (bytes != null) {
+ return "inlined";
+ } else {
+ return "not inlined wrapped=" + wrappedState;
+ }
+ }
+ }
+
+ private static final class FieldMetaData {
+ int fieldNumber;
+ int longsSize;
+ FieldMetaData(int number, int size) {
+ fieldNumber = number;
+ longsSize = size;
+ }
+ }
+
+ // TODO: -- lazy init this? ie, if every single term
+ // was inlined (eg for a "primary key" field) then we
+ // never need to use this fallback? Fallback writer for
+ // non-inlined terms:
+ final PostingsWriterBase wrappedPostingsWriter;
+
+ final int maxPositions;
+
+ /** If the total number of positions (summed across all docs
+ * for this term) is <= maxPositions, then the postings are
+ * inlined into terms dict */
+ public PulsingPostingsWriter(SegmentWriteState state, int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
+ fields = new ArrayList<FieldMetaData>();
+ this.maxPositions = maxPositions;
+ // We simply wrap another postings writer, but only call
+ // on it when tot positions is >= the cutoff:
+ this.wrappedPostingsWriter = wrappedPostingsWriter;
+ this.segmentState = state;
+ }
+
+ @Override
+ public void init(IndexOutput termsOut) throws IOException {
+ CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+ termsOut.writeVInt(maxPositions); // encode maxPositions in header
+ wrappedPostingsWriter.init(termsOut);
+ }
+
+ @Override
+ public BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
+
+ // First pass: figure out whether we should pulse this term
+ long posCount = 0;
+
+ if (fieldHasPositions == false) {
+ // No positions:
+ docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+ assert docsEnum != null;
+ while (posCount <= maxPositions) {
+ if (docsEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ posCount++;
+ }
+ } else {
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ assert posEnum != null;
+ while (posCount <= maxPositions) {
+ if (posEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ posCount += posEnum.freq();
+ }
+ }
+
+ if (posCount == 0) {
+ // All docs were deleted
+ return null;
+ }
+
+ // Second pass: write postings
+ if (posCount > maxPositions) {
+ // Too many positions; do not pulse. Just lset
+ // wrapped postingsWriter encode the postings:
+
+ PulsingTermState state = new PulsingTermState();
+ state.wrappedState = wrappedPostingsWriter.writeTerm(term, termsEnum, docsSeen);
+ state.docFreq = state.wrappedState.docFreq;
+ state.totalTermFreq = state.wrappedState.totalTermFreq;
+ return state;
+ } else {
+ // Pulsed:
+ if (fieldHasPositions == false) {
+ docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+ } else {
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ docsEnum = posEnum;
+ }
+ assert docsEnum != null;
+
+ // There were few enough total occurrences for this
+ // term, so we fully inline our postings data into
+ // terms dict, now:
+
+ // TODO: it'd be better to share this encoding logic
+ // in some inner codec that knows how to write a
+ // single doc / single position, etc. This way if a
+ // given codec wants to store other interesting
+ // stuff, it could use this pulsing codec to do so
+
+ int lastDocID = 0;
+ int lastPayloadLength = -1;
+ int lastOffsetLength = -1;
+
+ int docFreq = 0;
+ long totalTermFreq = 0;
+ while (true) {
+ int docID = docsEnum.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ docsSeen.set(docID);
+
+ int delta = docID - lastDocID;
+ lastDocID = docID;
+
+ docFreq++;
+
+ if (fieldHasFreqs) {
+ int freq = docsEnum.freq();
+ totalTermFreq += freq;
+
+ if (freq == 1) {
+ buffer.writeVInt((delta << 1) | 1);
+ } else {
+ buffer.writeVInt(delta << 1);
+ buffer.writeVInt(freq);
+ }
+
+ if (fieldHasPositions) {
+ int lastPos = 0;
+ int lastOffset = 0;
+ for(int posIDX=0;posIDX<freq;posIDX++) {
+ int pos = posEnum.nextPosition();
+ int posDelta = pos - lastPos;
+ lastPos = pos;
+ int payloadLength;
+ BytesRef payload;
+ if (fieldHasPayloads) {
+ payload = posEnum.getPayload();
+ payloadLength = payload == null ? 0 : payload.length;
+ if (payloadLength != lastPayloadLength) {
+ buffer.writeVInt((posDelta << 1)|1);
+ buffer.writeVInt(payloadLength);
+ lastPayloadLength = payloadLength;
+ } else {
+ buffer.writeVInt(posDelta << 1);
+ }
+ } else {
+ payloadLength = 0;
+ payload = null;
+ buffer.writeVInt(posDelta);
+ }
+
+ if (fieldHasOffsets) {
+ int startOffset = posEnum.startOffset();
+ int endOffset = posEnum.endOffset();
+ int offsetDelta = startOffset - lastOffset;
+ int offsetLength = endOffset - startOffset;
+ if (offsetLength != lastOffsetLength) {
+ buffer.writeVInt(offsetDelta << 1 | 1);
+ buffer.writeVInt(offsetLength);
+ } else {
+ buffer.writeVInt(offsetDelta << 1);
+ }
+ lastOffset = startOffset;
+ lastOffsetLength = offsetLength;
+ }
+
+ if (payloadLength > 0) {
+ assert fieldHasPayloads;
+ assert payload != null;
+ buffer.writeBytes(payload.bytes, payload.offset, payload.length);
+ }
+ }
+ }
+ } else {
+ buffer.writeVInt(delta);
+ }
+ }
+
+ PulsingTermState state = new PulsingTermState();
+ state.bytes = new byte[(int) buffer.getFilePointer()];
+ state.docFreq = docFreq;
+ state.totalTermFreq = fieldHasFreqs ? totalTermFreq : -1;
+ buffer.writeTo(state.bytes, 0);
+ buffer.reset();
+ return state;
+ }
+ }
+
+ // TODO: -- should we NOT reuse across fields? would
+ // be cleaner
+
+ // Currently, this instance is re-used across fields, so
+ // our parent calls setField whenever the field changes
+ @Override
+ public int setField(FieldInfo fieldInfo) {
+ this.indexOptions = fieldInfo.getIndexOptions();
+ //if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
+ fieldHasPayloads = fieldInfo.hasPayloads();
+ absolute = false;
+ longsSize = wrappedPostingsWriter.setField(fieldInfo);
+ longs = new long[longsSize];
+ fields.add(new FieldMetaData(fieldInfo.number, longsSize));
+
+ fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+
+ if (fieldHasFreqs == false) {
+ enumFlags = 0;
+ } else if (fieldHasPositions == false) {
+ enumFlags = DocsEnum.FLAG_FREQS;
+ } else if (fieldHasOffsets == false) {
+ if (fieldHasPayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+ } else {
+ enumFlags = 0;
+ }
+ } else {
+ if (fieldHasPayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS;
+ } else {
+ enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
+ }
+ }
+ return 0;
+ //DEBUG = BlockTreeTermsWriter.DEBUG;
+ }
+
+ @Override
+ public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
+ PulsingTermState state = (PulsingTermState)_state;
+ assert empty.length == 0;
+ this.absolute = this.absolute || absolute;
+ if (state.bytes == null) {
+ wrappedPostingsWriter.encodeTerm(longs, buffer, fieldInfo, state.wrappedState, this.absolute);
+ for (int i = 0; i < longsSize; i++) {
+ out.writeVLong(longs[i]);
+ }
+ buffer.writeTo(out);
+ buffer.reset();
+ this.absolute = false;
+ } else {
+ out.writeVInt(state.bytes.length);
+ out.writeBytes(state.bytes, 0, state.bytes.length);
+ this.absolute = this.absolute || absolute;
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ wrappedPostingsWriter.close();
+ if (wrappedPostingsWriter instanceof PulsingPostingsWriter ||
+ VERSION_CURRENT < VERSION_META_ARRAY) {
+ return;
+ }
+ String summaryFileName = IndexFileNames.segmentFileName(segmentState.segmentInfo.name, segmentState.segmentSuffix, SUMMARY_EXTENSION);
+ IndexOutput out = null;
+ try {
+ out = segmentState.directory.createOutput(summaryFileName, segmentState.context);
+ CodecUtil.writeHeader(out, CODEC, VERSION_CURRENT);
+ out.writeVInt(fields.size());
+ for (FieldMetaData field : fields) {
+ out.writeVInt(field.fieldNumber);
+ out.writeVInt(field.longsSize);
+ }
+ out.close();
+ } finally {
+ IOUtils.closeWhileHandlingException(out);
+ }
+ }
+}
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java Mon Oct 14 15:55:57 2013
@@ -21,16 +21,15 @@ import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@@ -38,7 +37,7 @@ import org.apache.lucene.util.IOUtils;
* to .pyl, skip data to .skp
*
* @lucene.experimental */
-public final class SepPostingsWriter extends PostingsWriterBase {
+public final class SepPostingsWriter extends PushPostingsWriterBase {
final static String CODEC = "SepPostingsWriter";
final static String DOC_EXTENSION = "doc";
@@ -85,11 +84,8 @@ public final class SepPostingsWriter ext
final int totalNumDocs;
- boolean storePayloads;
IndexOptions indexOptions;
- FieldInfo fieldInfo;
-
int lastPayloadLength;
int lastPosition;
long payloadStart;
@@ -190,13 +186,12 @@ public final class SepPostingsWriter ext
// our parent calls setField whenever the field changes
@Override
public int setField(FieldInfo fieldInfo) {
- this.fieldInfo = fieldInfo;
+ super.setField(fieldInfo);
this.indexOptions = fieldInfo.getIndexOptions();
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets");
}
skipListWriter.setIndexOptions(indexOptions);
- storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.hasPayloads();
lastPayloadFP = 0;
lastSkipFP = 0;
lastState = setEmptyState();
@@ -233,7 +228,7 @@ public final class SepPostingsWriter ext
// TODO: -- awkward we have to make these two
// separate calls to skipper
//System.out.println(" buffer skip lastDocID=" + lastDocID);
- skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+ skipListWriter.setSkipData(lastDocID, writePayloads, lastPayloadLength);
skipListWriter.bufferSkip(df);
}
@@ -254,7 +249,7 @@ public final class SepPostingsWriter ext
assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
lastPosition = position;
- if (storePayloads) {
+ if (writePayloads) {
final int payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
@@ -344,7 +339,7 @@ public final class SepPostingsWriter ext
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
lastState.posIndex.copyFrom(state.posIndex, false);
lastState.posIndex.write(out, absolute);
- if (storePayloads) {
+ if (writePayloads) {
if (absolute) {
out.writeVLong(state.payloadFP);
} else {