Posted to commits@lucene.apache.org by rm...@apache.org on 2014/08/25 03:52:09 UTC
svn commit: r1620250 [1/4] - in /lucene/dev/branches/branch_4x/lucene: ./
codecs/src/java/org/apache/lucene/codecs/blockterms/
codecs/src/java/org/apache/lucene/codecs/blocktreeords/
codecs/src/java/org/apache/lucene/codecs/bloom/ codecs/src/java/org/a...
Author: rmuir
Date: Mon Aug 25 01:52:08 2014
New Revision: 1620250
URL: http://svn.apache.org/r1620250
Log:
LUCENE-5123, LUCENE-5268: invert codec postings api (backport from trunk)
Added:
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java (with props)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java (with props)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MappedMultiFields.java (with props)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MappingMultiDocsAndPositionsEnum.java (with props)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MappingMultiDocsEnum.java (with props)
Removed:
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsAndPositionsEnum.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsEnum.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
Modified:
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/MergeState.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/SegmentWriteState.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
lucene/dev/branches/branch_4x/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java
lucene/dev/branches/branch_4x/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
lucene/dev/branches/branch_4x/lucene/sandbox/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/cranky/CrankyPostingsFormat.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene3x/PreFlexRWFieldsWriter.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene3x/TermInfosWriter.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/ramonly/RAMOnlyPostingsFormat.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Aug 25 01:52:08 2014
@@ -13,6 +13,16 @@ New Features
* LUCENE-5889: Add commit method to AnalyzingInfixSuggester, and allow just using .add
to build up the suggester. (Varun Thacker via Mike McCandless)
+* LUCENE-5123: Add a "pull" option to the postings writing API, so
+ that a PostingsFormat now receives a Fields instance and it is
+ responsible for iterating through all fields, terms, documents and
+ positions. (Robert Muir, Mike McCandless)
+
+* LUCENE-5268: Full cutover of all postings formats to the "pull"
+ FieldsConsumer API, removing PushFieldsConsumer. Added new
+ PushPostingsWriterBase for single-pass push of docs/positions to the
+ postings format. (Mike McCandless)
+
Bug Fixes
* LUCENE-5650: Enforce read-only access to any path outside the temporary
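The two CHANGES entries above describe the shape of the new API: after this change a FieldsConsumer is handed a whole Fields instance and drives all iteration itself. A minimal sketch of that contract follows, using the same 4.x iteration calls that appear throughout the diff below; the class and the writeOneTerm hook are hypothetical, not part of this commit.

import java.io.IOException;
import java.util.Comparator;

import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

abstract class SketchFieldsConsumer extends FieldsConsumer {
  @Override
  public void write(Fields fields) throws IOException {
    for (String field : fields) {           // Fields iterates field names in sorted order
      Terms terms = fields.terms(field);
      if (terms == null) {
        continue;                           // nothing indexed for this field
      }
      TermsEnum termsEnum = terms.iterator(null);
      BytesRef term;
      while ((term = termsEnum.next()) != null) {
        // The codec pulls docs/freqs/positions for this term itself:
        writeOneTerm(field, term, termsEnum);
      }
    }
  }

  /** Hypothetical hook: a real codec encodes the term's postings here. */
  protected abstract void writeOneTerm(String field, BytesRef term,
                                       TermsEnum termsEnum) throws IOException;

  @Override
  public Comparator<BytesRef> getComparator() {
    return BytesRef.getUTF8SortedAsUnicodeComparator();
  }
}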
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java Mon Aug 25 01:52:08 2014
@@ -17,24 +17,26 @@ package org.apache.lucene.codecs.blockte
* limitations under the License.
*/
+import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Comparator;
import java.util.Arrays;
+import java.util.Comparator;
import java.util.List;
+import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
-import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.BlockTermState;
-import org.apache.lucene.codecs.TermsConsumer;
-import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
@@ -56,7 +58,7 @@ import org.apache.lucene.util.RamUsageEs
* @lucene.experimental
*/
-public class BlockTermsWriter extends FieldsConsumer {
+public class BlockTermsWriter extends FieldsConsumer implements Closeable {
final static String CODEC_NAME = "BLOCK_TERMS_DICT";
@@ -75,6 +77,7 @@ public class BlockTermsWriter extends Fi
final FieldInfos fieldInfos;
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
+ private final int maxDoc;
private static class FieldMetaData {
public final FieldInfo fieldInfo;
@@ -106,6 +109,7 @@ public class BlockTermsWriter extends Fi
throws IOException {
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
this.termsIndexWriter = termsIndexWriter;
+ maxDoc = state.segmentInfo.getDocCount();
out = state.directory.createOutput(termsFileName, state.context);
boolean success = false;
try {
@@ -131,7 +135,33 @@ public class BlockTermsWriter extends Fi
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
+ public void write(Fields fields) throws IOException {
+
+ for(String field : fields) {
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+
+ termsWriter.write(term, termsEnum);
+ }
+
+ termsWriter.finish();
+ }
+ }
+
+ private TermsWriter addField(FieldInfo field) throws IOException {
//System.out.println("\nBTW.addField seg=" + segment + " field=" + field.name);
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
@@ -177,12 +207,13 @@ public class BlockTermsWriter extends Fi
public BlockTermState state;
}
- class TermsWriter extends TermsConsumer {
+ class TermsWriter {
private final FieldInfo fieldInfo;
private final PostingsWriterBase postingsWriter;
private final long termsStartPointer;
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
+ private final FixedBitSet docsSeen;
long sumTotalTermFreq;
long sumDocFreq;
int docCount;
@@ -199,6 +230,7 @@ public class BlockTermsWriter extends Fi
{
this.fieldInfo = fieldInfo;
this.fieldIndexWriter = fieldIndexWriter;
+ this.docsSeen = new FixedBitSet(maxDoc);
pendingTerms = new TermEntry[32];
for(int i=0;i<pendingTerms.length;i++) {
pendingTerms[i] = new TermEntry();
@@ -208,26 +240,22 @@ public class BlockTermsWriter extends Fi
this.longsSize = postingsWriter.setField(fieldInfo);
}
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- //System.out.println("BTW: startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment);
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
private final BytesRefBuilder lastPrevTerm = new BytesRefBuilder();
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ void write(BytesRef text, TermsEnum termsEnum) throws IOException {
+
+ BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
+ if (state == null) {
+ // No docs for this term:
+ return;
+ }
+ sumDocFreq += state.docFreq;
+ sumTotalTermFreq += state.totalTermFreq;
- assert stats.docFreq > 0;
+ assert state.docFreq > 0;
//System.out.println("BTW: finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq);
+ TermStats stats = new TermStats(state.docFreq, state.totalTermFreq);
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
if (isIndexTerm) {
@@ -249,18 +277,14 @@ public class BlockTermsWriter extends Fi
}
final TermEntry te = pendingTerms[pendingCount];
te.term.copyBytes(text);
- te.state = postingsWriter.newTermState();
- te.state.docFreq = stats.docFreq;
- te.state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(te.state);
+ te.state = state;
pendingCount++;
numTerms++;
}
// Finishes all terms in this field
- @Override
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
+ void finish() throws IOException {
if (pendingCount > 0) {
flushBlock();
}
@@ -275,9 +299,9 @@ public class BlockTermsWriter extends Fi
fields.add(new FieldMetaData(fieldInfo,
numTerms,
termsStartPointer,
- sumTotalTermFreq,
+ fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 ? sumTotalTermFreq : -1,
sumDocFreq,
- docCount,
+ docsSeen.cardinality(),
longsSize));
}
}
@@ -365,4 +389,9 @@ public class BlockTermsWriter extends Fi
pendingCount = 0;
}
}
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
+ }
}
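The pattern in BlockTermsWriter above recurs in every terms dictionary touched by this commit: per-term stats that used to be pushed in via finishTerm/finish are now accumulated by the terms writer from the BlockTermState that PostingsWriterBase.writeTerm returns, and docCount falls out of the docsSeen bitset. A condensed, hypothetical helper showing just that bookkeeping (not part of the commit):

import java.io.IOException;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

final class TermStatsAccumulator {
  final FixedBitSet docsSeen;   // writeTerm marks every doc it visits
  long sumDocFreq;
  long sumTotalTermFreq;

  TermStatsAccumulator(int maxDoc) {
    docsSeen = new FixedBitSet(maxDoc);
  }

  /** Returns the term's state, or null if every doc for the term was deleted. */
  BlockTermState add(PostingsWriterBase postingsWriter, BytesRef term,
                     TermsEnum termsEnum) throws IOException {
    BlockTermState state = postingsWriter.writeTerm(term, termsEnum, docsSeen);
    if (state != null) {
      sumDocFreq += state.docFreq;
      sumTotalTermFreq += state.totalTermFreq;
    }
    return state;
  }

  /** docCount is no longer pushed in; it is derived from the seen-docs bitset. */
  int docCount() {
    return docsSeen.cardinality();
  }
}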
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java Mon Aug 25 01:52:08 2014
@@ -25,10 +25,7 @@ import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
-import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter; // javadocs
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
import org.apache.lucene.index.FieldInfo.IndexOptions;
@@ -139,7 +136,6 @@ public final class OrdsBlockTreeTermsWri
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
- FieldInfo currentField;
private static class FieldMetaData {
public final FieldInfo fieldInfo;
@@ -230,14 +226,34 @@ public final class OrdsBlockTreeTermsWri
this.indexOut = indexOut;
}
- public TermsConsumer addField(FieldInfo field) throws IOException {
- //DEBUG = field.name.equals("id");
- //if (DEBUG) System.out.println("\nBTTW.addField seg=" + segment + " field=" + field.name);
- assert currentField == null || currentField.name.compareTo(field.name) < 0;
- currentField = field;
- return new TermsWriter(field);
- }
+ @Override
+ public void write(Fields fields) throws IOException {
+
+ String lastField = null;
+ for(String field : fields) {
+ assert lastField == null || lastField.compareTo(field) < 0;
+ lastField = field;
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ termsWriter.write(term, termsEnum);
+ }
+ termsWriter.finish();
+ }
+ }
+
static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
assert fp < (1L << 62);
return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0);
@@ -424,13 +440,13 @@ public final class OrdsBlockTreeTermsWri
private final RAMOutputStream scratchBytes = new RAMOutputStream();
private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
- class TermsWriter extends TermsConsumer {
+ class TermsWriter {
private final FieldInfo fieldInfo;
private final int longsSize;
private long numTerms;
+ final FixedBitSet docsSeen;
long sumTotalTermFreq;
long sumDocFreq;
- int docCount;
long indexStartFP;
// Records index into pending where the current prefix at that
@@ -770,50 +786,37 @@ public final class OrdsBlockTreeTermsWri
TermsWriter(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
+ docsSeen = new FixedBitSet(maxDoc);
this.longsSize = postingsWriter.setField(fieldInfo);
this.longs = new long[longsSize];
}
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- //if (DEBUG) System.out.println("\nBTTW.startTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment);
- postingsWriter.startTerm();
+ /** Writes one term's worth of postings. */
+ public void write(BytesRef text, TermsEnum termsEnum) throws IOException {
/*
- if (fieldInfo.name.equals("id")) {
- postingsWriter.termID = Integer.parseInt(text.utf8ToString());
- } else {
- postingsWriter.termID = -1;
+ if (DEBUG) {
+ int[] tmp = new int[lastTerm.length];
+ System.arraycopy(prefixStarts, 0, tmp, 0, tmp.length);
+ System.out.println("BTTW: write term=" + brToString(text) + " prefixStarts=" + Arrays.toString(tmp) + " pending.size()=" + pending.size());
}
*/
- return postingsWriter;
- }
-
- /** Writes one term's worth of postings. */
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
- assert stats.docFreq != 0;
- assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || stats.totalTermFreq >= stats.docFreq: "postingsWriter=" + postingsWriter;
-
- pushTerm(text);
- BlockTermState state = postingsWriter.newTermState();
- state.docFreq = stats.docFreq;
- state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(state);
-
- PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
- pending.add(term);
- numTerms++;
-
- if (firstPendingTerm == null) {
- firstPendingTerm = term;
+ BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
+ if (state != null) {
+ assert state.docFreq != 0;
+ assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
+ sumDocFreq += state.docFreq;
+ sumTotalTermFreq += state.totalTermFreq;
+ pushTerm(text);
+
+ PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
+ pending.add(term);
+ numTerms++;
+ if (firstPendingTerm == null) {
+ firstPendingTerm = term;
+ }
+ lastPendingTerm = term;
}
- lastPendingTerm = term;
}
/** Pushes the new term to the top of the stack, and writes new blocks. */
@@ -854,7 +857,7 @@ public final class OrdsBlockTreeTermsWri
}
// Finishes all terms in this field
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
+ public void finish() throws IOException {
if (numTerms > 0) {
// if (DEBUG) System.out.println("BTTW.finish pending.size()=" + pending.size());
@@ -869,10 +872,6 @@ public final class OrdsBlockTreeTermsWri
assert root.prefix.length == 0;
assert root.index.getEmptyOutput() != null;
- this.sumTotalTermFreq = sumTotalTermFreq;
- this.sumDocFreq = sumDocFreq;
- this.docCount = docCount;
-
// Write FST to index
indexStartFP = indexOut.getFilePointer();
root.index.save(indexOut);
@@ -898,13 +897,11 @@ public final class OrdsBlockTreeTermsWri
indexStartFP,
sumTotalTermFreq,
sumDocFreq,
- docCount,
+ docsSeen.cardinality(),
longsSize,
minTerm, maxTerm));
} else {
- assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
- assert sumDocFreq == 0;
- assert docCount == 0;
+ assert docsSeen.cardinality() == 0;
}
}
@@ -960,4 +957,9 @@ public final class OrdsBlockTreeTermsWri
out.writeVInt(bytes.length);
out.writeBytes(bytes.bytes, bytes.offset, bytes.length);
}
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
+ }
}
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java Mon Aug 25 01:52:08 2014
@@ -23,20 +23,18 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import java.util.Map.Entry;
+import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
-import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.codecs.bloom.FuzzySet.ContainsResult;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@@ -114,7 +112,7 @@ public final class BloomFilteringPosting
this.delegatePostingsFormat = delegatePostingsFormat;
this.bloomFilterFactory = bloomFilterFactory;
}
-
+
/**
* Creates Bloom filters for a selection of fields created in the index. This
* is recorded as a set of Bitsets held as a segment summary in an additional
@@ -144,9 +142,8 @@ public final class BloomFilteringPosting
throw new UnsupportedOperationException("Error - " + getClass().getName()
+ " has been constructed without a choice of PostingsFormat");
}
- return new BloomFilteredFieldsConsumer(
- delegatePostingsFormat.fieldsConsumer(state), state,
- delegatePostingsFormat);
+ FieldsConsumer fieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
+ return new BloomFilteredFieldsConsumer(fieldsConsumer, state);
}
@Override
@@ -336,7 +333,7 @@ public final class BloomFilteringPosting
this.delegateTermsEnum = null;
}
- private final TermsEnum delegate() throws IOException {
+ private TermsEnum delegate() throws IOException {
if (delegateTermsEnum == null) {
/* pull the iterator only if we really need it -
         * this can be a relatively heavy operation depending on the
@@ -372,33 +369,33 @@ public final class BloomFilteringPosting
}
@Override
- public final SeekStatus seekCeil(BytesRef text)
+ public SeekStatus seekCeil(BytesRef text)
throws IOException {
return delegate().seekCeil(text);
}
@Override
- public final void seekExact(long ord) throws IOException {
+ public void seekExact(long ord) throws IOException {
delegate().seekExact(ord);
}
@Override
- public final BytesRef term() throws IOException {
+ public BytesRef term() throws IOException {
return delegate().term();
}
@Override
- public final long ord() throws IOException {
+ public long ord() throws IOException {
return delegate().ord();
}
@Override
- public final int docFreq() throws IOException {
+ public int docFreq() throws IOException {
return delegate().docFreq();
}
@Override
- public final long totalTermFreq() throws IOException {
+ public long totalTermFreq() throws IOException {
return delegate().totalTermFreq();
}
@@ -414,8 +411,6 @@ public final class BloomFilteringPosting
throws IOException {
return delegate().docs(liveDocs, reuse, flags);
}
-
-
}
@Override
@@ -439,30 +434,62 @@ public final class BloomFilteringPosting
private Map<FieldInfo,FuzzySet> bloomFilters = new HashMap<>();
private SegmentWriteState state;
-
public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
- SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
+ SegmentWriteState state) {
this.delegateFieldsConsumer = fieldsConsumer;
this.state = state;
}
-
+
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field);
- if (bloomFilter != null) {
- assert bloomFilters.containsKey(field) == false;
- bloomFilters.put(field, bloomFilter);
- return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field),bloomFilter);
- } else {
- // No, use the unfiltered fieldsConsumer - we are not interested in
- // recording any term Bitsets.
- return delegateFieldsConsumer.addField(field);
+ public void write(Fields fields) throws IOException {
+
+ // Delegate must write first: it may have opened files
+ // on creating the class
+ // (e.g. Lucene41PostingsConsumer), and write() will
+ // close them; alternatively, if we delayed pulling
+ // the fields consumer until here, we could do it
+ // afterwards:
+ delegateFieldsConsumer.write(fields);
+
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
+ TermsEnum termsEnum = terms.iterator(null);
+
+ FuzzySet bloomFilter = null;
+
+ DocsEnum docsEnum = null;
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ if (bloomFilter == null) {
+ bloomFilter = bloomFilterFactory.getSetForField(state, fieldInfo);
+ if (bloomFilter == null) {
+ // Field not bloom'd
+ break;
+ }
+          assert bloomFilters.containsKey(fieldInfo) == false;
+ bloomFilters.put(fieldInfo, bloomFilter);
+ }
+ // Make sure there's at least one doc for this term:
+ docsEnum = termsEnum.docs(null, docsEnum, 0);
+ if (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
+ bloomFilter.addValue(term);
+ }
+ }
}
}
-
+
@Override
public void close() throws IOException {
+
delegateFieldsConsumer.close();
+
// Now we are done accumulating values for these fields
List<Entry<FieldInfo,FuzzySet>> nonSaturatedBlooms = new ArrayList<>();
@@ -497,6 +524,11 @@ public final class BloomFilteringPosting
bloomFilters.clear();
}
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return delegateFieldsConsumer.getComparator();
+ }
+
private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
FuzzySet bloomFilter, FieldInfo fieldInfo) throws IOException {
@@ -507,44 +539,10 @@ public final class BloomFilteringPosting
}
rightSizedSet.serialize(bloomOutput);
}
-
}
-
- class WrappedTermsConsumer extends TermsConsumer {
- private TermsConsumer delegateTermsConsumer;
- private FuzzySet bloomFilter;
-
- public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) {
- this.delegateTermsConsumer = termsConsumer;
- this.bloomFilter = bloomFilter;
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- return delegateTermsConsumer.startTerm(text);
- }
-
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
-
- // Record this term in our BloomFilter
- if (stats.docFreq > 0) {
- bloomFilter.addValue(text);
- }
- delegateTermsConsumer.finishTerm(text, stats);
- }
-
- @Override
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
- throws IOException {
- delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
- }
-
- @Override
- public Comparator<BytesRef> getComparator() throws IOException {
- return delegateTermsConsumer.getComparator();
- }
-
+
+ @Override
+ public String toString() {
+ return "BloomFilteringPostingsFormat(" + delegatePostingsFormat + ")";
}
-
}
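The rewritten consumer above replaces the old WrappedTermsConsumer: since the delegate has already written the postings, the bloom pass only needs doc IDs, and a term enters the filter only if at least one of its documents survives deletes. A self-contained sketch of that loop (the helper class name is hypothetical):

import java.io.IOException;

import org.apache.lucene.codecs.bloom.FuzzySet;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

final class BloomWriteSketch {
  /** Adds each term that still has at least one live document to the filter. */
  static void fill(TermsEnum termsEnum, FuzzySet bloomFilter) throws IOException {
    DocsEnum docsEnum = null;
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      docsEnum = termsEnum.docs(null, docsEnum, 0);   // flags=0: doc IDs only
      if (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
        bloomFilter.addValue(term);                   // term survived deletes
      }
    }
  }
}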
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java Mon Aug 25 01:52:08 2014
@@ -18,20 +18,27 @@ package org.apache.lucene.codecs.memory;
*/
import java.io.IOException;
-import java.util.List;
import java.util.ArrayList;
import java.util.Comparator;
+import java.util.List;
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
-import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
@@ -39,13 +46,6 @@ import org.apache.lucene.util.fst.Builde
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.codecs.BlockTermState;
-import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.PostingsConsumer;
-import org.apache.lucene.codecs.FieldsConsumer;
-import org.apache.lucene.codecs.TermsConsumer;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.CodecUtil;
/**
* FST-based term dict, using ord as FST output.
@@ -158,6 +158,7 @@ public class FSTOrdTermsWriter extends F
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
+ final int maxDoc;
final List<FieldMetaData> fields = new ArrayList<>();
IndexOutput blockOut = null;
IndexOutput indexOut = null;
@@ -168,6 +169,7 @@ public class FSTOrdTermsWriter extends F
this.postingsWriter = postingsWriter;
this.fieldInfos = state.fieldInfos;
+ this.maxDoc = state.segmentInfo.getDocCount();
boolean success = false;
try {
@@ -185,8 +187,35 @@ public class FSTOrdTermsWriter extends F
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- return new TermsWriter(field);
+ public void write(Fields fields) throws IOException {
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ TermsEnum termsEnum = terms.iterator(null);
+ TermsWriter termsWriter = new TermsWriter(fieldInfo);
+
+ long sumTotalTermFreq = 0;
+ long sumDocFreq = 0;
+ FixedBitSet docsSeen = new FixedBitSet(maxDoc);
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
+ if (termState != null) {
+ termsWriter.finishTerm(term, termState);
+ sumTotalTermFreq += termState.totalTermFreq;
+ sumDocFreq += termState.docFreq;
+ }
+ }
+
+ termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
+ }
}
@Override
@@ -260,7 +289,7 @@ public class FSTOrdTermsWriter extends F
public RAMOutputStream metaBytesOut;
}
- final class TermsWriter extends TermsConsumer {
+ final class TermsWriter {
private final Builder<Long> builder;
private final PositiveIntOutputs outputs;
private final FieldInfo fieldInfo;
@@ -297,39 +326,23 @@ public class FSTOrdTermsWriter extends F
this.lastMetaBytesFP = 0;
}
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) {
bufferSkip();
}
// write term meta data into fst
final long longs[] = new long[longsSize];
- final long delta = stats.totalTermFreq - stats.docFreq;
- if (stats.totalTermFreq > 0) {
+ final long delta = state.totalTermFreq - state.docFreq;
+ if (state.totalTermFreq > 0) {
if (delta == 0) {
- statsOut.writeVInt(stats.docFreq<<1|1);
+ statsOut.writeVInt(state.docFreq<<1|1);
} else {
- statsOut.writeVInt(stats.docFreq<<1|0);
- statsOut.writeVLong(stats.totalTermFreq-stats.docFreq);
+ statsOut.writeVInt(state.docFreq<<1|0);
+ statsOut.writeVLong(state.totalTermFreq-state.docFreq);
}
} else {
- statsOut.writeVInt(stats.docFreq);
+ statsOut.writeVInt(state.docFreq);
}
- BlockTermState state = postingsWriter.newTermState();
- state.docFreq = stats.docFreq;
- state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(state);
postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true);
for (int i = 0; i < longsSize; i++) {
metaLongsOut.writeVLong(longs[i] - lastLongs[i]);
@@ -343,7 +356,6 @@ public class FSTOrdTermsWriter extends F
lastMetaBytesFP = metaBytesOut.getFilePointer();
}
- @Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
if (numTerms > 0) {
final FieldMetaData metadata = new FieldMetaData();
@@ -375,4 +387,9 @@ public class FSTOrdTermsWriter extends F
System.arraycopy(lastLongs, 0, lastBlockLongs, 0, longsSize);
}
}
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
+ }
}
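The finishTerm change above keeps FSTOrdTermsWriter's compact stats encoding: docFreq is shifted left one bit and the low bit records whether totalTermFreq equals docFreq, so the common zero delta costs nothing extra. A hedged round-trip sketch of that scheme (it ignores the omit-freqs branch, where a bare docFreq is written; the class name is hypothetical):

import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

final class StatsCodecSketch {
  static void encode(DataOutput statsOut, int docFreq, long totalTermFreq) throws IOException {
    long delta = totalTermFreq - docFreq;
    if (delta == 0) {
      statsOut.writeVInt(docFreq << 1 | 1);  // low bit set: totalTermFreq == docFreq
    } else {
      statsOut.writeVInt(docFreq << 1 | 0);  // low bit clear: a vLong delta follows
      statsOut.writeVLong(delta);
    }
  }

  /** Returns {docFreq, totalTermFreq}. */
  static long[] decode(DataInput statsIn) throws IOException {
    int code = statsIn.readVInt();
    int docFreq = code >>> 1;
    long totalTermFreq = (code & 1) != 0 ? docFreq : docFreq + statsIn.readVLong();
    return new long[] { docFreq, totalTermFreq };
  }
}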
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java Mon Aug 25 01:52:08 2014
@@ -18,33 +18,33 @@ package org.apache.lucene.codecs.memory;
*/
import java.io.IOException;
-import java.util.List;
import java.util.ArrayList;
import java.util.Comparator;
+import java.util.List;
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
-import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.codecs.BlockTermState;
-import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.PostingsConsumer;
-import org.apache.lucene.codecs.FieldsConsumer;
-import org.apache.lucene.codecs.TermsConsumer;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.CodecUtil;
/**
* FST-based term dict, using metadata as FST output.
@@ -132,6 +132,7 @@ public class FSTTermsWriter extends Fiel
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
IndexOutput out;
+ final int maxDoc;
final List<FieldMetaData> fields = new ArrayList<>();
public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
@@ -140,6 +141,7 @@ public class FSTTermsWriter extends Fiel
this.postingsWriter = postingsWriter;
this.fieldInfos = state.fieldInfos;
this.out = state.directory.createOutput(termsFileName, state.context);
+ this.maxDoc = state.segmentInfo.getDocCount();
boolean success = false;
try {
@@ -152,16 +154,47 @@ public class FSTTermsWriter extends Fiel
}
}
}
+
private void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT);
}
+
private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
out.writeLong(dirStart);
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- return new TermsWriter(field);
+ public void write(Fields fields) throws IOException {
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ TermsEnum termsEnum = terms.iterator(null);
+      TermsWriter termsWriter = new TermsWriter(fieldInfo);
+
+ long sumTotalTermFreq = 0;
+ long sumDocFreq = 0;
+ FixedBitSet docsSeen = new FixedBitSet(maxDoc);
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+
+ BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
+ if (termState != null) {
+ termsWriter.finishTerm(term, termState);
+ sumTotalTermFreq += termState.totalTermFreq;
+ sumDocFreq += termState.docFreq;
+ }
+ }
+
+ termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
+ }
}
@Override
@@ -218,7 +251,7 @@ public class FSTTermsWriter extends Fiel
}
}
- final class TermsWriter extends TermsConsumer {
+ final class TermsWriter {
private final Builder<FSTTermOutputs.TermData> builder;
private final FSTTermOutputs outputs;
private final FieldInfo fieldInfo;
@@ -226,7 +259,6 @@ public class FSTTermsWriter extends Fiel
private long numTerms;
private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
- private final RAMOutputStream statsWriter = new RAMOutputStream();
private final RAMOutputStream metaWriter = new RAMOutputStream();
TermsWriter(FieldInfo fieldInfo) {
@@ -237,27 +269,13 @@ public class FSTTermsWriter extends Fiel
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
}
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
// write term meta data into fst
- final BlockTermState state = postingsWriter.newTermState();
final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData();
meta.longs = new long[longsSize];
meta.bytes = null;
- meta.docFreq = state.docFreq = stats.docFreq;
- meta.totalTermFreq = state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(state);
+ meta.docFreq = state.docFreq;
+ meta.totalTermFreq = state.totalTermFreq;
postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true);
final int bytesSize = (int)metaWriter.getFilePointer();
if (bytesSize > 0) {
@@ -269,7 +287,6 @@ public class FSTTermsWriter extends Fiel
numTerms++;
}
- @Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
// save FST dict
if (numTerms > 0) {
@@ -278,4 +295,9 @@ public class FSTTermsWriter extends Fiel
}
}
}
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
+ }
}
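Both FST-based writers feed sorted terms into an org.apache.lucene.util.fst.Builder: FSTOrdTermsWriter with the term's ord as output, FSTTermsWriter with term metadata. A minimal, hypothetical sketch of the ord variant, using the same Builder/Util calls imported in the diff:

import java.io.IOException;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

final class FstOrdSketch {
  /** Maps each term to its ordinal; terms must arrive in sorted order. */
  static FST<Long> buildTermToOrd(Iterable<BytesRef> sortedTerms) throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;
    for (BytesRef term : sortedTerms) {
      builder.add(Util.toIntsRef(term, scratch), ord++);  // sorted input required
    }
    return builder.finish();
  }
}

Looking a term up afterwards with Util.get(fst, term) returns its ord, or null if the term is absent.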
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java Mon Aug 25 01:52:08 2014
@@ -28,15 +28,14 @@ import java.util.TreeMap;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
-import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@@ -52,6 +51,7 @@ import org.apache.lucene.util.Accountabl
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
@@ -109,7 +109,7 @@ public final class MemoryPostingsFormat
return "PostingsFormat(name=" + getName() + " doPackFST= " + doPackFST + ")";
}
- private final static class TermsWriter extends TermsConsumer {
+ private final static class TermsWriter {
private final IndexOutput out;
private final FieldInfo field;
private final Builder<BytesRef> builder;
@@ -126,7 +126,7 @@ public final class MemoryPostingsFormat
builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doPackFST, acceptableOverheadRatio, true, 15);
}
- private class PostingsWriter extends PostingsConsumer {
+ private class PostingsWriter {
private int lastDocID;
private int lastPos;
private int lastPayloadLen;
@@ -138,7 +138,6 @@ public final class MemoryPostingsFormat
int lastOffsetLength;
int lastOffset;
- @Override
public void startDoc(int docID, int termDocFreq) throws IOException {
//System.out.println(" startDoc docID=" + docID + " freq=" + termDocFreq);
final int delta = docID - lastDocID;
@@ -160,7 +159,6 @@ public final class MemoryPostingsFormat
lastOffset = 0;
}
- @Override
public void addPosition(int pos, BytesRef payload, int startOffset, int endOffset) throws IOException {
assert payload == null || field.hasPayloads();
@@ -205,10 +203,6 @@ public final class MemoryPostingsFormat
}
}
- @Override
- public void finishDoc() {
- }
-
public PostingsWriter reset() {
assert buffer.getFilePointer() == 0;
lastDocID = 0;
@@ -220,13 +214,7 @@ public final class MemoryPostingsFormat
}
}
- private final PostingsWriter postingsWriter = new PostingsWriter();
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) {
- //System.out.println(" startTerm term=" + text.utf8ToString());
- return postingsWriter.reset();
- }
+ final PostingsWriter postingsWriter = new PostingsWriter();
private final RAMOutputStream buffer2 = new RAMOutputStream();
private final BytesRef spare = new BytesRef();
@@ -234,9 +222,11 @@ public final class MemoryPostingsFormat
private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ private void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ if (stats.docFreq == 0) {
+ return;
+ }
assert postingsWriter.docCount == stats.docFreq;
assert buffer2.getFilePointer() == 0;
@@ -268,7 +258,6 @@ public final class MemoryPostingsFormat
termCount++;
}
- @Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
if (termCount > 0) {
out.writeVInt(termCount);
@@ -283,11 +272,6 @@ public final class MemoryPostingsFormat
//System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
}
}
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
}
private static String EXTENSION = "ram";
@@ -295,39 +279,151 @@ public final class MemoryPostingsFormat
private static final int VERSION_START = 0;
private static final int VERSION_CURRENT = VERSION_START;
- @Override
- public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ private class MemoryFieldsConsumer extends FieldsConsumer {
+ private final SegmentWriteState state;
+ private final IndexOutput out;
- final String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION);
- final IndexOutput out = state.directory.createOutput(fileName, state.context);
- boolean success = false;
- try {
- CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
- success = true;
- } finally {
- if (!success) {
- IOUtils.closeWhileHandlingException(out);
- }
+ private MemoryFieldsConsumer(SegmentWriteState state) throws IOException {
+ final String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION);
+ out = state.directory.createOutput(fileName, state.context);
+ boolean success = false;
+ try {
+ CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(out);
+ }
+ }
+ this.state = state;
}
-
- return new FieldsConsumer() {
- @Override
- public TermsConsumer addField(FieldInfo field) {
- //System.out.println("\naddField field=" + field.name);
- return new TermsWriter(out, field, doPackFST, acceptableOverheadRatio);
- }
- @Override
- public void close() throws IOException {
- // EOF marker:
- try {
- out.writeVInt(0);
- CodecUtil.writeFooter(out);
- } finally {
- out.close();
+ @Override
+ public void write(Fields fields) throws IOException {
+ for(String field : fields) {
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
+ TermsWriter termsWriter = new TermsWriter(out, fieldInfo,
+ doPackFST, acceptableOverheadRatio);
+
+ FixedBitSet docsSeen = new FixedBitSet(state.segmentInfo.getDocCount());
+ long sumTotalTermFreq = 0;
+ long sumDocFreq = 0;
+ DocsEnum docsEnum = null;
+ DocsAndPositionsEnum posEnum = null;
+ int enumFlags;
+
+ IndexOptions indexOptions = fieldInfo.getIndexOptions();
+ boolean writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ boolean writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ boolean writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ boolean writePayloads = fieldInfo.hasPayloads();
+
+ if (writeFreqs == false) {
+ enumFlags = 0;
+ } else if (writePositions == false) {
+ enumFlags = DocsEnum.FLAG_FREQS;
+ } else if (writeOffsets == false) {
+ if (writePayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+ } else {
+ enumFlags = 0;
+ }
+ } else {
+ if (writePayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS;
+ } else {
+ enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
+ }
}
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ termsWriter.postingsWriter.reset();
+
+ if (writePositions) {
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ docsEnum = posEnum;
+ } else {
+ docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+ posEnum = null;
+ }
+
+ int docFreq = 0;
+ long totalTermFreq = 0;
+ while (true) {
+ int docID = docsEnum.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ docsSeen.set(docID);
+ docFreq++;
+
+ int freq;
+ if (writeFreqs) {
+ freq = docsEnum.freq();
+ totalTermFreq += freq;
+ } else {
+ freq = -1;
+ }
+
+ termsWriter.postingsWriter.startDoc(docID, freq);
+ if (writePositions) {
+ for (int i=0;i<freq;i++) {
+ int pos = posEnum.nextPosition();
+ BytesRef payload = writePayloads ? posEnum.getPayload() : null;
+ int startOffset;
+ int endOffset;
+ if (writeOffsets) {
+ startOffset = posEnum.startOffset();
+ endOffset = posEnum.endOffset();
+ } else {
+ startOffset = -1;
+ endOffset = -1;
+ }
+ termsWriter.postingsWriter.addPosition(pos, payload, startOffset, endOffset);
+ }
+ }
+ }
+ termsWriter.finishTerm(term, new TermStats(docFreq, totalTermFreq));
+ sumDocFreq += docFreq;
+ sumTotalTermFreq += totalTermFreq;
+ }
+
+ termsWriter.finish(sumTotalTermFreq, sumDocFreq, docsSeen.cardinality());
}
- };
+ }
+
+ @Override
+ public void close() throws IOException {
+ // EOF marker:
+ try {
+ out.writeVInt(0);
+ CodecUtil.writeFooter(out);
+ } finally {
+ out.close();
+ }
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
+ }
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ return new MemoryFieldsConsumer(state);
}
private final static class FSTDocsEnum extends DocsEnum {
@@ -931,7 +1027,7 @@ public final class MemoryPostingsFormat
}
return sizeInBytes;
}
-
+
@Override
public void checkIntegrity() throws IOException {}
};
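MemoryFieldsConsumer's flag-selection block above shows how a pull-side consumer asks for no more postings data than the field actually indexed. The same decision table, factored into a hypothetical helper:

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;

final class EnumFlagsSketch {
  /** Requests only the postings data this field actually indexed. */
  static int enumFlags(FieldInfo fieldInfo) {
    IndexOptions opts = fieldInfo.getIndexOptions();
    boolean freqs = opts.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    boolean positions = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    boolean offsets = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    boolean payloads = fieldInfo.hasPayloads();
    if (!freqs) {
      return 0;                                  // doc IDs only
    } else if (!positions) {
      return DocsEnum.FLAG_FREQS;                // docs + freqs
    } else if (!offsets) {
      return payloads ? DocsAndPositionsEnum.FLAG_PAYLOADS : 0;
    } else {
      return payloads
          ? DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS
          : DocsAndPositionsEnum.FLAG_OFFSETS;
    }
  }
}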
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java Mon Aug 25 01:52:08 2014
@@ -18,22 +18,24 @@ package org.apache.lucene.codecs.pulsing
*/
import java.io.IOException;
-import java.util.List;
import java.util.ArrayList;
+import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
// TODO: we now inline based on total TF of the term,
@@ -67,21 +69,31 @@ public final class PulsingPostingsWriter
final static int VERSION_CURRENT = VERSION_META_ARRAY;
private SegmentWriteState segmentState;
- private IndexOutput termsOut;
private List<FieldMetaData> fields;
+ // Reused by writeTerm:
+ private DocsEnum docsEnum;
+ private DocsAndPositionsEnum posEnum;
+ private int enumFlags;
+
+ private final RAMOutputStream buffer = new RAMOutputStream();
+
private IndexOptions indexOptions;
- private boolean storePayloads;
// information for wrapped PF, in current field
private int longsSize;
private long[] longs;
+ private boolean fieldHasFreqs;
+ private boolean fieldHasPositions;
+ private boolean fieldHasOffsets;
+ private boolean fieldHasPayloads;
boolean absolute;
private static class PulsingTermState extends BlockTermState {
private byte[] bytes;
private BlockTermState wrappedState;
+
@Override
public String toString() {
if (bytes != null) {
@@ -92,20 +104,6 @@ public final class PulsingPostingsWriter
}
}
- // one entry per position
- private final Position[] pending;
- private int pendingCount = 0; // -1 once we've hit too many positions
- private Position currentDoc; // first Position entry of current doc
-
- private static final class Position {
- BytesRefBuilder payload;
- int termFreq; // only incremented on first position for a given doc
- int pos;
- int docID;
- int startOffset;
- int endOffset;
- }
-
private static final class FieldMetaData {
int fieldNumber;
int longsSize;
@@ -121,17 +119,14 @@ public final class PulsingPostingsWriter
// non-inlined terms:
final PostingsWriterBase wrappedPostingsWriter;
+ final int maxPositions;
+
/** If the total number of positions (summed across all docs
* for this term) is <= maxPositions, then the postings are
* inlined into terms dict */
public PulsingPostingsWriter(SegmentWriteState state, int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
-
- pending = new Position[maxPositions];
- for(int i=0;i<maxPositions;i++) {
- pending[i] = new Position();
- }
fields = new ArrayList<>();
-
+ this.maxPositions = maxPositions;
// We simply wrap another postings writer, but only call
// on it when tot positions is >= the cutoff:
this.wrappedPostingsWriter = wrappedPostingsWriter;
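In the hunk below, writeTerm replaces the old buffered push path with a two-pass pull: the first pass counts occurrences until the count exceeds maxPositions, then the second pass either inlines the postings into the terms dict or hands the term to the wrapped writer. A hedged sketch of that first pass (the committed code uses docsAndPositions() for positioned fields; counting via freq() on a freqs-enabled DocsEnum is equivalent here):

import java.io.IOException;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.TermsEnum;

final class PulseCountSketch {
  /** First pass: count occurrences, stopping once the cutoff is exceeded. */
  static long countUpTo(TermsEnum termsEnum, int maxPositions,
                        boolean hasPositions) throws IOException {
    // Positionless fields count documents; positioned fields count positions,
    // where freq() is the number of positions the term has in each doc.
    DocsEnum docsEnum = termsEnum.docs(null, null, hasPositions ? DocsEnum.FLAG_FREQS : 0);
    long count = 0;
    while (count <= maxPositions && docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
      count += hasPositions ? docsEnum.freq() : 1;
    }
    return count;  // 0: all docs deleted; > maxPositions: delegate, don't inline
  }
}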
@@ -140,143 +135,63 @@ public final class PulsingPostingsWriter
@Override
public void init(IndexOutput termsOut) throws IOException {
- this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
- termsOut.writeVInt(pending.length); // encode maxPositions in header
+ termsOut.writeVInt(maxPositions); // encode maxPositions in header
wrappedPostingsWriter.init(termsOut);
}
@Override
- public BlockTermState newTermState() throws IOException {
- PulsingTermState state = new PulsingTermState();
- state.wrappedState = wrappedPostingsWriter.newTermState();
- return state;
- }
+ public BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
- @Override
- public void startTerm() {
- //if (DEBUG) System.out.println("PW startTerm");
- assert pendingCount == 0;
- }
+ // First pass: figure out whether we should pulse this term
+ long posCount = 0;
- // TODO: -- should we NOT reuse across fields? would
- // be cleaner
-
- // Currently, this instance is re-used across fields, so
- // our parent calls setField whenever the field changes
- @Override
- public int setField(FieldInfo fieldInfo) {
- this.indexOptions = fieldInfo.getIndexOptions();
- //if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
- storePayloads = fieldInfo.hasPayloads();
- absolute = false;
- longsSize = wrappedPostingsWriter.setField(fieldInfo);
- longs = new long[longsSize];
- fields.add(new FieldMetaData(fieldInfo.number, longsSize));
- return 0;
- //DEBUG = BlockTreeTermsWriter.DEBUG;
- }
-
- private boolean DEBUG;
-
- @Override
- public void startDoc(int docID, int termDocFreq) throws IOException {
- assert docID >= 0: "got docID=" + docID;
-
- /*
- if (termID != -1) {
- if (docID == 0) {
- baseDocID = termID;
- } else if (baseDocID + docID != termID) {
- throw new RuntimeException("WRITE: baseDocID=" + baseDocID + " docID=" + docID + " termID=" + termID);
- }
- }
- */
-
- //if (DEBUG) System.out.println("PW doc=" + docID);
-
- if (pendingCount == pending.length) {
- push();
- //if (DEBUG) System.out.println("PW: wrapped.finishDoc");
- wrappedPostingsWriter.finishDoc();
- }
-
- if (pendingCount != -1) {
- assert pendingCount < pending.length;
- currentDoc = pending[pendingCount];
- currentDoc.docID = docID;
- if (indexOptions == IndexOptions.DOCS_ONLY) {
- pendingCount++;
- } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
- pendingCount++;
- currentDoc.termFreq = termDocFreq;
- } else {
- currentDoc.termFreq = termDocFreq;
+ if (fieldHasPositions == false) {
+ // No positions:
+ docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+ assert docsEnum != null;
+ while (posCount <= maxPositions) {
+ if (docsEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ posCount++;
}
} else {
- // We've already seen too many docs for this term --
- // just forward to our fallback writer
- wrappedPostingsWriter.startDoc(docID, termDocFreq);
- }
- }
-
- @Override
- public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
-
- //if (DEBUG) System.out.println("PW pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes"));
- if (pendingCount == pending.length) {
- push();
- }
-
- if (pendingCount == -1) {
- // We've already seen too many docs for this term --
- // just forward to our fallback writer
- wrappedPostingsWriter.addPosition(position, payload, startOffset, endOffset);
- } else {
- // buffer up
- final Position pos = pending[pendingCount++];
- pos.pos = position;
- pos.startOffset = startOffset;
- pos.endOffset = endOffset;
- pos.docID = currentDoc.docID;
- if (payload != null && payload.length > 0) {
- if (pos.payload == null) {
- pos.payload = new BytesRefBuilder();
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ assert posEnum != null;
+ while (posCount <= maxPositions) {
+ if (posEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
+ break;
}
- pos.payload.copyBytes(payload);
- } else if (pos.payload != null) {
- pos.payload.clear();
+ posCount += posEnum.freq();
}
}
- }
- @Override
- public void finishDoc() throws IOException {
- // if (DEBUG) System.out.println("PW finishDoc");
- if (pendingCount == -1) {
- wrappedPostingsWriter.finishDoc();
+ if (posCount == 0) {
+ // All docs were deleted
+ return null;
}
- }
- private final RAMOutputStream buffer = new RAMOutputStream();
-
- // private int baseDocID;
-
- /** Called when we are done adding docs to this term */
- @Override
- public void finishTerm(BlockTermState _state) throws IOException {
- PulsingTermState state = (PulsingTermState) _state;
-
- // if (DEBUG) System.out.println("PW finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount + " pendingTerms.size()=" + pendingTerms.size());
-
- assert pendingCount > 0 || pendingCount == -1;
-
- if (pendingCount == -1) {
- state.wrappedState.docFreq = state.docFreq;
- state.wrappedState.totalTermFreq = state.totalTermFreq;
- state.bytes = null;
- wrappedPostingsWriter.finishTerm(state.wrappedState);
+ // Second pass: write postings
+ if (posCount > maxPositions) {
+ // Too many positions; do not pulse. Just let the
+ // wrapped postingsWriter encode the postings:
+
+ PulsingTermState state = new PulsingTermState();
+ state.wrappedState = wrappedPostingsWriter.writeTerm(term, termsEnum, docsSeen);
+ state.docFreq = state.wrappedState.docFreq;
+ state.totalTermFreq = state.wrappedState.totalTermFreq;
+ return state;
} else {
+ // Pulsed:
+ if (fieldHasPositions == false) {
+ docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+ } else {
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ docsEnum = posEnum;
+ }
+ assert docsEnum != null;
+
// There were few enough total occurrences for this
// term, so we fully inline our postings data into
// terms dict, now:
@@ -287,95 +202,135 @@ public final class PulsingPostingsWriter
// given codec wants to store other interesting
// stuff, it could use this pulsing codec to do so
- if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
- int lastDocID = 0;
- int pendingIDX = 0;
- int lastPayloadLength = -1;
- int lastOffsetLength = -1;
- while(pendingIDX < pendingCount) {
- final Position doc = pending[pendingIDX];
+ int lastDocID = 0;
+ int lastPayloadLength = -1;
+ int lastOffsetLength = -1;
+
+ int docFreq = 0;
+ long totalTermFreq = 0;
+ while (true) {
+ int docID = docsEnum.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ docsSeen.set(docID);
+
+ int delta = docID - lastDocID;
+ lastDocID = docID;
- final int delta = doc.docID - lastDocID;
- lastDocID = doc.docID;
+ docFreq++;
- // if (DEBUG) System.out.println(" write doc=" + doc.docID + " freq=" + doc.termFreq);
+ if (fieldHasFreqs) {
+ int freq = docsEnum.freq();
+ totalTermFreq += freq;
- if (doc.termFreq == 1) {
- buffer.writeVInt((delta<<1)|1);
+ if (freq == 1) {
+ buffer.writeVInt((delta << 1) | 1);
} else {
- buffer.writeVInt(delta<<1);
- buffer.writeVInt(doc.termFreq);
+ buffer.writeVInt(delta << 1);
+ buffer.writeVInt(freq);
}
- int lastPos = 0;
- int lastOffset = 0;
- for(int posIDX=0;posIDX<doc.termFreq;posIDX++) {
- final Position pos = pending[pendingIDX++];
- assert pos.docID == doc.docID;
- final int posDelta = pos.pos - lastPos;
- lastPos = pos.pos;
- // if (DEBUG) System.out.println(" write pos=" + pos.pos);
- final int payloadLength = pos.payload == null ? 0 : pos.payload.length();
- if (storePayloads) {
- if (payloadLength != lastPayloadLength) {
- buffer.writeVInt((posDelta << 1)|1);
- buffer.writeVInt(payloadLength);
- lastPayloadLength = payloadLength;
+ if (fieldHasPositions) {
+ int lastPos = 0;
+ int lastOffset = 0;
+ for(int posIDX=0;posIDX<freq;posIDX++) {
+ int pos = posEnum.nextPosition();
+ int posDelta = pos - lastPos;
+ lastPos = pos;
+ int payloadLength;
+ BytesRef payload;
+ if (fieldHasPayloads) {
+ payload = posEnum.getPayload();
+ payloadLength = payload == null ? 0 : payload.length;
+ if (payloadLength != lastPayloadLength) {
+ buffer.writeVInt((posDelta << 1)|1);
+ buffer.writeVInt(payloadLength);
+ lastPayloadLength = payloadLength;
+ } else {
+ buffer.writeVInt(posDelta << 1);
+ }
} else {
- buffer.writeVInt(posDelta << 1);
+ payloadLength = 0;
+ payload = null;
+ buffer.writeVInt(posDelta);
}
- } else {
- buffer.writeVInt(posDelta);
- }
-
- if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
- //System.out.println("write=" + pos.startOffset + "," + pos.endOffset);
- int offsetDelta = pos.startOffset - lastOffset;
- int offsetLength = pos.endOffset - pos.startOffset;
- if (offsetLength != lastOffsetLength) {
- buffer.writeVInt(offsetDelta << 1 | 1);
- buffer.writeVInt(offsetLength);
- } else {
- buffer.writeVInt(offsetDelta << 1);
+
+ if (fieldHasOffsets) {
+ int startOffset = posEnum.startOffset();
+ int endOffset = posEnum.endOffset();
+ int offsetDelta = startOffset - lastOffset;
+ int offsetLength = endOffset - startOffset;
+ if (offsetLength != lastOffsetLength) {
+ buffer.writeVInt(offsetDelta << 1 | 1);
+ buffer.writeVInt(offsetLength);
+ } else {
+ buffer.writeVInt(offsetDelta << 1);
+ }
+ lastOffset = startOffset;
+ lastOffsetLength = offsetLength;
}
- lastOffset = pos.startOffset;
- lastOffsetLength = offsetLength;
- }
- if (payloadLength > 0) {
- assert storePayloads;
- buffer.writeBytes(pos.payload.bytes(), 0, pos.payload.length());
+ if (payloadLength > 0) {
+ assert fieldHasPayloads;
+ assert payload != null;
+ buffer.writeBytes(payload.bytes, payload.offset, payload.length);
+ }
}
}
- }
- } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
- int lastDocID = 0;
- for(int posIDX=0;posIDX<pendingCount;posIDX++) {
- final Position doc = pending[posIDX];
- final int delta = doc.docID - lastDocID;
- assert doc.termFreq != 0;
- if (doc.termFreq == 1) {
- buffer.writeVInt((delta<<1)|1);
- } else {
- buffer.writeVInt(delta<<1);
- buffer.writeVInt(doc.termFreq);
- }
- lastDocID = doc.docID;
- }
- } else if (indexOptions == IndexOptions.DOCS_ONLY) {
- int lastDocID = 0;
- for(int posIDX=0;posIDX<pendingCount;posIDX++) {
- final Position doc = pending[posIDX];
- buffer.writeVInt(doc.docID - lastDocID);
- lastDocID = doc.docID;
+ } else {
+ buffer.writeVInt(delta);
}
}
-
+
+ PulsingTermState state = new PulsingTermState();
state.bytes = new byte[(int) buffer.getFilePointer()];
+ state.docFreq = docFreq;
+ state.totalTermFreq = fieldHasFreqs ? totalTermFreq : -1;
buffer.writeTo(state.bytes, 0);
buffer.reset();
+ return state;
+ }
+ }
+
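
A worked example of the inlined doc encoding above, with illustrative values
(the freq==1 case is folded into the low bit of the doc delta):

    // docs 5, 9, 12 with freqs 1, 3, 1 are buffered as:
    //   doc 5:  delta=5, freq==1 -> writeVInt(5<<1 | 1) = 11
    //   doc 9:  delta=4, freq==3 -> writeVInt(4<<1) = 8, then writeVInt(3)
    //   doc 12: delta=3, freq==1 -> writeVInt(3<<1 | 1) = 7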
+ // TODO: -- should we NOT reuse across fields? would
+ // be cleaner
+
+ // Currently, this instance is re-used across fields, so
+ // our parent calls setField whenever the field changes
+ @Override
+ public int setField(FieldInfo fieldInfo) {
+ this.indexOptions = fieldInfo.getIndexOptions();
+ //if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
+ fieldHasPayloads = fieldInfo.hasPayloads();
+ absolute = false;
+ longsSize = wrappedPostingsWriter.setField(fieldInfo);
+ longs = new long[longsSize];
+ fields.add(new FieldMetaData(fieldInfo.number, longsSize));
+
+ fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+
+ if (fieldHasFreqs == false) {
+ enumFlags = 0;
+ } else if (fieldHasPositions == false) {
+ enumFlags = DocsEnum.FLAG_FREQS;
+ } else if (fieldHasOffsets == false) {
+ if (fieldHasPayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+ } else {
+ enumFlags = 0;
+ }
+ } else {
+ if (fieldHasPayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS;
+ } else {
+ enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
+ }
}
- pendingCount = 0;
+ return 0;
+ //DEBUG = BlockTreeTermsWriter.DEBUG;
}
@Override
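
The flag selection above, restated as a standalone helper for clarity. This
is a sketch only; the helper does not exist in the commit, though the
DocsEnum/DocsAndPositionsEnum flag constants do:

    static int enumFlags(IndexOptions opts, boolean hasPayloads) {
      if (opts.compareTo(IndexOptions.DOCS_AND_FREQS) < 0) {
        return 0;                                  // docs only
      } else if (opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
        return DocsEnum.FLAG_FREQS;                // docs + freqs
      } else if (opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
        return hasPayloads ? DocsAndPositionsEnum.FLAG_PAYLOADS : 0;
      } else {
        return DocsAndPositionsEnum.FLAG_OFFSETS
            | (hasPayloads ? DocsAndPositionsEnum.FLAG_PAYLOADS : 0);
      }
    }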
@@ -420,40 +375,4 @@ public final class PulsingPostingsWriter
IOUtils.closeWhileHandlingException(out);
}
}
-
- // Pushes pending positions to the wrapped codec
- private void push() throws IOException {
- // if (DEBUG) System.out.println("PW now push @ " + pendingCount + " wrapped=" + wrappedPostingsWriter);
- assert pendingCount == pending.length;
-
- wrappedPostingsWriter.startTerm();
-
- // Flush all buffered docs
- if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
- Position doc = null;
- for(Position pos : pending) {
- if (doc == null) {
- doc = pos;
- // if (DEBUG) System.out.println("PW: wrapped.startDoc docID=" + doc.docID + " tf=" + doc.termFreq);
- wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
- } else if (doc.docID != pos.docID) {
- assert pos.docID > doc.docID;
- // if (DEBUG) System.out.println("PW: wrapped.finishDoc");
- wrappedPostingsWriter.finishDoc();
- doc = pos;
- // if (DEBUG) System.out.println("PW: wrapped.startDoc docID=" + doc.docID + " tf=" + doc.termFreq);
- wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
- }
- // if (DEBUG) System.out.println("PW: wrapped.addPos pos=" + pos.pos);
- final BytesRef payload = pos.payload == null ? null : pos.payload.get();
- wrappedPostingsWriter.addPosition(pos.pos, payload, pos.startOffset, pos.endOffset);
- }
- //wrappedPostingsWriter.finishDoc();
- } else {
- for(Position doc : pending) {
- wrappedPostingsWriter.startDoc(doc.docID, indexOptions == IndexOptions.DOCS_ONLY ? 0 : doc.termFreq);
- }
- }
- pendingCount = -1;
- }
}
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java Mon Aug 25 01:52:08 2014
@@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
@@ -38,7 +39,7 @@ import org.apache.lucene.util.IOUtils;
* to .pyl, skip data to .skp
*
* @lucene.experimental */
-public final class SepPostingsWriter extends PostingsWriterBase {
+public final class SepPostingsWriter extends PushPostingsWriterBase {
final static String CODEC = "SepPostingsWriter";
final static String DOC_EXTENSION = "doc";
@@ -190,6 +191,7 @@ public final class SepPostingsWriter ext
// our parent calls setField whenever the field changes
@Override
public int setField(FieldInfo fieldInfo) {
+ super.setField(fieldInfo);
this.fieldInfo = fieldInfo;
this.indexOptions = fieldInfo.getIndexOptions();
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
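
For orientation: SepPostingsWriter keeps its push-style startDoc/addPosition
callbacks and now relies on the new PushPostingsWriterBase (added by this
commit) to drive them. A rough, hedged sketch of that adapter's shape;
details may differ from the actual class:

    // PushPostingsWriterBase.writeTerm(), conceptually:
    //   startTerm();
    //   for each doc returned by termsEnum.docs(...)/docsAndPositions(...):
    //     startDoc(docID, freq);
    //     for each position: addPosition(pos, payload, startOffset, endOffset);
    //     finishDoc();
    //   finishTerm(state);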
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java?rev=1620250&r1=1620249&r2=1620250&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java Mon Aug 25 01:52:08 2014
@@ -21,12 +21,14 @@ import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.codecs.FieldsConsumer;
-import org.apache.lucene.codecs.PostingsConsumer;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@@ -35,6 +37,7 @@ class SimpleTextFieldsWriter extends Fie
private IndexOutput out;
private final BytesRefBuilder scratch = new BytesRefBuilder();
+ private final SegmentWriteState writeState;
final static BytesRef END = new BytesRef("END");
final static BytesRef FIELD = new BytesRef("field ");
@@ -46,9 +49,146 @@ class SimpleTextFieldsWriter extends Fie
final static BytesRef END_OFFSET = new BytesRef(" endOffset ");
final static BytesRef PAYLOAD = new BytesRef(" payload ");
- public SimpleTextFieldsWriter(SegmentWriteState state) throws IOException {
- final String fileName = SimpleTextPostingsFormat.getPostingsFileName(state.segmentInfo.name, state.segmentSuffix);
- out = state.directory.createOutput(fileName, state.context);
+ public SimpleTextFieldsWriter(SegmentWriteState writeState) throws IOException {
+ final String fileName = SimpleTextPostingsFormat.getPostingsFileName(writeState.segmentInfo.name, writeState.segmentSuffix);
+ out = writeState.directory.createOutput(fileName, writeState.context);
+ this.writeState = writeState;
+ }
+
+ @Override
+ public void write(Fields fields) throws IOException {
+ write(writeState.fieldInfos, fields);
+ }
+
+ public void write(FieldInfos fieldInfos, Fields fields) throws IOException {
+
+ // for each field
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ // Annoyingly, this can happen!
+ continue;
+ }
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+
+ boolean wroteField = false;
+
+ boolean hasPositions = terms.hasPositions();
+ boolean hasFreqs = terms.hasFreqs();
+ boolean hasPayloads = fieldInfo.hasPayloads();
+ boolean hasOffsets = terms.hasOffsets();
+
+ int flags = 0;
+ if (hasPositions) {
+
+ if (hasPayloads) {
+ flags = flags | DocsAndPositionsEnum.FLAG_PAYLOADS;
+ }
+ if (hasOffsets) {
+ flags = flags | DocsAndPositionsEnum.FLAG_OFFSETS;
+ }
+ } else {
+ if (hasFreqs) {
+ flags = flags | DocsEnum.FLAG_FREQS;
+ }
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+ DocsAndPositionsEnum posEnum = null;
+ DocsEnum docsEnum = null;
+
+ // for each term in field
+ while(true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+
+ if (hasPositions) {
+ posEnum = termsEnum.docsAndPositions(null, posEnum, flags);
+ docsEnum = posEnum;
+ } else {
+ docsEnum = termsEnum.docs(null, docsEnum, flags);
+ }
+ assert docsEnum != null: "termsEnum=" + termsEnum + " hasPos=" + hasPositions + " flags=" + flags;
+
+ boolean wroteTerm = false;
+
+ // for each doc in field+term
+ while(true) {
+ int doc = docsEnum.nextDoc();
+ if (doc == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+
+ if (!wroteTerm) {
+
+ if (!wroteField) {
+ // we lazily do this, in case the field had
+ // no terms
+ write(FIELD);
+ write(field);
+ newline();
+ wroteField = true;
+ }
+
+ // we lazily do this, in case the term had
+ // zero docs
+ write(TERM);
+ write(term);
+ newline();
+ wroteTerm = true;
+ }
+
+ write(DOC);
+ write(Integer.toString(doc));
+ newline();
+ if (hasFreqs) {
+ int freq = docsEnum.freq();
+ write(FREQ);
+ write(Integer.toString(freq));
+ newline();
+
+ if (hasPositions) {
+ // for assert:
+ int lastStartOffset = 0;
+
+ // for each pos in field+term+doc
+ for(int i=0;i<freq;i++) {
+ int position = posEnum.nextPosition();
+
+ write(POS);
+ write(Integer.toString(position));
+ newline();
+
+ if (hasOffsets) {
+ int startOffset = posEnum.startOffset();
+ int endOffset = posEnum.endOffset();
+ assert endOffset >= startOffset;
+ assert startOffset >= lastStartOffset: "startOffset=" + startOffset + " lastStartOffset=" + lastStartOffset;
+ lastStartOffset = startOffset;
+ write(START_OFFSET);
+ write(Integer.toString(startOffset));
+ newline();
+ write(END_OFFSET);
+ write(Integer.toString(endOffset));
+ newline();
+ }
+
+ BytesRef payload = posEnum.getPayload();
+
+ if (payload != null && payload.length > 0) {
+ assert payload.length != 0;
+ write(PAYLOAD);
+ write(payload);
+ newline();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
}
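
The loop above writes a human-readable postings file. An illustrative
fragment (the exact prefixes are the BytesRef constants declared earlier,
e.g. FIELD, TERM, DOC; whitespace here is approximate):

    field body
      term hello
        doc 0
          freq 2
          pos 3
          pos 17
        doc 5
          freq 1
          pos 1
    END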
private void write(String s) throws IOException {
@@ -64,119 +204,6 @@ class SimpleTextFieldsWriter extends Fie
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- write(FIELD);
- write(field.name);
- newline();
- return new SimpleTextTermsWriter(field);
- }
-
- private class SimpleTextTermsWriter extends TermsConsumer {
- private final SimpleTextPostingsWriter postingsWriter;
-
- public SimpleTextTermsWriter(FieldInfo field) {
- postingsWriter = new SimpleTextPostingsWriter(field);
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef term) throws IOException {
- return postingsWriter.reset(term);
- }
-
- @Override
- public void finishTerm(BytesRef term, TermStats stats) throws IOException {
- }
-
- @Override
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
- }
-
- private class SimpleTextPostingsWriter extends PostingsConsumer {
- private BytesRef term;
- private boolean wroteTerm;
- private final IndexOptions indexOptions;
- private final boolean writePositions;
- private final boolean writeOffsets;
-
- // for assert:
- private int lastStartOffset = 0;
-
- public SimpleTextPostingsWriter(FieldInfo field) {
- this.indexOptions = field.getIndexOptions();
- writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
- writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
- //System.out.println("writeOffsets=" + writeOffsets);
- //System.out.println("writePos=" + writePositions);
- }
-
- @Override
- public void startDoc(int docID, int termDocFreq) throws IOException {
- if (!wroteTerm) {
- // we lazily do this, in case the term had zero docs
- write(TERM);
- write(term);
- newline();
- wroteTerm = true;
- }
-
- write(DOC);
- write(Integer.toString(docID));
- newline();
- if (indexOptions != IndexOptions.DOCS_ONLY) {
- write(FREQ);
- write(Integer.toString(termDocFreq));
- newline();
- }
-
- lastStartOffset = 0;
- }
-
- public PostingsConsumer reset(BytesRef term) {
- this.term = term;
- wroteTerm = false;
- return this;
- }
-
- @Override
- public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
- if (writePositions) {
- write(POS);
- write(Integer.toString(position));
- newline();
- }
-
- if (writeOffsets) {
- assert endOffset >= startOffset;
- assert startOffset >= lastStartOffset: "startOffset=" + startOffset + " lastStartOffset=" + lastStartOffset;
- lastStartOffset = startOffset;
- write(START_OFFSET);
- write(Integer.toString(startOffset));
- newline();
- write(END_OFFSET);
- write(Integer.toString(endOffset));
- newline();
- }
-
- if (payload != null && payload.length > 0) {
- assert payload.length != 0;
- write(PAYLOAD);
- write(payload);
- newline();
- }
- }
-
- @Override
- public void finishDoc() {
- }
- }
-
- @Override
public void close() throws IOException {
if (out != null) {
try {
@@ -189,4 +216,9 @@ class SimpleTextFieldsWriter extends Fie
}
}
}
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
+ }
}
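
Net effect of the API inversion, as a minimal hypothetical sketch (the writer
is package-private, so this is conceptual rather than compilable user code):

    // The codec now pulls postings from a Fields instance in one call,
    // instead of receiving per-field/term/doc/position callbacks:
    SimpleTextFieldsWriter writer = new SimpleTextFieldsWriter(writeState);
    try {
      writer.write(fields);  // 'fields' is the flushed or merged Fields view
    } finally {
      writer.close();
    }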