You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 20:58:44 UTC
svn commit: r1534320 [9/39] - in /lucene/dev/branches/lucene4956: ./
dev-tools/ dev-tools/idea/.idea/ dev-tools/idea/lucene/expressions/
dev-tools/idea/solr/contrib/velocity/ dev-tools/maven/
dev-tools/maven/lucene/ dev-tools/maven/lucene/expressions/ ...
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java Mon Oct 21 18:58:24 2013
@@ -30,21 +30,28 @@ import java.io.IOException;
call {@link AttributeSource#clearAttributes()} before
setting attributes.
*/
-public abstract class Tokenizer extends TokenStream {
+public abstract class Tokenizer extends TokenStream {
/** The text source for this Tokenizer. */
- protected Reader input;
+ protected Reader input = ILLEGAL_STATE_READER;
+
+ /** Pending reader: not actually assigned to input until reset() */
+ private Reader inputPending = ILLEGAL_STATE_READER;
/** Construct a token stream processing the given input. */
protected Tokenizer(Reader input) {
- assert input != null: "input must not be null";
- this.input = input;
+ if (input == null) {
+ throw new NullPointerException("input must not be null");
+ }
+ this.inputPending = input;
}
/** Construct a token stream processing the given input using the given AttributeFactory. */
protected Tokenizer(AttributeFactory factory, Reader input) {
super(factory);
- assert input != null: "input must not be null";
- this.input = input;
+ if (input == null) {
+ throw new NullPointerException("input must not be null");
+ }
+ this.inputPending = input;
}
/**
@@ -56,12 +63,10 @@ public abstract class Tokenizer extends
*/
@Override
public void close() throws IOException {
- if (input != null) {
- input.close();
- // LUCENE-2387: don't hold onto Reader after close, so
- // GC can reclaim
- input = null;
- }
+ input.close();
+ // LUCENE-2387: don't hold onto Reader after close, so
+ // GC can reclaim
+ inputPending = input = ILLEGAL_STATE_READER;
}
/** Return the corrected offset. If {@link #input} is a {@link CharFilter} subclass
@@ -71,7 +76,6 @@ public abstract class Tokenizer extends
* @see CharFilter#correctOffset
*/
protected final int correctOffset(int currentOff) {
- assert input != null: "this tokenizer is closed";
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff;
}
@@ -79,14 +83,37 @@ public abstract class Tokenizer extends
* analyzer (in its tokenStream method) will use
* this to re-use a previously created tokenizer. */
public final void setReader(Reader input) throws IOException {
- assert input != null: "input must not be null";
- this.input = input;
+ if (input == null) {
+ throw new NullPointerException("input must not be null");
+ } else if (this.input != ILLEGAL_STATE_READER) {
+ throw new IllegalStateException("TokenStream contract violation: close() call missing");
+ }
+ this.inputPending = input;
assert setReaderTestPoint();
}
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ input = inputPending;
+ inputPending = ILLEGAL_STATE_READER;
+ }
+
// only used by assert, for testing
boolean setReaderTestPoint() {
return true;
}
+
+ private static final Reader ILLEGAL_STATE_READER = new Reader() {
+ @Override
+ public int read(char[] cbuf, int off, int len) {
+ throw new IllegalStateException("TokenStream contract violation: reset()/close() call missing, " +
+ "reset() called multiple times, or subclass does not call super.reset(). " +
+ "Please see Javadocs of TokenStream class for more information about the correct consuming workflow.");
+ }
+
+ @Override
+ public void close() {}
+ };
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java Mon Oct 21 18:58:24 2013
@@ -85,13 +85,11 @@ public class CharTermAttributeImpl exten
// *** TermToBytesRefAttribute interface ***
private BytesRef bytes = new BytesRef(MIN_BUFFER_SIZE);
- // not until java 6 @Override
@Override
public int fillBytesRef() {
return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, bytes);
}
- // not until java 6 @Override
@Override
public BytesRef getBytesRef() {
return bytes;
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/TermToBytesRefAttribute.java Mon Oct 21 18:58:24 2013
@@ -30,7 +30,7 @@ import org.apache.lucene.util.BytesRef;
* final TermToBytesRefAttribute termAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
* final BytesRef bytes = termAtt.getBytesRef();
*
- * while (termAtt.incrementToken() {
+ * while (tokenStream.incrementToken()) {
*
* // you must call termAtt.fillBytesRef() before doing something with the bytes.
* // this encodes the term value (internally it might be a char[], etc) into the bytes.
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/BlockTermState.java Mon Oct 21 18:58:24 2013
@@ -34,6 +34,7 @@ public class BlockTermState extends OrdT
/** the term's ord in the current block */
public int termBlockOrd;
/** fp into the terms dict primary file (_X.tim) that holds this term */
+ // TODO: update BTR to nuke this
public long blockFilePointer;
/** Sole constructor. (For invocation by subclass
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java Mon Oct 21 18:58:24 2013
@@ -22,7 +22,6 @@ import java.io.IOException;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
-import java.util.Comparator;
import java.util.Iterator;
import java.util.Locale;
import java.util.TreeMap;
@@ -158,6 +157,7 @@ public class BlockTreeTermsReader extend
final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
final long sumDocFreq = in.readVLong();
final int docCount = in.readVInt();
+ final int longsSize = version >= BlockTreeTermsWriter.TERMS_VERSION_META_ARRAY ? in.readVInt() : 0;
if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
}
@@ -168,7 +168,7 @@ public class BlockTreeTermsReader extend
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")");
}
final long indexStartFP = indexIn.readVLong();
- FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, indexIn));
+ FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, longsSize, indexIn));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
}
@@ -448,11 +448,12 @@ public class BlockTreeTermsReader extend
final long indexStartFP;
final long rootBlockFP;
final BytesRef rootCode;
- private final FST<BytesRef> index;
+ final int longsSize;
+ private final FST<BytesRef> index;
//private boolean DEBUG;
- FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, IndexInput indexIn) throws IOException {
+ FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, int longsSize, IndexInput indexIn) throws IOException {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
//DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
@@ -462,6 +463,7 @@ public class BlockTreeTermsReader extend
this.docCount = docCount;
this.indexStartFP = indexStartFP;
this.rootCode = rootCode;
+ this.longsSize = longsSize;
// if (DEBUG) {
// System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor);
// }
@@ -495,8 +497,8 @@ public class BlockTreeTermsReader extend
}
@Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
+ public boolean hasFreqs() {
+ return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
}
@Override
@@ -547,6 +549,11 @@ public class BlockTreeTermsReader extend
return new IntersectEnum(compiled, startTerm);
}
+ /** Returns approximate RAM bytes used */
+ public long ramBytesUsed() {
+ return ((index!=null)? index.sizeInBytes() : 0);
+ }
+
// NOTE: cannot seek!
private final class IntersectEnum extends TermsEnum {
private final IndexInput in;
@@ -612,6 +619,12 @@ public class BlockTreeTermsReader extend
FST.Arc<BytesRef> arc;
final BlockTermState termState;
+
+ // metadata buffer, holding monotonic values
+ public long[] longs;
+ // metadata buffer, holding general values
+ public byte[] bytes;
+ ByteArrayDataInput bytesReader;
// Cumulative output so far
BytesRef outputPrefix;
@@ -621,8 +634,9 @@ public class BlockTreeTermsReader extend
public Frame(int ord) throws IOException {
this.ord = ord;
- termState = postingsReader.newTermState();
- termState.totalTermFreq = -1;
+ this.termState = postingsReader.newTermState();
+ this.termState.totalTermFreq = -1;
+ this.longs = new long[longsSize];
}
void loadNextFloorBlock() throws IOException {
@@ -720,8 +734,17 @@ public class BlockTreeTermsReader extend
termState.termBlockOrd = 0;
nextEnt = 0;
-
- postingsReader.readTermsBlock(in, fieldInfo, termState);
+
+ // metadata
+ numBytes = in.readVInt();
+ if (bytes == null) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ bytesReader = new ByteArrayDataInput();
+ } else if (bytes.length < numBytes) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ in.readBytes(bytes, 0, numBytes);
+ bytesReader.reset(bytes, 0, numBytes);
if (!isLastInFloor) {
// Sub-blocks of a single floor block are always
@@ -774,12 +797,9 @@ public class BlockTreeTermsReader extend
// lazily catch up on metadata decode:
final int limit = getTermBlockOrd();
+ boolean absolute = metaDataUpto == 0;
assert limit > 0;
- // We must set/incr state.termCount because
- // postings impl can look at this
- termState.termBlockOrd = metaDataUpto;
-
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
@@ -791,17 +811,24 @@ public class BlockTreeTermsReader extend
// TODO: if docFreq were bulk decoded we could
// just skipN here:
+
+ // stats
termState.docFreq = statsReader.readVInt();
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
+ // metadata
+ for (int i = 0; i < longsSize; i++) {
+ longs[i] = bytesReader.readVLong();
+ }
+ postingsReader.decodeTerm(longs, bytesReader, fieldInfo, termState, absolute);
- postingsReader.nextTerm(fieldInfo, termState);
metaDataUpto++;
- termState.termBlockOrd++;
+ absolute = false;
}
+ termState.termBlockOrd = metaDataUpto;
}
}
@@ -1211,11 +1238,6 @@ public class BlockTreeTermsReader extend
}
@Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
public boolean seekExact(BytesRef text) {
throw new UnsupportedOperationException();
}
@@ -1426,11 +1448,6 @@ public class BlockTreeTermsReader extend
return arcs[ord];
}
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
// Pushes a frame we seek'd to
Frame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) throws IOException {
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
@@ -1707,6 +1724,7 @@ public class BlockTreeTermsReader extend
if (arc.output != NO_OUTPUT) {
output = fstOutputs.add(output, arc.output);
}
+
// if (DEBUG) {
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
// }
@@ -2290,10 +2308,17 @@ public class BlockTreeTermsReader extend
final BlockTermState state;
+ // metadata buffer, holding monotonic values
+ public long[] longs;
+ // metadata buffer, holding general values
+ public byte[] bytes;
+ ByteArrayDataInput bytesReader;
+
public Frame(int ord) throws IOException {
this.ord = ord;
- state = postingsReader.newTermState();
- state.totalTermFreq = -1;
+ this.state = postingsReader.newTermState();
+ this.state.totalTermFreq = -1;
+ this.longs = new long[longsSize];
}
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
@@ -2391,7 +2416,17 @@ public class BlockTreeTermsReader extend
// TODO: we could skip this if !hasTerms; but
// that's rare so won't help much
- postingsReader.readTermsBlock(in, fieldInfo, state);
+ // metadata
+ numBytes = in.readVInt();
+ if (bytes == null) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ bytesReader = new ByteArrayDataInput();
+ } else if (bytes.length < numBytes) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ in.readBytes(bytes, 0, numBytes);
+ bytesReader.reset(bytes, 0, numBytes);
+
// Sub-blocks of a single floor block are always
// written one after another -- tail recurse:
@@ -2575,12 +2610,9 @@ public class BlockTreeTermsReader extend
// lazily catch up on metadata decode:
final int limit = getTermBlockOrd();
+ boolean absolute = metaDataUpto == 0;
assert limit > 0;
- // We must set/incr state.termCount because
- // postings impl can look at this
- state.termBlockOrd = metaDataUpto;
-
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
@@ -2592,17 +2624,24 @@ public class BlockTreeTermsReader extend
// TODO: if docFreq were bulk decoded we could
// just skipN here:
+
+ // stats
state.docFreq = statsReader.readVInt();
//if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
state.totalTermFreq = state.docFreq + statsReader.readVLong();
//if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
+ // metadata
+ for (int i = 0; i < longsSize; i++) {
+ longs[i] = bytesReader.readVLong();
+ }
+ postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute);
- postingsReader.nextTerm(fieldInfo, state);
metaDataUpto++;
- state.termBlockOrd++;
+ absolute = false;
}
+ state.termBlockOrd = metaDataUpto;
}
// Used only by assert
@@ -2929,4 +2968,13 @@ public class BlockTreeTermsReader extend
}
}
}
+
+ @Override
+ public long ramBytesUsed() {
+ long sizeInBytes = ((postingsReader!=null) ? postingsReader.ramBytesUsed() : 0);
+ for(FieldReader reader : fields.values()) {
+ sizeInBytes += reader.ramBytesUsed();
+ }
+ return sizeInBytes;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/Codec.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/Codec.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/Codec.java Mon Oct 21 18:58:24 2013
@@ -119,7 +119,7 @@ public abstract class Codec implements N
loader.reload(classloader);
}
- private static Codec defaultCodec = Codec.forName("Lucene42");
+ private static Codec defaultCodec = Codec.forName("Lucene46");
/** expert: returns the default codec used for newly created
* {@link IndexWriterConfig}s.
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java Mon Oct 21 18:58:24 2013
@@ -69,7 +69,8 @@ public abstract class DocValuesConsumer
/**
* Writes numeric docvalues for a field.
* @param field field information
- * @param values Iterable of numeric values (one for each document).
+ * @param values Iterable of numeric values (one for each document). {@code null} indicates
+ * a missing value.
* @throws IOException if an I/O error occurred.
*/
public abstract void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException;
@@ -77,7 +78,8 @@ public abstract class DocValuesConsumer
/**
* Writes binary docvalues for a field.
* @param field field information
- * @param values Iterable of binary values (one for each document).
+ * @param values Iterable of binary values (one for each document). {@code null} indicates
+ * a missing value.
* @throws IOException if an I/O error occurred.
*/
public abstract void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException;
@@ -86,7 +88,8 @@ public abstract class DocValuesConsumer
* Writes pre-sorted binary docvalues for a field.
* @param field field information
* @param values Iterable of binary values in sorted order (deduplicated).
- * @param docToOrd Iterable of ordinals (one for each document).
+ * @param docToOrd Iterable of ordinals (one for each document). {@code -1} indicates
+ * a missing value.
* @throws IOException if an I/O error occurred.
*/
public abstract void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException;
@@ -95,7 +98,8 @@ public abstract class DocValuesConsumer
* Writes pre-sorted set docvalues for a field
* @param field field information
* @param values Iterable of binary values in sorted order (deduplicated).
- * @param docToOrdCount Iterable of the number of values for each document.
+ * @param docToOrdCount Iterable of the number of values for each document. A zero ordinal
+ * count indicates a missing value.
* @param ords Iterable of ordinal occurrences (docToOrdCount*maxDoc total).
* @throws IOException if an I/O error occurred.
*/
@@ -107,7 +111,7 @@ public abstract class DocValuesConsumer
* The default implementation calls {@link #addNumericField}, passing
* an Iterable that merges and filters deleted documents on the fly.
*/
- public void mergeNumericField(FieldInfo fieldInfo, final MergeState mergeState, final List<NumericDocValues> toMerge) throws IOException {
+ public void mergeNumericField(final FieldInfo fieldInfo, final MergeState mergeState, final List<NumericDocValues> toMerge, final List<Bits> docsWithField) throws IOException {
addNumericField(fieldInfo,
new Iterable<Number>() {
@@ -116,10 +120,11 @@ public abstract class DocValuesConsumer
return new Iterator<Number>() {
int readerUpto = -1;
int docIDUpto;
- long nextValue;
+ Long nextValue;
AtomicReader currentReader;
NumericDocValues currentValues;
Bits currentLiveDocs;
+ Bits currentDocsWithField;
boolean nextIsSet;
@Override
@@ -139,7 +144,6 @@ public abstract class DocValuesConsumer
}
assert nextIsSet;
nextIsSet = false;
- // TODO: make a mutable number
return nextValue;
}
@@ -155,6 +159,7 @@ public abstract class DocValuesConsumer
currentReader = mergeState.readers.get(readerUpto);
currentValues = toMerge.get(readerUpto);
currentLiveDocs = currentReader.getLiveDocs();
+ currentDocsWithField = docsWithField.get(readerUpto);
}
docIDUpto = 0;
continue;
@@ -162,7 +167,11 @@ public abstract class DocValuesConsumer
if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
nextIsSet = true;
- nextValue = currentValues.get(docIDUpto);
+ if (currentDocsWithField.get(docIDUpto)) {
+ nextValue = currentValues.get(docIDUpto);
+ } else {
+ nextValue = null;
+ }
docIDUpto++;
return true;
}
@@ -181,7 +190,7 @@ public abstract class DocValuesConsumer
* The default implementation calls {@link #addBinaryField}, passing
* an Iterable that merges and filters deleted documents on the fly.
*/
- public void mergeBinaryField(FieldInfo fieldInfo, final MergeState mergeState, final List<BinaryDocValues> toMerge) throws IOException {
+ public void mergeBinaryField(FieldInfo fieldInfo, final MergeState mergeState, final List<BinaryDocValues> toMerge, final List<Bits> docsWithField) throws IOException {
addBinaryField(fieldInfo,
new Iterable<BytesRef>() {
@@ -191,9 +200,11 @@ public abstract class DocValuesConsumer
int readerUpto = -1;
int docIDUpto;
BytesRef nextValue = new BytesRef();
+ BytesRef nextPointer; // points to null if missing, or nextValue
AtomicReader currentReader;
BinaryDocValues currentValues;
Bits currentLiveDocs;
+ Bits currentDocsWithField;
boolean nextIsSet;
@Override
@@ -213,8 +224,7 @@ public abstract class DocValuesConsumer
}
assert nextIsSet;
nextIsSet = false;
- // TODO: make a mutable number
- return nextValue;
+ return nextPointer;
}
private boolean setNext() {
@@ -228,6 +238,7 @@ public abstract class DocValuesConsumer
if (readerUpto < toMerge.size()) {
currentReader = mergeState.readers.get(readerUpto);
currentValues = toMerge.get(readerUpto);
+ currentDocsWithField = docsWithField.get(readerUpto);
currentLiveDocs = currentReader.getLiveDocs();
}
docIDUpto = 0;
@@ -236,7 +247,12 @@ public abstract class DocValuesConsumer
if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
nextIsSet = true;
- currentValues.get(docIDUpto, nextValue);
+ if (currentDocsWithField.get(docIDUpto)) {
+ currentValues.get(docIDUpto, nextValue);
+ nextPointer = nextValue;
+ } else {
+ nextPointer = null;
+ }
docIDUpto++;
return true;
}
@@ -272,7 +288,10 @@ public abstract class DocValuesConsumer
OpenBitSet bitset = new OpenBitSet(dv.getValueCount());
for (int i = 0; i < reader.maxDoc(); i++) {
if (liveDocs.get(i)) {
- bitset.set(dv.getOrd(i));
+ int ord = dv.getOrd(i);
+ if (ord >= 0) {
+ bitset.set(ord);
+ }
}
}
liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
@@ -368,7 +387,7 @@ public abstract class DocValuesConsumer
if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
nextIsSet = true;
int segOrd = dvs[readerUpto].getOrd(docIDUpto);
- nextValue = (int) map.getGlobalOrd(readerUpto, segOrd);
+ nextValue = segOrd == -1 ? -1 : (int) map.getGlobalOrd(readerUpto, segOrd);
docIDUpto++;
return true;
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java Mon Oct 21 18:58:24 2013
@@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.util.Bits;
/** Abstract API that produces numeric, binary and
* sorted docvalues.
@@ -56,4 +57,70 @@ public abstract class DocValuesProducer
* The returned instance need not be thread-safe: it will only be
* used by a single thread. */
public abstract SortedSetDocValues getSortedSet(FieldInfo field) throws IOException;
+
+ /** Returns a {@link Bits} at the size of <code>reader.maxDoc()</code>,
+ * with turned on bits for each docid that does have a value for this field.
+ * The returned instance need not be thread-safe: it will only be
+ * used by a single thread. */
+ public abstract Bits getDocsWithField(FieldInfo field) throws IOException;
+
+ /** Returns approximate RAM bytes used */
+ public abstract long ramBytesUsed();
+
+ /**
+ * A simple implementation of {@link DocValuesProducer#getDocsWithField} that
+ * returns {@code true} if a document has an ordinal >= 0
+ * <p>
+ * Codecs can choose to use this (or implement it more efficiently another way), but
+ * in most cases a Bits is unnecessary anyway: users can check this as they go.
+ */
+ public static class SortedDocsWithField implements Bits {
+ final SortedDocValues in;
+ final int maxDoc;
+
+ /** Creates a {@link Bits} returning true if the document has a value */
+ public SortedDocsWithField(SortedDocValues in, int maxDoc) {
+ this.in = in;
+ this.maxDoc = maxDoc;
+ }
+
+ @Override
+ public boolean get(int index) {
+ return in.getOrd(index) >= 0;
+ }
+
+ @Override
+ public int length() {
+ return maxDoc;
+ }
+ }
+
+ /**
+ * A simple implementation of {@link DocValuesProducer#getDocsWithField} that
+ * returns {@code true} if a document has any ordinals.
+ * <p>
+ * Codecs can choose to use this (or implement it more efficiently another way), but
+ * in most cases a Bits is unnecessary anyway: users can check this as they go.
+ */
+ public static class SortedSetDocsWithField implements Bits {
+ final SortedSetDocValues in;
+ final int maxDoc;
+
+ /** Creates a {@link Bits} returning true if the document has a value */
+ public SortedSetDocsWithField(SortedSetDocValues in, int maxDoc) {
+ this.in = in;
+ this.maxDoc = maxDoc;
+ }
+
+ @Override
+ public boolean get(int index) {
+ in.setDocument(index);
+ return in.nextOrd() != SortedSetDocValues.NO_MORE_ORDS;
+ }
+
+ @Override
+ public int length() {
+ return maxDoc;
+ }
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldInfosReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldInfosReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldInfosReader.java Mon Oct 21 18:58:24 2013
@@ -35,5 +35,5 @@ public abstract class FieldInfosReader {
/** Read the {@link FieldInfos} previously written with {@link
* FieldInfosWriter}. */
- public abstract FieldInfos read(Directory directory, String segmentName, IOContext iocontext) throws IOException;
+ public abstract FieldInfos read(Directory directory, String segmentName, String segmentSuffix, IOContext iocontext) throws IOException;
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldInfosWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldInfosWriter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldInfosWriter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldInfosWriter.java Mon Oct 21 18:58:24 2013
@@ -35,5 +35,5 @@ public abstract class FieldInfosWriter {
/** Writes the provided {@link FieldInfos} to the
* directory. */
- public abstract void write(Directory directory, String segmentName, FieldInfos infos, IOContext context) throws IOException;
+ public abstract void write(Directory directory, String segmentName, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException;
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java Mon Oct 21 18:58:24 2013
@@ -17,60 +17,59 @@ package org.apache.lucene.codecs;
* limitations under the License.
*/
-import java.io.Closeable;
import java.io.IOException;
-import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfo; // javadocs
import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState; // javadocs
-import org.apache.lucene.index.Terms;
/**
* Abstract API that consumes terms, doc, freq, prox, offset and
* payloads postings. Concrete implementations of this
* actually do "something" with the postings (write it into
* the index in a specific format).
- * <p>
- * The lifecycle is:
- * <ol>
- * <li>FieldsConsumer is created by
- * {@link PostingsFormat#fieldsConsumer(SegmentWriteState)}.
- * <li>For each field, {@link #addField(FieldInfo)} is called,
- * returning a {@link TermsConsumer} for the field.
- * <li>After all fields are added, the consumer is {@link #close}d.
- * </ol>
*
* @lucene.experimental
*/
-public abstract class FieldsConsumer implements Closeable {
+
+public abstract class FieldsConsumer {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected FieldsConsumer() {
}
- /** Add a new field */
- public abstract TermsConsumer addField(FieldInfo field) throws IOException;
-
- /** Called when we are done adding everything. */
- @Override
- public abstract void close() throws IOException;
-
- /** Called during merging to merge all {@link Fields} from
- * sub-readers. This must recurse to merge all postings
- * (terms, docs, positions, etc.). A {@link
- * PostingsFormat} can override this default
- * implementation to do its own merging. */
- public void merge(MergeState mergeState, Fields fields) throws IOException {
- for (String field : fields) {
- FieldInfo info = mergeState.fieldInfos.fieldInfo(field);
- assert info != null : "FieldInfo for field is null: "+ field;
- Terms terms = fields.terms(field);
- if (terms != null) {
- final TermsConsumer termsConsumer = addField(info);
- termsConsumer.merge(mergeState, info.getIndexOptions(), terms.iterator(null));
- }
- }
- }
+ // TODO: can we somehow compute stats for you...?
+
+ // TODO: maybe we should factor out "limited" (only
+ // iterables, no counts/stats) base classes from
+ // Fields/Terms/Docs/AndPositions?
+
+ /** Write all fields, terms and postings. This is the "pull"
+ * API, allowing you to iterate more than once over the
+ * postings, somewhat analogous to using a DOM API to
+ * traverse an XML tree.
+ *
+ * <p><b>Notes</b>:
+ *
+ * <ul>
+ * <li> You must compute index statistics,
+ * including each Term's docFreq and totalTermFreq,
+ * as well as the summary sumTotalTermFreq,
+ * sumTotalDocFreq and docCount.
+ *
+ * <li> You must skip terms that have no docs and
+ * fields that have no terms, even though the provided
+ * Fields API will expose them; this typically
+ * requires lazily writing the field or term until
+ * you've actually seen the first term or
+ * document.
+ *
+ * <li> The provided Fields instance is limited: you
+ * cannot call any methods that return
+ * statistics/counts; you cannot pass a non-null
+ * live docs when pulling docs/positions enums.
+ * </ul>
+ */
+ public abstract void write(Fields fields) throws IOException;
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldsProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldsProducer.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldsProducer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FieldsProducer.java Mon Oct 21 18:58:24 2013
@@ -36,4 +36,7 @@ public abstract class FieldsProducer ext
@Override
public abstract void close() throws IOException;
+
+ /** Returns approximate RAM bytes used */
+ public abstract long ramBytesUsed();
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java Mon Oct 21 18:58:24 2013
@@ -21,13 +21,13 @@ package org.apache.lucene.codecs;
* A codec that forwards all its method calls to another codec.
* <p>
* Extend this class when you need to reuse the functionality of an existing
- * codec. For example, if you want to build a codec that redefines Lucene42's
+ * codec. For example, if you want to build a codec that redefines Lucene46's
* {@link LiveDocsFormat}:
* <pre class="prettyprint">
* public final class CustomCodec extends FilterCodec {
*
* public CustomCodec() {
- * super("CustomCodec", new Lucene42Codec());
+ * super("CustomCodec", new Lucene46Codec());
* }
*
* public LiveDocsFormat liveDocsFormat() {
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java Mon Oct 21 18:58:24 2013
@@ -22,8 +22,8 @@ import java.util.ServiceLoader;
import java.util.Set;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; // javadocs
-import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.NamedSPILoader;
/**
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java Mon Oct 21 18:58:24 2013
@@ -24,6 +24,7 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.Bits;
/** The core terms dictionaries (BlockTermsReader,
@@ -54,8 +55,10 @@ public abstract class PostingsReaderBase
/** Return a newly created empty TermState */
public abstract BlockTermState newTermState() throws IOException;
- /** Actually decode metadata for next term */
- public abstract void nextTerm(FieldInfo fieldInfo, BlockTermState state) throws IOException;
+ /** Actually decode metadata for next term
+ * @see PostingsWriterBase#encodeTerm
+ */
+ public abstract void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
@@ -66,11 +69,9 @@ public abstract class PostingsReaderBase
public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse,
int flags) throws IOException;
+ /** Returns approximate RAM bytes used */
+ public abstract long ramBytesUsed();
+
@Override
public abstract void close() throws IOException;
-
- /** Reads data for all terms in the next block; this
- * method should merely load the byte[] blob but not
- * decode, which is done in {@link #nextTerm}. */
- public abstract void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState termState) throws IOException;
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsReader.java Mon Oct 21 18:58:24 2013
@@ -40,4 +40,7 @@ public abstract class StoredFieldsReader
@Override
public abstract StoredFieldsReader clone();
+
+ /** Returns approximate RAM bytes used */
+ public abstract long ramBytesUsed();
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsReader.java Mon Oct 21 18:58:24 2013
@@ -42,6 +42,9 @@ public abstract class TermVectorsReader
* available from the {@link DocsAndPositionsEnum}. */
public abstract Fields get(int doc) throws IOException;
+ /** Returns approximate RAM bytes used */
+ public abstract long ramBytesUsed();
+
/** Create a clone that one caller at a time may use to
* read term vectors. */
@Override
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java Mon Oct 21 18:58:24 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.codecs;
import java.io.Closeable;
import java.io.IOException;
-import java.util.Comparator;
import java.util.Iterator;
import org.apache.lucene.index.AtomicReader;
@@ -293,10 +292,6 @@ public abstract class TermVectorsWriter
assert fieldCount == numFields;
finishDocument();
}
-
- /** Return the BytesRef Comparator used to sort terms
- * before feeding to this API. */
- public abstract Comparator<BytesRef> getComparator() throws IOException;
@Override
public abstract void close() throws IOException;
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexReader.java Mon Oct 21 18:58:24 2013
@@ -24,6 +24,7 @@ import org.apache.lucene.index.CorruptIn
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
/**
@@ -160,5 +161,23 @@ public final class CompressingStoredFiel
public CompressingStoredFieldsIndexReader clone() {
return this;
}
+
+ long ramBytesUsed() {
+ long res = 0;
+
+ for(PackedInts.Reader r : docBasesDeltas) {
+ res += r.ramBytesUsed();
+ }
+ for(PackedInts.Reader r : startPointersDeltas) {
+ res += r.ramBytesUsed();
+ }
+
+ res += RamUsageEstimator.sizeOf(docBases);
+ res += RamUsageEstimator.sizeOf(startPointers);
+ res += RamUsageEstimator.sizeOf(avgChunkDocs);
+ res += RamUsageEstimator.sizeOf(avgChunkSizes);
+
+ return res;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java Mon Oct 21 18:58:24 2013
@@ -27,11 +27,13 @@ import static org.apache.lucene.codecs.c
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.STRING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK;
+import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START;
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_EXTENSION;
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION;
+import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;
@@ -45,6 +47,7 @@ import org.apache.lucene.index.SegmentIn
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@@ -63,9 +66,23 @@ public final class CompressingStoredFiel
// Do not reuse the decompression buffer when there is more than 32kb to decompress
private static final int BUFFER_REUSE_THRESHOLD = 1 << 15;
+ private static final byte[] SKIP_BUFFER = new byte[1024];
+
+ // TODO: should this be a method on DataInput?
+ private static void skipBytes(DataInput in, long numBytes) throws IOException {
+ assert numBytes >= 0;
+ for (long skipped = 0; skipped < numBytes; ) {
+ final int toRead = (int) Math.min(numBytes - skipped, SKIP_BUFFER.length);
+ in.readBytes(SKIP_BUFFER, 0, toRead);
+ skipped += toRead;
+ }
+ }
+
+ private final int version;
private final FieldInfos fieldInfos;
private final CompressingStoredFieldsIndexReader indexReader;
private final IndexInput fieldsStream;
+ private final int chunkSize;
private final int packedIntsVersion;
private final CompressionMode compressionMode;
private final Decompressor decompressor;
@@ -75,9 +92,11 @@ public final class CompressingStoredFiel
// used by clone
private CompressingStoredFieldsReader(CompressingStoredFieldsReader reader) {
+ this.version = reader.version;
this.fieldInfos = reader.fieldInfos;
this.fieldsStream = reader.fieldsStream.clone();
this.indexReader = reader.indexReader.clone();
+ this.chunkSize = reader.chunkSize;
this.packedIntsVersion = reader.packedIntsVersion;
this.compressionMode = reader.compressionMode;
this.decompressor = reader.decompressor.clone();
@@ -100,7 +119,7 @@ public final class CompressingStoredFiel
final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION);
indexStream = d.openInput(indexStreamFN, context);
final String codecNameIdx = formatName + CODEC_SFX_IDX;
- CodecUtil.checkHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT);
+ version = CodecUtil.checkHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT);
assert CodecUtil.headerLength(codecNameIdx) == indexStream.getFilePointer();
indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);
indexStream.close();
@@ -110,9 +129,17 @@ public final class CompressingStoredFiel
final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
fieldsStream = d.openInput(fieldsStreamFN, context);
final String codecNameDat = formatName + CODEC_SFX_DAT;
- CodecUtil.checkHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT);
+ final int fieldsVersion = CodecUtil.checkHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT);
+ if (version != fieldsVersion) {
+ throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + fieldsVersion);
+ }
assert CodecUtil.headerLength(codecNameDat) == fieldsStream.getFilePointer();
+ if (version >= VERSION_BIG_CHUNKS) {
+ chunkSize = fieldsStream.readVInt();
+ } else {
+ chunkSize = -1;
+ }
packedIntsVersion = fieldsStream.readVInt();
decompressor = compressionMode.newDecompressor();
this.bytes = new BytesRef();
@@ -145,7 +172,7 @@ public final class CompressingStoredFiel
}
}
- private static void readField(ByteArrayDataInput in, StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException {
+ private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException {
switch (bits & TYPE_MASK) {
case BYTE_ARR:
int length = in.readVInt();
@@ -176,12 +203,12 @@ public final class CompressingStoredFiel
}
}
- private static void skipField(ByteArrayDataInput in, int bits) throws IOException {
+ private static void skipField(DataInput in, int bits) throws IOException {
switch (bits & TYPE_MASK) {
case BYTE_ARR:
case STRING:
final int length = in.readVInt();
- in.skipBytes(length);
+ skipBytes(in, length);
break;
case NUMERIC_INT:
case NUMERIC_FLOAT:
@@ -261,11 +288,56 @@ public final class CompressingStoredFiel
return;
}
- final BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? this.bytes : new BytesRef();
- decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
- assert bytes.length == length;
+ final DataInput documentInput;
+ if (version >= VERSION_BIG_CHUNKS && totalLength >= 2 * chunkSize) {
+ assert chunkSize > 0;
+ assert offset < chunkSize;
+
+ decompressor.decompress(fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes);
+ documentInput = new DataInput() {
+
+ int decompressed = bytes.length;
+
+ void fillBuffer() throws IOException {
+ assert decompressed <= length;
+ if (decompressed == length) {
+ throw new EOFException();
+ }
+ final int toDecompress = Math.min(length - decompressed, chunkSize);
+ decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes);
+ decompressed += toDecompress;
+ }
+
+ @Override
+ public byte readByte() throws IOException {
+ if (bytes.length == 0) {
+ fillBuffer();
+ }
+ --bytes.length;
+ return bytes.bytes[bytes.offset++];
+ }
+
+ @Override
+ public void readBytes(byte[] b, int offset, int len) throws IOException {
+ while (len > bytes.length) {
+ System.arraycopy(bytes.bytes, bytes.offset, b, offset, bytes.length);
+ len -= bytes.length;
+ offset += bytes.length;
+ fillBuffer();
+ }
+ System.arraycopy(bytes.bytes, bytes.offset, b, offset, len);
+ bytes.offset += len;
+ bytes.length -= len;
+ }
+
+ };
+ } else {
+ final BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? this.bytes : new BytesRef();
+ decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
+ assert bytes.length == length;
+ documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
+ }
- final ByteArrayDataInput documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) {
final long infoAndBits = documentInput.readVLong();
final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
@@ -277,17 +349,14 @@ public final class CompressingStoredFiel
switch(visitor.needsField(fieldInfo)) {
case YES:
readField(documentInput, visitor, fieldInfo, bits);
- assert documentInput.getPosition() <= bytes.offset + bytes.length : documentInput.getPosition() + " " + bytes.offset + bytes.length;
break;
case NO:
skipField(documentInput, bits);
- assert documentInput.getPosition() <= bytes.offset + bytes.length : documentInput.getPosition() + " " + bytes.offset + bytes.length;
break;
case STOP:
return;
}
}
- assert documentInput.getPosition() == bytes.offset + bytes.length : documentInput.getPosition() + " " + bytes.offset + " " + bytes.length;
}
@Override
@@ -296,6 +365,10 @@ public final class CompressingStoredFiel
return new CompressingStoredFieldsReader(this);
}
+ int getVersion() {
+ return version;
+ }
+
CompressionMode getCompressionMode() {
return compressionMode;
}
@@ -308,6 +381,7 @@ public final class CompressingStoredFiel
final class ChunkIterator {
+ BytesRef spare;
BytesRef bytes;
int docBase;
int chunkDocs;
@@ -317,6 +391,7 @@ public final class CompressingStoredFiel
private ChunkIterator() {
this.docBase = -1;
bytes = new BytesRef();
+ spare = new BytesRef();
numStoredFields = new int[1];
lengths = new int[1];
}
@@ -392,7 +467,19 @@ public final class CompressingStoredFiel
void decompress() throws IOException {
// decompress data
final int chunkSize = chunkSize();
- decompressor.decompress(fieldsStream, chunkSize, 0, chunkSize, bytes);
+ if (version >= VERSION_BIG_CHUNKS && chunkSize >= 2 * CompressingStoredFieldsReader.this.chunkSize) {
+ bytes.offset = bytes.length = 0;
+ for (int decompressed = 0; decompressed < chunkSize; ) {
+ final int toDecompress = Math.min(chunkSize - decompressed, CompressingStoredFieldsReader.this.chunkSize);
+ decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, spare);
+ bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + spare.length);
+ System.arraycopy(spare.bytes, spare.offset, bytes.bytes, bytes.length, spare.length);
+ bytes.length += spare.length;
+ decompressed += toDecompress;
+ }
+ } else {
+ decompressor.decompress(fieldsStream, chunkSize, 0, chunkSize, bytes);
+ }
if (bytes.length != chunkSize) {
throw new CorruptIndexException("Corrupted: expected chunk size = " + chunkSize() + ", got " + bytes.length + " (resource=" + fieldsStream + ")");
}
@@ -410,4 +497,9 @@ public final class CompressingStoredFiel
}
+ @Override
+ public long ramBytesUsed() {
+ return indexReader.ramBytesUsed();
+ }
+
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Mon Oct 21 18:58:24 2013
@@ -70,7 +70,8 @@ public final class CompressingStoredFiel
static final String CODEC_SFX_IDX = "Index";
static final String CODEC_SFX_DAT = "Data";
static final int VERSION_START = 0;
- static final int VERSION_CURRENT = VERSION_START;
+ static final int VERSION_BIG_CHUNKS = 1;
+ static final int VERSION_CURRENT = VERSION_BIG_CHUNKS;
private final Directory directory;
private final String segment;
@@ -119,6 +120,7 @@ public final class CompressingStoredFiel
indexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
indexStream = null;
+ fieldsStream.writeVInt(chunkSize);
fieldsStream.writeVInt(PackedInts.VERSION_CURRENT);
success = true;
@@ -219,7 +221,14 @@ public final class CompressingStoredFiel
writeHeader(docBase, numBufferedDocs, numStoredFields, lengths);
// compress stored fields to fieldsStream
- compressor.compress(bufferedDocs.bytes, 0, bufferedDocs.length, fieldsStream);
+ if (bufferedDocs.length >= 2 * chunkSize) {
+ // big chunk, slice it
+ for (int compressed = 0; compressed < bufferedDocs.length; compressed += chunkSize) {
+ compressor.compress(bufferedDocs.bytes, compressed, Math.min(chunkSize, bufferedDocs.length - compressed), fieldsStream);
+ }
+ } else {
+ compressor.compress(bufferedDocs.bytes, 0, bufferedDocs.length, fieldsStream);
+ }
// reset
docBase += numBufferedDocs;
@@ -327,7 +336,8 @@ public final class CompressingStoredFiel
final int maxDoc = reader.maxDoc();
final Bits liveDocs = reader.getLiveDocs();
- if (matchingFieldsReader == null) {
+ if (matchingFieldsReader == null
+ || matchingFieldsReader.getVersion() != VERSION_CURRENT) { // means reader version is not the same as the writer version
// naive merge...
for (int i = nextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; i = nextLiveDoc(i + 1, liveDocs, maxDoc)) {
StoredDocument doc = reader.document(i);
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java Mon Oct 21 18:58:24 2013
@@ -31,7 +31,6 @@ import static org.apache.lucene.codecs.c
import java.io.Closeable;
import java.io.IOException;
-import java.util.Comparator;
import java.util.Iterator;
import java.util.NoSuchElementException;
@@ -722,11 +721,6 @@ public final class CompressingTermVector
}
@Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
public long size() throws IOException {
return numTerms;
}
@@ -747,6 +741,11 @@ public final class CompressingTermVector
}
@Override
+ public boolean hasFreqs() {
+ return true;
+ }
+
+ @Override
public boolean hasOffsets() {
return (flags & OFFSETS) != 0;
}
@@ -819,11 +818,6 @@ public final class CompressingTermVector
}
@Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
public SeekStatus seekCeil(BytesRef text)
throws IOException {
if (ord < numTerms && ord >= 0) {
@@ -1041,4 +1035,9 @@ public final class CompressingTermVector
return sum;
}
+ @Override
+ public long ramBytesUsed() {
+ return indexReader.ramBytesUsed();
+ }
+
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java Mon Oct 21 18:58:24 2013
@@ -20,7 +20,6 @@ package org.apache.lucene.codecs.compres
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Arrays;
-import java.util.Comparator;
import java.util.Deque;
import java.util.Iterator;
import java.util.SortedSet;
@@ -663,11 +662,6 @@ public final class CompressingTermVector
}
@Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
public void addProx(int numProx, DataInput positions, DataInput offsets)
throws IOException {
assert (curField.hasPositions) == (positions != null);
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java Mon Oct 21 18:58:24 2013
@@ -219,7 +219,7 @@ final class LZ4 {
final PackedInts.Mutable hashTable = ht.hashTable;
main:
- while (off < limit) {
+ while (off <= limit) {
// find a match
int ref;
while (true) {
@@ -295,22 +295,23 @@ final class LZ4 {
private int hashPointer(byte[] bytes, int off) {
final int v = readInt(bytes, off);
final int h = hashHC(v);
- return base + hashTable[h];
+ return hashTable[h];
}
private int next(int off) {
- return base + off - (chainTable[off & MASK] & 0xFFFF);
+ return off - (chainTable[off & MASK] & 0xFFFF);
}
private void addHash(byte[] bytes, int off) {
final int v = readInt(bytes, off);
final int h = hashHC(v);
int delta = off - hashTable[h];
+ assert delta > 0 : delta;
if (delta >= MAX_DISTANCE) {
delta = MAX_DISTANCE - 1;
}
chainTable[off & MASK] = (short) delta;
- hashTable[h] = off - base;
+ hashTable[h] = off;
}
void insert(int off, byte[] bytes) {
@@ -322,12 +323,24 @@ final class LZ4 {
boolean insertAndFindBestMatch(byte[] buf, int off, int matchLimit, Match match) {
match.start = off;
match.len = 0;
+ int delta = 0;
+ int repl = 0;
insert(off, buf);
int ref = hashPointer(buf, off);
+
+ if (ref >= off - 4 && ref <= off && ref >= base) { // potential repetition
+ if (readIntEquals(buf, ref, off)) { // confirmed
+ delta = off - ref;
+ repl = match.len = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
+ match.ref = ref;
+ }
+ ref = next(ref);
+ }
+
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
- if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
+ if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
break;
}
if (buf[ref + match.len] == buf[off + match.len] && readIntEquals(buf, ref, off)) {
@@ -340,6 +353,21 @@ final class LZ4 {
ref = next(ref);
}
+ if (repl != 0) {
+ int ptr = off;
+ final int end = off + repl - (MIN_MATCH - 1);
+ while (ptr < end - delta) {
+ chainTable[ptr & MASK] = (short) delta; // pre load
+ ++ptr;
+ }
+ do {
+ chainTable[ptr & MASK] = (short) delta;
+ hashTable[hashHC(readInt(buf, ptr))] = ptr;
+ ++ptr;
+ } while (ptr < end);
+ nextToUpdate = end;
+ }
+
return match.len != 0;
}
@@ -351,7 +379,7 @@ final class LZ4 {
final int delta = off - startLimit;
int ref = hashPointer(buf, off);
for (int i = 0; i < MAX_ATTEMPTS; ++i) {
- if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
+ if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
break;
}
if (buf[ref - delta + match.len] == buf[startLimit + match.len]
@@ -386,6 +414,7 @@ final class LZ4 {
final int srcEnd = srcOff + srcLen;
final int matchLimit = srcEnd - LAST_LITERALS;
+ final int mfLimit = matchLimit - MIN_MATCH;
int sOff = srcOff;
int anchor = sOff++;
@@ -397,7 +426,7 @@ final class LZ4 {
final Match match3 = new Match();
main:
- while (sOff < matchLimit) {
+ while (sOff <= mfLimit) {
if (!ht.insertAndFindBestMatch(src, sOff, matchLimit, match1)) {
++sOff;
continue;
@@ -409,7 +438,7 @@ final class LZ4 {
search2:
while (true) {
assert match1.start >= anchor;
- if (match1.end() >= matchLimit
+ if (match1.end() >= mfLimit
|| !ht.insertAndFindWiderMatch(src, match1.end() - 2, match1.start + 1, matchLimit, match1.len, match2)) {
// no better match
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
@@ -445,24 +474,11 @@ final class LZ4 {
}
}
- if (match2.start + match2.len >= matchLimit
+ if (match2.start + match2.len >= mfLimit
|| !ht.insertAndFindWiderMatch(src, match2.end() - 3, match2.start, matchLimit, match2.len, match3)) {
// no better match -> 2 sequences to encode
if (match2.start < match1.end()) {
- if (match2.start - match1.start < OPTIMAL_ML) {
- if (match1.len > OPTIMAL_ML) {
- match1.len = OPTIMAL_ML;
- }
- if (match1.end() > match2.end() - MIN_MATCH) {
- match1.len = match2.end() - match1.start - MIN_MATCH;
- }
- final int correction = match1.len - (match2.start - match1.start);
- if (correction > 0) {
- match2.fix(correction);
- }
- } else {
- match1.len = match2.start - match1.start;
- }
+ match1.len = match2.start - match1.start;
}
// encode seq 1
encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java Mon Oct 21 18:58:24 2013
@@ -27,7 +27,6 @@ import org.apache.lucene.codecs.DocValue
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
-import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/**
@@ -83,7 +82,7 @@ public class Lucene40Codec extends Codec
}
@Override
- public final SegmentInfoFormat segmentInfoFormat() {
+ public SegmentInfoFormat segmentInfoFormat() {
return infosFormat;
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java Mon Oct 21 18:58:24 2013
@@ -118,6 +118,11 @@ import org.apache.lucene.util.packed.Pac
* {@code BYTES_VAR_DEREF BYTES_VAR_DEREF} it doesn't apply deduplication of the document values.
* </li>
* </ul>
+ * <p>
+ * Limitations:
+ * <ul>
+ * <li> Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length.
+ * </ul>
* @deprecated Only for reading old 4.0 and 4.1 segments
*/
@Deprecated
@@ -125,6 +130,9 @@ import org.apache.lucene.util.packed.Pac
// for back compat only!
public class Lucene40DocValuesFormat extends DocValuesFormat {
+ /** Maximum length for each binary doc values field. */
+ public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;
+
/** Sole constructor. */
public Lucene40DocValuesFormat() {
super("Lucene40");
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java Mon Oct 21 18:58:24 2013
@@ -35,9 +35,11 @@ import org.apache.lucene.index.SortedSet
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
/**
@@ -621,7 +623,17 @@ final class Lucene40DocValuesReader exte
}
@Override
+ public Bits getDocsWithField(FieldInfo field) throws IOException {
+ return new Bits.MatchAllBits(state.segmentInfo.getDocCount());
+ }
+
+ @Override
public void close() throws IOException {
dir.close();
}
+
+ @Override
+ public long ramBytesUsed() {
+ return RamUsageEstimator.sizeOf(this);
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40FieldInfosReader.java Mon Oct 21 18:58:24 2013
@@ -49,7 +49,7 @@ class Lucene40FieldInfosReader extends F
}
@Override
- public FieldInfos read(Directory directory, String segmentName, IOContext iocontext) throws IOException {
+ public FieldInfos read(Directory directory, String segmentName, String segmentSuffix, IOContext iocontext) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, "", Lucene40FieldInfosFormat.FIELD_INFOS_EXTENSION);
IndexInput input = directory.openInput(fileName, iocontext);
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java Mon Oct 21 18:58:24 2013
@@ -26,7 +26,7 @@ import org.apache.lucene.codecs.FieldsCo
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
-import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.PostingsWriterBase; // javadocs
import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
import org.apache.lucene.index.FieldInfos; // javadocs
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java Mon Oct 21 18:58:24 2013
@@ -32,6 +32,7 @@ import org.apache.lucene.index.IndexFile
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
@@ -121,11 +122,6 @@ public class Lucene40PostingsReader exte
long proxOffset;
long skipOffset;
- // Only used by the "primary" TermState -- clones don't
- // copy this (basically they are "transient"):
- ByteArrayDataInput bytesReader; // TODO: should this NOT be in the TermState...?
- byte[] bytes;
-
@Override
public StandardTermState clone() {
StandardTermState other = new StandardTermState();
@@ -140,11 +136,6 @@ public class Lucene40PostingsReader exte
freqOffset = other.freqOffset;
proxOffset = other.proxOffset;
skipOffset = other.skipOffset;
-
- // Do not copy bytes, bytesReader (else TermState is
- // very heavy, ie drags around the entire block's
- // byte[]). On seek back, if next() is in fact used
- // (rare!), they will be re-read from disk.
}
@Override
@@ -171,38 +162,18 @@ public class Lucene40PostingsReader exte
}
}
- /* Reads but does not decode the byte[] blob holding
- metadata for the current terms block */
- @Override
- public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
- final StandardTermState termState = (StandardTermState) _termState;
-
- final int len = termsIn.readVInt();
-
- // if (DEBUG) System.out.println(" SPR.readTermsBlock bytes=" + len + " ts=" + _termState);
- if (termState.bytes == null) {
- termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
- termState.bytesReader = new ByteArrayDataInput();
- } else if (termState.bytes.length < len) {
- termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
- }
-
- termsIn.readBytes(termState.bytes, 0, len);
- termState.bytesReader.reset(termState.bytes, 0, len);
- }
-
@Override
- public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState)
+ public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute)
throws IOException {
final StandardTermState termState = (StandardTermState) _termState;
// if (DEBUG) System.out.println("SPR: nextTerm seg=" + segment + " tbOrd=" + termState.termBlockOrd + " bytesReader.fp=" + termState.bytesReader.getPosition());
final boolean isFirstTerm = termState.termBlockOrd == 0;
-
- if (isFirstTerm) {
- termState.freqOffset = termState.bytesReader.readVLong();
- } else {
- termState.freqOffset += termState.bytesReader.readVLong();
+ if (absolute) {
+ termState.freqOffset = 0;
+ termState.proxOffset = 0;
}
+
+ termState.freqOffset += in.readVLong();
/*
if (DEBUG) {
System.out.println(" dF=" + termState.docFreq);
@@ -212,7 +183,7 @@ public class Lucene40PostingsReader exte
assert termState.freqOffset < freqIn.length();
if (termState.docFreq >= skipMinimum) {
- termState.skipOffset = termState.bytesReader.readVLong();
+ termState.skipOffset = in.readVLong();
// if (DEBUG) System.out.println(" skipOffset=" + termState.skipOffset + " vs freqIn.length=" + freqIn.length());
assert termState.freqOffset + termState.skipOffset < freqIn.length();
} else {
@@ -220,11 +191,7 @@ public class Lucene40PostingsReader exte
}
if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
- if (isFirstTerm) {
- termState.proxOffset = termState.bytesReader.readVLong();
- } else {
- termState.proxOffset += termState.bytesReader.readVLong();
- }
+ termState.proxOffset += in.readVLong();
// if (DEBUG) System.out.println(" proxFP=" + termState.proxOffset);
}
}
@@ -1195,4 +1162,10 @@ public class Lucene40PostingsReader exte
return limit;
}
}
+
+ @Override
+ public long ramBytesUsed() {
+ return 0;
+ }
+
}
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoFormat.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoFormat.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoFormat.java Mon Oct 21 18:58:24 2013
@@ -67,10 +67,11 @@ import org.apache.lucene.store.DataOutpu
*
* @see SegmentInfos
* @lucene.experimental
+ * @deprecated Only for reading old 4.0-4.5 segments
*/
+@Deprecated
public class Lucene40SegmentInfoFormat extends SegmentInfoFormat {
private final SegmentInfoReader reader = new Lucene40SegmentInfoReader();
- private final SegmentInfoWriter writer = new Lucene40SegmentInfoWriter();
/** Sole constructor. */
public Lucene40SegmentInfoFormat() {
@@ -83,7 +84,7 @@ public class Lucene40SegmentInfoFormat e
@Override
public SegmentInfoWriter getSegmentInfoWriter() {
- return writer;
+ throw new UnsupportedOperationException("this codec can only be used for reading");
}
/** File extension used to store {@link SegmentInfo}. */
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfoReader.java Mon Oct 21 18:58:24 2013
@@ -18,7 +18,6 @@ package org.apache.lucene.codecs.lucene4
*/
import java.io.IOException;
-import java.util.Collections;
import java.util.Map;
import java.util.Set;
@@ -37,7 +36,9 @@ import org.apache.lucene.util.IOUtils;
*
* @see Lucene40SegmentInfoFormat
* @lucene.experimental
+ * @deprecated Only for reading old 4.0-4.5 segments
*/
+@Deprecated
public class Lucene40SegmentInfoReader extends SegmentInfoReader {
/** Sole constructor. */
@@ -60,15 +61,14 @@ public class Lucene40SegmentInfoReader e
}
final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
final Map<String,String> diagnostics = input.readStringStringMap();
- final Map<String,String> attributes = input.readStringStringMap();
+ input.readStringStringMap(); // read deprecated attributes
final Set<String> files = input.readStringSet();
if (input.getFilePointer() != input.length()) {
throw new CorruptIndexException("did not read all bytes from file \"" + fileName + "\": read " + input.getFilePointer() + " vs size " + input.length() + " (resource: " + input + ")");
}
- final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile,
- null, diagnostics, Collections.unmodifiableMap(attributes));
+ final SegmentInfo si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics);
si.setFiles(files);
success = true;
Modified: lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java Mon Oct 21 18:58:24 2013
@@ -244,4 +244,9 @@ public final class Lucene40StoredFieldsR
return fieldsStream;
}
+
+ @Override
+ public long ramBytesUsed() {
+ return 0;
+ }
}