You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/08/10 02:37:42 UTC
svn commit: r1512543 - in /lucene/dev/trunk/lucene: ./
codecs/src/java/org/apache/lucene/codecs/diskdv/
core/src/java/org/apache/lucene/util/packed/
test-framework/src/java/org/apache/lucene/codecs/cheapbastard/
test-framework/src/java/org/apache/lucene/index/
Author: rmuir
Date: Sat Aug 10 00:37:41 2013
New Revision: 1512543
URL: http://svn.apache.org/r1512543
Log:
LUCENE-5159: prefix-code the sorted/sortedset value dictionaries in DiskDV
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1512543&r1=1512542&r2=1512543&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sat Aug 10 00:37:41 2013
@@ -168,6 +168,9 @@ Optimizations
* LUCENE-5150: Make WAH8DocIdSet able to inverse its encoding in order to
compress dense sets efficiently as well. (Adrien Grand)
+* LUCENE-5159: Prefix-code the sorted/sortedset value dictionaries in DiskDV.
+ (Robert Muir)
+
Documentation
* LUCENE-4894: remove facet userguide as it was outdated. Partially absorbed into
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java?rev=1512543&r1=1512542&r2=1512543&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java Sat Aug 10 00:37:41 2013
@@ -27,9 +27,11 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.MathUtil;
+import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
@@ -38,6 +40,7 @@ import org.apache.lucene.util.packed.Pac
public class DiskDocValuesConsumer extends DocValuesConsumer {
static final int BLOCK_SIZE = 16384;
+ static final int ADDRESS_INTERVAL = 16;
/** Compressed using packed blocks of ints. */
public static final int DELTA_COMPRESSED = 0;
@@ -45,6 +48,13 @@ public class DiskDocValuesConsumer exten
public static final int GCD_COMPRESSED = 1;
/** Compressed by giving IDs to unique values. */
public static final int TABLE_COMPRESSED = 2;
+
+ /** Uncompressed binary, written directly (fixed length). */
+ public static final int BINARY_FIXED_UNCOMPRESSED = 0;
+ /** Uncompressed binary, written directly (variable length). */
+ public static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
+ /** Compressed binary with shared prefixes */
+ public static final int BINARY_PREFIX_COMPRESSED = 2;
final IndexOutput data, meta;
final int maxDoc;
@@ -173,7 +183,7 @@ public class DiskDocValuesConsumer exten
}
@Override
- public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
+ public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
// write the byte[] data
meta.writeVInt(field.number);
meta.writeByte(DiskDocValuesFormat.BINARY);
@@ -187,6 +197,7 @@ public class DiskDocValuesConsumer exten
data.writeBytes(v.bytes, v.offset, v.length);
count++;
}
+ meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
meta.writeVInt(minLength);
meta.writeVInt(maxLength);
meta.writeVLong(count);
@@ -208,12 +219,68 @@ public class DiskDocValuesConsumer exten
writer.finish();
}
}
+
+ protected void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
+ // first check if its a "fixed-length" terms dict
+ int minLength = Integer.MAX_VALUE;
+ int maxLength = Integer.MIN_VALUE;
+ for (BytesRef v : values) {
+ minLength = Math.min(minLength, v.length);
+ maxLength = Math.max(maxLength, v.length);
+ }
+ if (minLength == maxLength) {
+ // no index needed: direct addressing by mult
+ addBinaryField(field, values);
+ } else {
+ // header
+ meta.writeVInt(field.number);
+ meta.writeByte(DiskDocValuesFormat.BINARY);
+ meta.writeVInt(BINARY_PREFIX_COMPRESSED);
+ // now write the bytes: sharing prefixes within a block
+ final long startFP = data.getFilePointer();
+ // currently, we have to store the delta from expected for every 1/nth term
+ // we could avoid this, but its not much and less overall RAM than the previous approach!
+ RAMOutputStream addressBuffer = new RAMOutputStream();
+ MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);
+ BytesRef lastTerm = new BytesRef();
+ long count = 0;
+ for (BytesRef v : values) {
+ if (count % ADDRESS_INTERVAL == 0) {
+ termAddresses.add(data.getFilePointer() - startFP);
+ // force the first term in a block to be abs-encoded
+ lastTerm.length = 0;
+ }
+
+ // prefix-code
+ int sharedPrefix = StringHelper.bytesDifference(lastTerm, v);
+ data.writeVInt(sharedPrefix);
+ data.writeVInt(v.length - sharedPrefix);
+ data.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
+ lastTerm.copyBytes(v);
+ count++;
+ }
+ final long indexStartFP = data.getFilePointer();
+ // write addresses of indexed terms
+ termAddresses.finish();
+ addressBuffer.writeTo(data);
+ addressBuffer = null;
+ termAddresses = null;
+ meta.writeVInt(minLength);
+ meta.writeVInt(maxLength);
+ meta.writeVLong(count);
+ meta.writeLong(startFP);
+ meta.writeVInt(ADDRESS_INTERVAL);
+ meta.writeLong(indexStartFP);
+ meta.writeVInt(PackedInts.VERSION_CURRENT);
+ meta.writeVInt(BLOCK_SIZE);
+ }
+ }
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
meta.writeVInt(field.number);
meta.writeByte(DiskDocValuesFormat.SORTED);
- addBinaryField(field, values);
+ addTermsDict(field, values);
addNumericField(field, docToOrd, false);
}
@@ -222,7 +289,7 @@ public class DiskDocValuesConsumer exten
meta.writeVInt(field.number);
meta.writeByte(DiskDocValuesFormat.SORTED_SET);
// write the ord -> byte[] as a binary field
- addBinaryField(field, values);
+ addTermsDict(field, values);
// write the stream of ords as a numeric field
// NOTE: we could return an iterator that delta-encodes these within a doc
addNumericField(field, ords, false);
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java?rev=1512543&r1=1512542&r2=1512543&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java Sat Aug 10 00:37:41 2013
@@ -53,7 +53,8 @@ public final class DiskDocValuesFormat e
public static final String META_CODEC = "DiskDocValuesMetadata";
public static final String META_EXTENSION = "dvdm";
public static final int VERSION_START = 0;
- public static final int VERSION_CURRENT = VERSION_START;
+ public static final int VERSION_COMPRESSED_TERMS = 1;
+ public static final int VERSION_CURRENT = VERSION_COMPRESSED_TERMS;
public static final byte NUMERIC = 0;
public static final byte BINARY = 1;
public static final byte SORTED = 2;
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java?rev=1512543&r1=1512542&r2=1512543&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java Sat Aug 10 00:37:41 2013
@@ -21,7 +21,12 @@ import static org.apache.lucene.codecs.d
import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.GCD_COMPRESSED;
import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.TABLE_COMPRESSED;
+import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED;
+import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED;
+import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_PREFIX_COMPRESSED;
+
import java.io.IOException;
+import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
@@ -29,6 +34,8 @@ import org.apache.lucene.codecs.CodecUti
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
@@ -36,7 +43,10 @@ import org.apache.lucene.index.NumericDo
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.BlockPackedReader;
@@ -62,7 +72,7 @@ class DiskDocValuesProducer extends DocV
final int version;
try {
version = CodecUtil.checkHeader(in, metaCodec,
- DiskDocValuesFormat.VERSION_START,
+ DiskDocValuesFormat.VERSION_CURRENT,
DiskDocValuesFormat.VERSION_CURRENT);
numerics = new HashMap<Integer,NumericEntry>();
ords = new HashMap<Integer,NumericEntry>();
@@ -84,7 +94,7 @@ class DiskDocValuesProducer extends DocV
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.openInput(dataName, state.context);
final int version2 = CodecUtil.checkHeader(data, dataCodec,
- DiskDocValuesFormat.VERSION_START,
+ DiskDocValuesFormat.VERSION_CURRENT,
DiskDocValuesFormat.VERSION_CURRENT);
if (version != version2) {
throw new CorruptIndexException("Format versions mismatch");
@@ -196,14 +206,27 @@ class DiskDocValuesProducer extends DocV
static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
BinaryEntry entry = new BinaryEntry();
+ entry.format = meta.readVInt();
entry.minLength = meta.readVInt();
entry.maxLength = meta.readVInt();
entry.count = meta.readVLong();
entry.offset = meta.readLong();
- if (entry.minLength != entry.maxLength) {
- entry.addressesOffset = meta.readLong();
- entry.packedIntsVersion = meta.readVInt();
- entry.blockSize = meta.readVInt();
+ switch(entry.format) {
+ case BINARY_FIXED_UNCOMPRESSED:
+ break;
+ case BINARY_PREFIX_COMPRESSED:
+ entry.addressInterval = meta.readVInt();
+ entry.addressesOffset = meta.readLong();
+ entry.packedIntsVersion = meta.readVInt();
+ entry.blockSize = meta.readVInt();
+ break;
+ case BINARY_VARIABLE_UNCOMPRESSED:
+ entry.addressesOffset = meta.readLong();
+ entry.packedIntsVersion = meta.readVInt();
+ entry.blockSize = meta.readVInt();
+ break;
+ default:
+ throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
}
return entry;
}
@@ -255,10 +278,15 @@ class DiskDocValuesProducer extends DocV
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
BinaryEntry bytes = binaries.get(field.number);
- if (bytes.minLength == bytes.maxLength) {
- return getFixedBinary(field, bytes);
- } else {
- return getVariableBinary(field, bytes);
+ switch(bytes.format) {
+ case BINARY_FIXED_UNCOMPRESSED:
+ return getFixedBinary(field, bytes);
+ case BINARY_VARIABLE_UNCOMPRESSED:
+ return getVariableBinary(field, bytes);
+ case BINARY_PREFIX_COMPRESSED:
+ return getCompressedBinary(field, bytes);
+ default:
+ throw new AssertionError();
}
}
@@ -321,6 +349,30 @@ class DiskDocValuesProducer extends DocV
};
}
+ private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
+ final IndexInput data = this.data.clone();
+ final long interval = bytes.addressInterval;
+
+ final MonotonicBlockPackedReader addresses;
+ synchronized (addressInstances) {
+ MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
+ if (addrInstance == null) {
+ data.seek(bytes.addressesOffset);
+ final long size;
+ if (bytes.count % interval == 0) {
+ size = bytes.count / interval;
+ } else {
+ size = 1L + bytes.count / interval;
+ }
+ addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false);
+ addressInstances.put(field.number, addrInstance);
+ }
+ addresses = addrInstance;
+ }
+
+ return new CompressedBinaryDocValues(bytes, addresses, data);
+ }
+
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
final int valueCount = (int) binaries.get(field.number).count;
@@ -346,6 +398,24 @@ class DiskDocValuesProducer extends DocV
public int getValueCount() {
return valueCount;
}
+
+ @Override
+ public int lookupTerm(BytesRef key) {
+ if (binary instanceof CompressedBinaryDocValues) {
+ return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key);
+ } else {
+ return super.lookupTerm(key);
+ }
+ }
+
+ @Override
+ public TermsEnum termsEnum() {
+ if (binary instanceof CompressedBinaryDocValues) {
+ return ((CompressedBinaryDocValues)binary).getTermsEnum();
+ } else {
+ return super.termsEnum();
+ }
+ }
};
}
@@ -399,6 +469,24 @@ class DiskDocValuesProducer extends DocV
public long getValueCount() {
return valueCount;
}
+
+ @Override
+ public long lookupTerm(BytesRef key) {
+ if (binary instanceof CompressedBinaryDocValues) {
+ return ((CompressedBinaryDocValues)binary).lookupTerm(key);
+ } else {
+ return super.lookupTerm(key);
+ }
+ }
+
+ @Override
+ public TermsEnum termsEnum() {
+ if (binary instanceof CompressedBinaryDocValues) {
+ return ((CompressedBinaryDocValues)binary).getTermsEnum();
+ } else {
+ return super.termsEnum();
+ }
+ }
};
}
@@ -423,10 +511,12 @@ class DiskDocValuesProducer extends DocV
static class BinaryEntry {
long offset;
+ int format;
long count;
int minLength;
int maxLength;
long addressesOffset;
+ long addressInterval;
int packedIntsVersion;
int blockSize;
}
@@ -449,4 +539,204 @@ class DiskDocValuesProducer extends DocV
abstract void get(long id, BytesRef Result);
}
+
+ // in the compressed case, we add a few additional operations for
+ // more efficient reverse lookup and enumeration
+ static class CompressedBinaryDocValues extends LongBinaryDocValues {
+ final BinaryEntry bytes;
+ final long interval;
+ final long numValues;
+ final long numIndexValues;
+ final MonotonicBlockPackedReader addresses;
+ final IndexInput data;
+ final TermsEnum termsEnum;
+
+ public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, IndexInput data) throws IOException {
+ this.bytes = bytes;
+ this.interval = bytes.addressInterval;
+ this.addresses = addresses;
+ this.data = data;
+ this.numValues = bytes.count;
+ this.numIndexValues = addresses.size();
+ this.termsEnum = getTermsEnum(data);
+ }
+
+ @Override
+ public void get(long id, BytesRef result) {
+ try {
+ termsEnum.seekExact(id);
+ BytesRef term = termsEnum.term();
+ result.bytes = term.bytes;
+ result.offset = term.offset;
+ result.length = term.length;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ long lookupTerm(BytesRef key) {
+ try {
+ SeekStatus status = termsEnum.seekCeil(key);
+ if (status == SeekStatus.END) {
+ return -numValues-1;
+ } else if (status == SeekStatus.FOUND) {
+ return termsEnum.ord();
+ } else {
+ return -termsEnum.ord()-1;
+ }
+ } catch (IOException bogus) {
+ throw new RuntimeException(bogus);
+ }
+ }
+
+ TermsEnum getTermsEnum() {
+ try {
+ return getTermsEnum(data.clone());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private TermsEnum getTermsEnum(final IndexInput input) throws IOException {
+ input.seek(bytes.offset);
+
+ return new TermsEnum() {
+ private long currentOrd = -1;
+ // TODO: maxLength is negative when all terms are merged away...
+ private final BytesRef termBuffer = new BytesRef(bytes.maxLength < 0 ? 0 : bytes.maxLength);
+ private final BytesRef term = new BytesRef(); // TODO: paranoia?
+
+ @Override
+ public BytesRef next() throws IOException {
+ if (doNext() == null) {
+ return null;
+ } else {
+ setTerm();
+ return term;
+ }
+ }
+
+ private BytesRef doNext() throws IOException {
+ if (++currentOrd >= numValues) {
+ return null;
+ } else {
+ int start = input.readVInt();
+ int suffix = input.readVInt();
+ input.readBytes(termBuffer.bytes, start, suffix);
+ termBuffer.length = start + suffix;
+ return termBuffer;
+ }
+ }
+
+ @Override
+ public SeekStatus seekCeil(BytesRef text) throws IOException {
+ // binary-search just the index values to find the block,
+ // then scan within the block
+ long low = 0;
+ long high = numIndexValues-1;
+
+ while (low <= high) {
+ long mid = (low + high) >>> 1;
+ doSeek(mid * interval);
+ int cmp = termBuffer.compareTo(text);
+
+ if (cmp < 0) {
+ low = mid + 1;
+ } else if (cmp > 0) {
+ high = mid - 1;
+ } else {
+ // we got lucky, found an indexed term
+ setTerm();
+ return SeekStatus.FOUND;
+ }
+ }
+
+ if (numIndexValues == 0) {
+ return SeekStatus.END;
+ }
+
+ // block before insertion point
+ long block = low-1;
+ doSeek(block < 0 ? -1 : block * interval);
+
+ while (doNext() != null) {
+ int cmp = termBuffer.compareTo(text);
+ if (cmp == 0) {
+ setTerm();
+ return SeekStatus.FOUND;
+ } else if (cmp > 0) {
+ setTerm();
+ return SeekStatus.NOT_FOUND;
+ }
+ }
+
+ return SeekStatus.END;
+ }
+
+ @Override
+ public void seekExact(long ord) throws IOException {
+ doSeek(ord);
+ setTerm();
+ }
+
+ private void doSeek(long ord) throws IOException {
+ long block = ord / interval;
+
+ if (ord >= currentOrd && block == currentOrd / interval) {
+ // seek within current block
+ } else {
+ // position before start of block
+ currentOrd = ord - ord % interval - 1;
+ input.seek(bytes.offset + addresses.get(block));
+ }
+
+ while (currentOrd < ord) {
+ doNext();
+ }
+ }
+
+ private void setTerm() {
+ // TODO: is there a cleaner way
+ term.bytes = new byte[termBuffer.length];
+ term.offset = 0;
+ term.copyBytes(termBuffer);
+ }
+
+ @Override
+ public BytesRef term() throws IOException {
+ return term;
+ }
+
+ @Override
+ public long ord() throws IOException {
+ return currentOrd;
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
+ }
+
+ @Override
+ public int docFreq() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long totalTermFreq() throws IOException {
+ return -1;
+ }
+
+ @Override
+ public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+ }
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java?rev=1512543&r1=1512542&r2=1512543&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java Sat Aug 10 00:37:41 2013
@@ -78,5 +78,10 @@ public final class MonotonicBlockPackedR
final int idx = (int) (index & blockMask);
return minValues[block] + (long) (idx * averages[block]) + zigZagDecode(subReaders[block].get(idx));
}
+
+ /** Returns the number of values */
+ public long size() {
+ return valueCount;
+ }
}
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java?rev=1512543&r1=1512542&r2=1512543&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java Sat Aug 10 00:37:41 2013
@@ -24,8 +24,10 @@ import org.apache.lucene.codecs.DocValue
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer;
import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat;
+import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.BytesRef;
/**
* DocValues format that keeps everything on disk.
@@ -53,7 +55,13 @@ public final class CheapBastardDocValues
return new DiskDocValuesConsumer(state, DiskDocValuesFormat.DATA_CODEC,
DiskDocValuesFormat.DATA_EXTENSION,
DiskDocValuesFormat.META_CODEC,
- DiskDocValuesFormat.META_EXTENSION);
+ DiskDocValuesFormat.META_EXTENSION) {
+ // don't ever write an index, we dont want to use RAM :)
+ @Override
+ protected void addTermsDict(FieldInfo field, Iterable<BytesRef> values) throws IOException {
+ addBinaryField(field, values);
+ }
+ };
}
@Override
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java?rev=1512543&r1=1512542&r2=1512543&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java Sat Aug 10 00:37:41 2013
@@ -27,6 +27,7 @@ import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer;
import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
@@ -58,7 +59,7 @@ class CheapBastardDocValuesProducer exte
final int version;
try {
version = CodecUtil.checkHeader(in, metaCodec,
- DiskDocValuesFormat.VERSION_START,
+ DiskDocValuesFormat.VERSION_CURRENT,
DiskDocValuesFormat.VERSION_CURRENT);
numerics = new HashMap<Integer,NumericEntry>();
ords = new HashMap<Integer,NumericEntry>();
@@ -80,7 +81,7 @@ class CheapBastardDocValuesProducer exte
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.openInput(dataName, state.context);
final int version2 = CodecUtil.checkHeader(data, dataCodec,
- DiskDocValuesFormat.VERSION_START,
+ DiskDocValuesFormat.VERSION_CURRENT,
DiskDocValuesFormat.VERSION_CURRENT);
if (version != version2) {
throw new CorruptIndexException("Versions mismatch");
@@ -193,6 +194,10 @@ class CheapBastardDocValuesProducer exte
static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
BinaryEntry entry = new BinaryEntry();
+ int format = meta.readVInt();
+ if (format != DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) {
+ throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta);
+ }
entry.minLength = meta.readVInt();
entry.maxLength = meta.readVInt();
entry.count = meta.readVLong();
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java?rev=1512543&r1=1512542&r2=1512543&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java Sat Aug 10 00:37:41 2013
@@ -1350,6 +1350,57 @@ public abstract class BaseDocValuesForma
dir.close();
}
+ private void doTestSortedVsFieldCache(int minLength, int maxLength) throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+ Document doc = new Document();
+ Field idField = new StringField("id", "", Field.Store.NO);
+ Field indexedField = new StringField("indexed", "", Field.Store.NO);
+ Field dvField = new SortedDocValuesField("dv", new BytesRef());
+ doc.add(idField);
+ doc.add(indexedField);
+ doc.add(dvField);
+
+ // index some docs
+ int numDocs = atLeast(300);
+ for (int i = 0; i < numDocs; i++) {
+ idField.setStringValue(Integer.toString(i));
+ final int length;
+ if (minLength == maxLength) {
+ length = minLength; // fixed length
+ } else {
+ length = _TestUtil.nextInt(random(), minLength, maxLength);
+ }
+ String value = _TestUtil.randomSimpleString(random(), length);
+ indexedField.setStringValue(value);
+ dvField.setBytesValue(new BytesRef(value));
+ writer.addDocument(doc);
+ if (random().nextInt(31) == 0) {
+ writer.commit();
+ }
+ }
+
+ // delete some docs
+ int numDeletions = random().nextInt(numDocs/10);
+ for (int i = 0; i < numDeletions; i++) {
+ int id = random().nextInt(numDocs);
+ writer.deleteDocuments(new Term("id", Integer.toString(id)));
+ }
+ writer.close();
+
+ // compare
+ DirectoryReader ir = DirectoryReader.open(dir);
+ for (AtomicReaderContext context : ir.leaves()) {
+ AtomicReader r = context.reader();
+ SortedDocValues expected = FieldCache.DEFAULT.getTermsIndex(r, "indexed");
+ SortedDocValues actual = r.getSortedDocValues("dv");
+ assertEquals(r.maxDoc(), expected, actual);
+ }
+ ir.close();
+ dir.close();
+ }
+
public void testSortedFixedLengthVsStoredFields() throws Exception {
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
@@ -1358,6 +1409,21 @@ public abstract class BaseDocValuesForma
}
}
+ public void testSortedFixedLengthVsFieldCache() throws Exception {
+ int numIterations = atLeast(1);
+ for (int i = 0; i < numIterations; i++) {
+ int fixedLength = _TestUtil.nextInt(random(), 1, 10);
+ doTestSortedVsFieldCache(fixedLength, fixedLength);
+ }
+ }
+
+ public void testSortedVariableLengthVsFieldCache() throws Exception {
+ int numIterations = atLeast(1);
+ for (int i = 0; i < numIterations; i++) {
+ doTestSortedVsFieldCache(1, 10);
+ }
+ }
+
public void testSortedVariableLengthVsStoredFields() throws Exception {
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
@@ -1905,6 +1971,10 @@ public abstract class BaseDocValuesForma
}
}
+ private void assertEquals(int maxDoc, SortedDocValues expected, SortedDocValues actual) throws Exception {
+ assertEquals(maxDoc, new SingletonSortedSetDocValues(expected), new SingletonSortedSetDocValues(actual));
+ }
+
private void assertEquals(int maxDoc, SortedSetDocValues expected, SortedSetDocValues actual) throws Exception {
// can be null for the segment if no docs actually had any SortedDocValues
// in this case FC.getDocTermsOrds returns EMPTY
@@ -1932,6 +2002,74 @@ public abstract class BaseDocValuesForma
actual.lookupTerm(actualBytes);
assertEquals(expectedBytes, actualBytes);
}
+
+ // compare termsenum
+ assertEquals(expected.getValueCount(), expected.termsEnum(), actual.termsEnum());
+ }
+
+ private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception {
+ BytesRef ref;
+
+ // sequential next() through all terms
+ while ((ref = expected.next()) != null) {
+ assertEquals(ref, actual.next());
+ assertEquals(expected.ord(), actual.ord());
+ assertEquals(expected.term(), actual.term());
+ }
+ assertNull(actual.next());
+
+ // sequential seekExact(ord) through all terms
+ for (long i = 0; i < numOrds; i++) {
+ expected.seekExact(i);
+ actual.seekExact(i);
+ assertEquals(expected.ord(), actual.ord());
+ assertEquals(expected.term(), actual.term());
+ }
+
+ // sequential seekExact(BytesRef) through all terms
+ for (long i = 0; i < numOrds; i++) {
+ expected.seekExact(i);
+ assertTrue(actual.seekExact(expected.term()));
+ assertEquals(expected.ord(), actual.ord());
+ assertEquals(expected.term(), actual.term());
+ }
+
+ // sequential seekCeil(BytesRef) through all terms
+ for (long i = 0; i < numOrds; i++) {
+ expected.seekExact(i);
+ assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term()));
+ assertEquals(expected.ord(), actual.ord());
+ assertEquals(expected.term(), actual.term());
+ }
+
+ // random seekExact(ord)
+ for (long i = 0; i < numOrds; i++) {
+ long randomOrd = _TestUtil.nextLong(random(), 0, numOrds-1);
+ expected.seekExact(randomOrd);
+ actual.seekExact(randomOrd);
+ assertEquals(expected.ord(), actual.ord());
+ assertEquals(expected.term(), actual.term());
+ }
+
+ // random seekExact(BytesRef)
+ for (long i = 0; i < numOrds; i++) {
+ long randomOrd = _TestUtil.nextLong(random(), 0, numOrds-1);
+ expected.seekExact(randomOrd);
+ actual.seekExact(expected.term());
+ assertEquals(expected.ord(), actual.ord());
+ assertEquals(expected.term(), actual.term());
+ }
+
+ // random seekCeil(BytesRef)
+ for (long i = 0; i < numOrds; i++) {
+ BytesRef target = new BytesRef(_TestUtil.randomUnicodeString(random()));
+ SeekStatus expectedStatus = expected.seekCeil(target);
+ assertEquals(expectedStatus, actual.seekCeil(target));
+ if (expectedStatus != SeekStatus.END) {
+ assertEquals(expected.ord(), actual.ord());
+ assertEquals(expected.term(), actual.term());
+ }
+ }
}
private void doTestSortedSetVsUninvertedField(int minLength, int maxLength) throws Exception {