You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/08/19 18:28:03 UTC
svn commit: r1515496 - in /lucene/dev/branches/lucene5178/lucene:
codecs/src/java/org/apache/lucene/codecs/diskdv/
codecs/src/java/org/apache/lucene/codecs/simpletext/
core/src/java/org/apache/lucene/codecs/lucene45/
core/src/java/org/apache/lucene/ind...
Author: rmuir
Date: Mon Aug 19 16:28:03 2013
New Revision: 1515496
URL: http://svn.apache.org/r1515496
Log:
support missing for 4.5 and disk dv
Modified:
lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java
lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java
lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java
lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java
lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
Modified: lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java (original)
+++ lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java Mon Aug 19 16:28:03 2013
@@ -17,427 +17,34 @@ package org.apache.lucene.codecs.diskdv;
* limitations under the License.
*/
-import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.DELTA_COMPRESSED;
-import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.GCD_COMPRESSED;
-import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.TABLE_COMPRESSED;
-
import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.DocValuesProducer;
-import org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer;
-import org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat;
-import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.codecs.lucene45.Lucene45DocValuesProducer;
import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.IndexFileNames;
-import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
-import org.apache.lucene.index.SortedDocValues;
-import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.packed.BlockPackedReader;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
-import org.apache.lucene.util.packed.PackedInts;
-
-class DiskDocValuesProducer extends DocValuesProducer {
- private final Map<Integer,NumericEntry> numerics;
- private final Map<Integer,NumericEntry> ords;
- private final Map<Integer,NumericEntry> ordIndexes;
- private final Map<Integer,BinaryEntry> binaries;
- private final IndexInput data;
- private final int maxDoc;
-
- DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
- String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
- this.maxDoc = state.segmentInfo.getDocCount();
- // read in the entries from the metadata file.
- IndexInput in = state.directory.openInput(metaName, state.context);
- boolean success = false;
- final int version;
- try {
- version = CodecUtil.checkHeader(in, metaCodec,
- Lucene45DocValuesFormat.VERSION_CURRENT,
- Lucene45DocValuesFormat.VERSION_CURRENT);
- numerics = new HashMap<Integer,NumericEntry>();
- ords = new HashMap<Integer,NumericEntry>();
- ordIndexes = new HashMap<Integer,NumericEntry>();
- binaries = new HashMap<Integer,BinaryEntry>();
- readFields(in);
-
- success = true;
- } finally {
- if (success) {
- IOUtils.close(in);
- } else {
- IOUtils.closeWhileHandlingException(in);
- }
- }
-
- success = false;
- try {
- String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
- data = state.directory.openInput(dataName, state.context);
- final int version2 = CodecUtil.checkHeader(data, dataCodec,
- Lucene45DocValuesFormat.VERSION_CURRENT,
- Lucene45DocValuesFormat.VERSION_CURRENT);
- if (version != version2) {
- throw new CorruptIndexException("Versions mismatch");
- }
-
- success = true;
- } finally {
- if (!success) {
- IOUtils.closeWhileHandlingException(this.data);
- }
- }
-
- }
-
- private void readFields(IndexInput meta) throws IOException {
- int fieldNumber = meta.readVInt();
- while (fieldNumber != -1) {
- byte type = meta.readByte();
- if (type == Lucene45DocValuesFormat.NUMERIC) {
- numerics.put(fieldNumber, readNumericEntry(meta));
- } else if (type == Lucene45DocValuesFormat.BINARY) {
- BinaryEntry b = readBinaryEntry(meta);
- binaries.put(fieldNumber, b);
- } else if (type == Lucene45DocValuesFormat.SORTED) {
- // sorted = binary + numeric
- if (meta.readVInt() != fieldNumber) {
- throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
- }
- if (meta.readByte() != Lucene45DocValuesFormat.BINARY) {
- throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
- }
- BinaryEntry b = readBinaryEntry(meta);
- binaries.put(fieldNumber, b);
-
- if (meta.readVInt() != fieldNumber) {
- throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
- }
- if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) {
- throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
- }
- NumericEntry n = readNumericEntry(meta);
- ords.put(fieldNumber, n);
- } else if (type == Lucene45DocValuesFormat.SORTED_SET) {
- // sortedset = binary + numeric + ordIndex
- if (meta.readVInt() != fieldNumber) {
- throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
- }
- if (meta.readByte() != Lucene45DocValuesFormat.BINARY) {
- throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
- }
- BinaryEntry b = readBinaryEntry(meta);
- binaries.put(fieldNumber, b);
-
- if (meta.readVInt() != fieldNumber) {
- throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
- }
- if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) {
- throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
- }
- NumericEntry n1 = readNumericEntry(meta);
- ords.put(fieldNumber, n1);
-
- if (meta.readVInt() != fieldNumber) {
- throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
- }
- if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) {
- throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
- }
- NumericEntry n2 = readNumericEntry(meta);
- ordIndexes.put(fieldNumber, n2);
- } else {
- throw new CorruptIndexException("invalid type: " + type + ", resource=" + meta);
- }
- fieldNumber = meta.readVInt();
- }
- }
-
- static NumericEntry readNumericEntry(IndexInput meta) throws IOException {
- NumericEntry entry = new NumericEntry();
- entry.format = meta.readVInt();
- entry.packedIntsVersion = meta.readVInt();
- entry.offset = meta.readLong();
- entry.count = meta.readVLong();
- entry.blockSize = meta.readVInt();
- switch(entry.format) {
- case GCD_COMPRESSED:
- entry.minValue = meta.readLong();
- entry.gcd = meta.readLong();
- break;
- case TABLE_COMPRESSED:
- if (entry.count > Integer.MAX_VALUE) {
- throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta);
- }
- final int uniqueValues = meta.readVInt();
- if (uniqueValues > 256) {
- throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta);
- }
- entry.table = new long[uniqueValues];
- for (int i = 0; i < uniqueValues; ++i) {
- entry.table[i] = meta.readLong();
- }
- break;
- case DELTA_COMPRESSED:
- break;
- default:
- throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
- }
- return entry;
- }
-
- static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
- BinaryEntry entry = new BinaryEntry();
- int format = meta.readVInt();
- if (format != Lucene45DocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) {
- throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta);
- }
- entry.minLength = meta.readVInt();
- entry.maxLength = meta.readVInt();
- entry.count = meta.readVLong();
- entry.offset = meta.readLong();
- if (entry.minLength != entry.maxLength) {
- entry.addressesOffset = meta.readLong();
- entry.packedIntsVersion = meta.readVInt();
- entry.blockSize = meta.readVInt();
- }
- return entry;
- }
- @Override
- public NumericDocValues getNumeric(FieldInfo field) throws IOException {
- NumericEntry entry = numerics.get(field.number);
- return getNumeric(field, entry);
- }
-
- private LongNumericDocValues getNumeric(FieldInfo field, final NumericEntry entry) throws IOException {
- final IndexInput data = this.data.clone();
- data.seek(entry.offset);
+class DiskDocValuesProducer extends Lucene45DocValuesProducer {
- switch (entry.format) {
- case DELTA_COMPRESSED:
- final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
- return new LongNumericDocValues() {
- @Override
- public long get(long id) {
- return reader.get(id);
- }
- };
- case GCD_COMPRESSED:
- final long min = entry.minValue;
- final long mult = entry.gcd;
- final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
- return new LongNumericDocValues() {
- @Override
- public long get(long id) {
- return min + mult * quotientReader.get(id);
- }
- };
- case TABLE_COMPRESSED:
- final long[] table = entry.table;
- final int bitsRequired = PackedInts.bitsRequired(table.length - 1);
- final PackedInts.Reader ords = PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, bitsRequired);
- return new LongNumericDocValues() {
- @Override
- long get(long id) {
- return table[(int) ords.get((int) id)];
- }
- };
- default:
- throw new AssertionError();
- }
+ DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ super(state, dataCodec, dataExtension, metaCodec, metaExtension);
}
@Override
- public BinaryDocValues getBinary(FieldInfo field) throws IOException {
- BinaryEntry bytes = binaries.get(field.number);
- if (bytes.minLength == bytes.maxLength) {
- return getFixedBinary(field, bytes);
- } else {
- return getVariableBinary(field, bytes);
- }
- }
-
- private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) {
- final IndexInput data = this.data.clone();
-
- return new LongBinaryDocValues() {
- @Override
- public void get(long id, BytesRef result) {
- long address = bytes.offset + id * bytes.maxLength;
- try {
- data.seek(address);
- // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource)
- // assume "they" own the bytes after calling this!
- final byte[] buffer = new byte[bytes.maxLength];
- data.readBytes(buffer, 0, buffer.length);
- result.bytes = buffer;
- result.offset = 0;
- result.length = buffer.length;
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- };
- }
-
- private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
- final IndexInput data = this.data.clone();
+ protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
data.seek(bytes.addressesOffset);
-
- final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
- return new LongBinaryDocValues() {
- @Override
- public void get(long id, BytesRef result) {
- long startAddress = bytes.offset + (id == 0 ? 0 : + addresses.get(id-1));
- long endAddress = bytes.offset + addresses.get(id);
- int length = (int) (endAddress - startAddress);
- try {
- data.seek(startAddress);
- // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource)
- // assume "they" own the bytes after calling this!
- final byte[] buffer = new byte[length];
- data.readBytes(buffer, 0, buffer.length);
- result.bytes = buffer;
- result.offset = 0;
- result.length = length;
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- };
+ return new MonotonicBlockPackedReader(data.clone(), bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
}
@Override
- public SortedDocValues getSorted(FieldInfo field) throws IOException {
- final int valueCount = (int) binaries.get(field.number).count;
- final BinaryDocValues binary = getBinary(field);
- final NumericDocValues ordinals = getNumeric(field, ords.get(field.number));
- return new SortedDocValues() {
-
- @Override
- public int getOrd(int docID) {
- return (int) ordinals.get(docID);
- }
-
- @Override
- public void lookupOrd(int ord, BytesRef result) {
- binary.get(ord, result);
- }
-
- @Override
- public int getValueCount() {
- return valueCount;
- }
- };
+ protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
+ throw new AssertionError();
}
@Override
- public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
- final long valueCount = binaries.get(field.number).count;
- final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
- final LongNumericDocValues ordinals = getNumeric(field, ords.get(field.number));
- NumericEntry entry = ordIndexes.get(field.number);
- IndexInput data = this.data.clone();
+ protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException {
data.seek(entry.offset);
- final MonotonicBlockPackedReader ordIndex = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
-
- return new SortedSetDocValues() {
- long offset;
- long endOffset;
-
- @Override
- public long nextOrd() {
- if (offset == endOffset) {
- return NO_MORE_ORDS;
- } else {
- long ord = ordinals.get(offset);
- offset++;
- return ord;
- }
- }
-
- @Override
- public void setDocument(int docID) {
- offset = (docID == 0 ? 0 : ordIndex.get(docID-1));
- endOffset = ordIndex.get(docID);
- }
-
- @Override
- public void lookupOrd(long ord, BytesRef result) {
- binary.get(ord, result);
- }
-
- @Override
- public long getValueCount() {
- return valueCount;
- }
- };
- }
-
- @Override
- public Bits getDocsWithField(FieldInfo field) throws IOException {
- if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) {
- return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
- } else {
- return new Bits.MatchAllBits(maxDoc);
- }
- }
-
- @Override
- public void close() throws IOException {
- data.close();
- }
-
- static class NumericEntry {
- long offset;
-
- int format;
- int packedIntsVersion;
- long count;
- int blockSize;
-
- long minValue;
- long gcd;
- long table[];
- }
-
- static class BinaryEntry {
- long offset;
-
- long count;
- int minLength;
- int maxLength;
- long addressesOffset;
- int packedIntsVersion;
- int blockSize;
- }
-
- // internally we compose complex dv (sorted/sortedset) from other ones
- static abstract class LongNumericDocValues extends NumericDocValues {
- @Override
- public final long get(int docID) {
- return get((long) docID);
- }
-
- abstract long get(long id);
- }
-
- static abstract class LongBinaryDocValues extends BinaryDocValues {
- @Override
- public final void get(int docID, BytesRef result) {
- get((long)docID, result);
- }
-
- abstract void get(long id, BytesRef Result);
+ return new MonotonicBlockPackedReader(data.clone(), entry.packedIntsVersion, entry.blockSize, entry.count, true);
}
}
Modified: lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java (original)
+++ lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java Mon Aug 19 16:28:03 2013
@@ -321,12 +321,8 @@ class SimpleTextDocValuesReader extends
@Override
public void lookupOrd(int ord, BytesRef result) {
try {
- if (ord == -1) {
- result.length = 0;
- return;
- }
- if (ord < -1 || ord >= field.numValues) {
- throw new IndexOutOfBoundsException("ord must be -1 .. " + (field.numValues-1) + "; got " + ord);
+ if (ord < 0 || ord >= field.numValues) {
+ throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
}
in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
SimpleTextUtil.readLine(in, scratch);
Modified: lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java (original)
+++ lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java Mon Aug 19 16:28:03 2013
@@ -23,7 +23,6 @@ import java.util.HashSet;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
-import org.apache.lucene.codecs.MissingOrdRemapper;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
@@ -88,14 +87,20 @@ public class Lucene45DocValuesConsumer e
long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
long gcd = 0;
+ boolean missing = false;
// TODO: more efficient?
HashSet<Long> uniqueValues = null;
if (optimizeStorage) {
uniqueValues = new HashSet<>();
- // nocommit: impl null values (ideally smartly)
for (Number nv : values) {
- final long v = nv == null ? 0 : nv.longValue();
+ final long v;
+ if (nv == null) {
+ v = 0;
+ missing = true;
+ } else {
+ v = nv.longValue();
+ }
if (gcd != 1) {
if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
@@ -142,6 +147,12 @@ public class Lucene45DocValuesConsumer e
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.NUMERIC);
meta.writeVInt(format);
+ if (missing) {
+ meta.writeLong(data.getFilePointer());
+ writeMissingBitset(values);
+ } else {
+ meta.writeLong(-1L);
+ }
meta.writeVInt(PackedInts.VERSION_CURRENT);
meta.writeLong(data.getFilePointer());
meta.writeVLong(count);
@@ -184,6 +195,27 @@ public class Lucene45DocValuesConsumer e
throw new AssertionError();
}
}
+
+ // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
+ // but this is very simple, and algorithms only check this for values of 0 anyway (doesn't slow down normal decode)
+ void writeMissingBitset(Iterable<?> values) throws IOException {
+ byte bits = 0;
+ int count = 0;
+ for (Object v : values) {
+ if (count == 8) {
+ data.writeByte(bits);
+ count = 0;
+ bits = 0;
+ }
+ if (v != null) {
+ bits |= 1 << (count & 7);
+ }
+ count++;
+ }
+ if (count > 0) {
+ data.writeByte(bits);
+ }
+ }
@Override
public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
@@ -194,8 +226,15 @@ public class Lucene45DocValuesConsumer e
int maxLength = Integer.MIN_VALUE;
final long startFP = data.getFilePointer();
long count = 0;
+ boolean missing = false;
for(BytesRef v : values) {
- final int length = v == null ? 0 : v.length;
+ final int length;
+ if (v == null) {
+ length = 0;
+ missing = true;
+ } else {
+ length = v.length;
+ }
minLength = Math.min(minLength, length);
maxLength = Math.max(maxLength, length);
if (v != null) {
@@ -204,6 +243,12 @@ public class Lucene45DocValuesConsumer e
count++;
}
meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
+ if (missing) {
+ meta.writeLong(data.getFilePointer());
+ writeMissingBitset(values);
+ } else {
+ meta.writeLong(-1L);
+ }
meta.writeVInt(minLength);
meta.writeVInt(maxLength);
meta.writeVLong(count);
@@ -244,6 +289,7 @@ public class Lucene45DocValuesConsumer e
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.BINARY);
meta.writeVInt(BINARY_PREFIX_COMPRESSED);
+ meta.writeLong(-1L);
// now write the bytes: sharing prefixes within a block
final long startFP = data.getFilePointer();
// currently, we have to store the delta from expected for every 1/nth term
@@ -286,34 +332,6 @@ public class Lucene45DocValuesConsumer e
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
- // nocommit: remove this hack and support missing!
-
- // three cases for simulating the old writer:
- // 1. no missing
- // 2. missing (and empty string in use): remap ord=-1 -> ord=0
- // 3. missing (and empty string not in use): remap all ords +1, insert empty string into values
- boolean anyMissing = false;
- for (Number n : docToOrd) {
- if (n.longValue() == -1) {
- anyMissing = true;
- break;
- }
- }
-
- boolean hasEmptyString = false;
- for (BytesRef b : values) {
- hasEmptyString = b.length == 0;
- break;
- }
-
- if (!anyMissing) {
- // nothing to do
- } else if (hasEmptyString) {
- docToOrd = MissingOrdRemapper.mapMissingToOrd0(docToOrd);
- } else {
- docToOrd = MissingOrdRemapper.mapAllOrds(docToOrd);
- values = MissingOrdRemapper.insertEmptyValue(values);
- }
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.SORTED);
addTermsDict(field, values);
@@ -334,6 +352,7 @@ public class Lucene45DocValuesConsumer e
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.NUMERIC);
meta.writeVInt(DELTA_COMPRESSED);
+ meta.writeLong(-1L);
meta.writeVInt(PackedInts.VERSION_CURRENT);
meta.writeLong(data.getFilePointer());
meta.writeVLong(maxDoc);
Modified: lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java (original)
+++ lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java Mon Aug 19 16:28:03 2013
@@ -53,7 +53,7 @@ import org.apache.lucene.util.packed.Blo
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
import org.apache.lucene.util.packed.PackedInts;
-class Lucene45DocValuesProducer extends DocValuesProducer {
+public class Lucene45DocValuesProducer extends DocValuesProducer {
private final Map<Integer,NumericEntry> numerics;
private final Map<Integer,BinaryEntry> binaries;
private final Map<Integer,NumericEntry> ords;
@@ -65,7 +65,7 @@ class Lucene45DocValuesProducer extends
private final Map<Integer,MonotonicBlockPackedReader> addressInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
private final Map<Integer,MonotonicBlockPackedReader> ordIndexInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
- Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ protected Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
// read in the entries from the metadata file.
IndexInput in = state.directory.openInput(metaName, state.context);
@@ -176,6 +176,7 @@ class Lucene45DocValuesProducer extends
static NumericEntry readNumericEntry(IndexInput meta) throws IOException {
NumericEntry entry = new NumericEntry();
entry.format = meta.readVInt();
+ entry.missingOffset = meta.readLong();
entry.packedIntsVersion = meta.readVInt();
entry.offset = meta.readLong();
entry.count = meta.readVLong();
@@ -209,6 +210,7 @@ class Lucene45DocValuesProducer extends
static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
BinaryEntry entry = new BinaryEntry();
entry.format = meta.readVInt();
+ entry.missingOffset = meta.readLong();
entry.minLength = meta.readVInt();
entry.maxLength = meta.readVInt();
entry.count = meta.readVLong();
@@ -315,9 +317,7 @@ class Lucene45DocValuesProducer extends
};
}
- private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
- final IndexInput data = this.data.clone();
-
+ protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
final MonotonicBlockPackedReader addresses;
synchronized (addressInstances) {
MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
@@ -328,6 +328,13 @@ class Lucene45DocValuesProducer extends
}
addresses = addrInstance;
}
+ return addresses;
+ }
+
+ private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
+ final IndexInput data = this.data.clone();
+
+ final MonotonicBlockPackedReader addresses = getAddressInstance(data, field, bytes);
return new LongBinaryDocValues() {
@Override
@@ -350,12 +357,10 @@ class Lucene45DocValuesProducer extends
}
};
}
-
- private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
- final IndexInput data = this.data.clone();
- final long interval = bytes.addressInterval;
-
+
+ protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
final MonotonicBlockPackedReader addresses;
+ final long interval = bytes.addressInterval;
synchronized (addressInstances) {
MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
if (addrInstance == null) {
@@ -371,6 +376,14 @@ class Lucene45DocValuesProducer extends
}
addresses = addrInstance;
}
+ return addresses;
+ }
+
+
+ private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
+ final IndexInput data = this.data.clone();
+
+ final MonotonicBlockPackedReader addresses = getIntervalInstance(data, field, bytes);
return new CompressedBinaryDocValues(bytes, addresses, data);
}
@@ -420,26 +433,30 @@ class Lucene45DocValuesProducer extends
}
};
}
-
- @Override
- public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
- final long valueCount = binaries.get(field.number).count;
- // we keep the byte[]s and list of ords on disk, these could be large
- final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
- final LongNumericDocValues ordinals = getNumeric(ords.get(field.number));
- // but the addresses to the ord stream are in RAM
+
+ protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException {
final MonotonicBlockPackedReader ordIndex;
synchronized (ordIndexInstances) {
MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number);
if (ordIndexInstance == null) {
- NumericEntry entry = ordIndexes.get(field.number);
- IndexInput data = this.data.clone();
data.seek(entry.offset);
ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
ordIndexInstances.put(field.number, ordIndexInstance);
}
ordIndex = ordIndexInstance;
}
+ return ordIndex;
+ }
+
+ @Override
+ public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
+ final IndexInput data = this.data.clone();
+ final long valueCount = binaries.get(field.number).count;
+ // we keep the byte[]s and list of ords on disk, these could be large
+ final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
+ final LongNumericDocValues ordinals = getNumeric(ords.get(field.number));
+ // but the addresses to the ord stream are in RAM
+ final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(data, field, ordIndexes.get(field.number));
return new SortedSetDocValues() {
long offset;
@@ -491,15 +508,47 @@ class Lucene45DocValuesProducer extends
}
};
}
+
+ public Bits getMissingBits(final long offset) throws IOException {
+ if (offset == -1) {
+ return new Bits.MatchAllBits(maxDoc);
+ } else {
+ final IndexInput in = data.clone();
+ return new Bits() {
+
+ @Override
+ public boolean get(int index) {
+ try {
+ in.seek(offset + (index >> 3));
+ return (in.readByte() & (1 << (index & 7))) != 0;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public int length() {
+ return maxDoc;
+ }
+ };
+ }
+ }
@Override
public Bits getDocsWithField(FieldInfo field) throws IOException {
- // nocommit: only use this if the field's entry has missing values (write that),
- // otherwise return MatchAllBits
- if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) {
- return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
- } else {
- return new Bits.MatchAllBits(maxDoc);
+ switch(field.getDocValuesType()) {
+ case SORTED_SET:
+ return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
+ case SORTED:
+ return new SortedDocsWithField(getSorted(field), maxDoc);
+ case BINARY:
+ BinaryEntry be = binaries.get(field.number);
+ return getMissingBits(be.missingOffset);
+ case NUMERIC:
+ NumericEntry ne = numerics.get(field.number);
+ return getMissingBits(ne.missingOffset);
+ default:
+ throw new AssertionError();
}
}
@@ -508,30 +557,32 @@ class Lucene45DocValuesProducer extends
data.close();
}
- static class NumericEntry {
- long offset;
-
- int format;
- int packedIntsVersion;
- long count;
- int blockSize;
+ protected static class NumericEntry {
+ long missingOffset;
+ public long offset;
+
+ public int format;
+ public int packedIntsVersion;
+ public long count;
+ public int blockSize;
long minValue;
long gcd;
long table[];
}
- static class BinaryEntry {
+ protected static class BinaryEntry {
+ long missingOffset;
long offset;
int format;
- long count;
+ public long count;
int minLength;
int maxLength;
- long addressesOffset;
- long addressInterval;
- int packedIntsVersion;
- int blockSize;
+ public long addressesOffset;
+ public long addressInterval;
+ public int packedIntsVersion;
+ public int blockSize;
}
// internally we compose complex dv (sorted/sortedset) from other ones
Modified: lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java (original)
+++ lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java Mon Aug 19 16:28:03 2013
@@ -43,7 +43,7 @@ public abstract class SortedDocValues ex
public abstract int getOrd(int docID);
/** Retrieves the value for the specified ordinal.
- * @param ord ordinal to lookup
+ * @param ord ordinal to lookup (must be >= 0 and < {@link #getValueCount()})
* @param result will be populated with the ordinal's value
* @see #getOrd(int)
*/
Modified: lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java (original)
+++ lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java Mon Aug 19 16:28:03 2013
@@ -480,7 +480,7 @@ public class TestFieldCache extends Luce
} catch (IllegalStateException expected) {}
Bits bits = FieldCache.DEFAULT.getDocsWithField(ar, "binary");
- assertTrue(bits instanceof Bits.MatchAllBits);
+ assertTrue(bits.get(0));
// Sorted type: can be retrieved via getTerms(), getTermsIndex(), getDocTermOrds()
try {
@@ -510,7 +510,7 @@ public class TestFieldCache extends Luce
assertEquals(1, sortedSet.getValueCount());
bits = FieldCache.DEFAULT.getDocsWithField(ar, "sorted");
- assertTrue(bits instanceof Bits.MatchAllBits);
+ assertTrue(bits.get(0));
// Numeric type: can be retrieved via getInts() and so on
Ints numeric = FieldCache.DEFAULT.getInts(ar, "numeric", false);
@@ -537,7 +537,7 @@ public class TestFieldCache extends Luce
} catch (IllegalStateException expected) {}
bits = FieldCache.DEFAULT.getDocsWithField(ar, "numeric");
- assertTrue(bits instanceof Bits.MatchAllBits);
+ assertTrue(bits.get(0));
// SortedSet type: can be retrieved via getDocTermOrds()
if (defaultCodecSupportsSortedSet()) {
Modified: lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java (original)
+++ lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java Mon Aug 19 16:28:03 2013
@@ -438,14 +438,14 @@ public class AssertingAtomicReader exten
this.in = in;
this.maxDoc = maxDoc;
this.valueCount = in.getValueCount();
- assert valueCount >= 1 && valueCount <= maxDoc;
+ assert valueCount >= 0 && valueCount <= maxDoc;
}
@Override
public int getOrd(int docID) {
assert docID >= 0 && docID < maxDoc;
int ord = in.getOrd(docID);
- assert ord >= 0 && ord < valueCount;
+ assert ord >= -1 && ord < valueCount;
return ord;
}
Modified: lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java (original)
+++ lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java Mon Aug 19 16:28:03 2013
@@ -696,7 +696,10 @@ public abstract class BaseDocValuesForma
BytesRef scratch = new BytesRef();
dv.lookupOrd(dv.getOrd(0), scratch);
assertEquals(new BytesRef("hello world 2"), scratch);
- dv.lookupOrd(dv.getOrd(1), scratch);
+ if (codecSupportsDocsWithField("dv")) {
+ assertEquals(-1, dv.getOrd(1));
+ }
+ dv.get(1, scratch);
assertEquals(new BytesRef(""), scratch);
ireader.close();
directory.close();
Modified: lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java (original)
+++ lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java Mon Aug 19 16:28:03 2013
@@ -1372,8 +1372,11 @@ public abstract class LuceneTestCase ext
/** Returns true if the codec for the field "supports" docsWithField
* (other codecs return MatchAllBits, because you couldnt write missing values before) */
public static boolean codecSupportsDocsWithField(String field) {
- // currently only one codec!
- return _TestUtil.getDocValuesFormat(Codec.getDefault(), field).equals("SimpleText");
+ String name = _TestUtil.getDocValuesFormat(Codec.getDefault(), field);
+ if (name.equals("Lucene40") || name.equals("Lucene42")) {
+ return false;
+ }
+ return true;
}
public void assertReaderEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException {