You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rj...@apache.org on 2014/10/31 21:25:22 UTC
svn commit: r1635856 - in /lucene/dev/branches/branch_5x: ./ lucene/
lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/lucene50/
lucene/core/src/test/org/apache/lucene/codecs/lucene50/
lucene/test-framework/ lucene/test-framework/src/java/org/...
Author: rjernst
Date: Fri Oct 31 20:25:22 2014
New Revision: 1635856
URL: http://svn.apache.org/r1635856
Log:
LUCENE-6030: Add norms patched compression for a small number of common values (merged 1635854)
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/lucene/ (props changed)
lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/lucene/core/ (props changed)
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java
lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java
lucene/dev/branches/branch_5x/lucene/test-framework/ (props changed)
lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Fri Oct 31 20:25:22 2014
@@ -236,6 +236,9 @@ Optimizations
* LUCENE-6022: DocValuesDocIdSet checks live docs before doc values.
(Adrien Grand)
+* LUCENE-6030: Add norms patched compression for a small number of common values
+ (Ryan Ernst)
+
Build
* LUCENE-5909: Smoke tester now has better command line parsing and
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsConsumer.java Fri Oct 31 20:25:22 2014
@@ -19,9 +19,7 @@ package org.apache.lucene.codecs.lucene5
import java.io.IOException;
import java.util.Arrays;
-import java.util.HashMap;
import java.util.Iterator;
-import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.NormsConsumer;
@@ -31,6 +29,7 @@ import org.apache.lucene.index.SegmentWr
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.FilterIterator;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
@@ -47,7 +46,8 @@ class Lucene50NormsConsumer extends Norm
static final byte CONST_COMPRESSED = 2;
static final byte UNCOMPRESSED = 3;
static final byte INDIRECT = 4;
- static final byte PATCHED = 5;
+ static final byte PATCHED_BITSET = 5;
+ static final byte PATCHED_TABLE = 6;
static final int BLOCK_SIZE = 1 << 14;
// threshold for indirect encoding, computed as 1 - 1/log2(maxint)
@@ -89,11 +89,7 @@ class Lucene50NormsConsumer extends Norm
private void writeNormsField(FieldInfo field, Iterable<Number> values, int level) throws IOException {
assert level <= 1; // we only "recurse" once in the indirect case
meta.writeVInt(field.number);
- long minValue = Long.MAX_VALUE;
- long maxValue = Long.MIN_VALUE;
- // TODO: more efficient?
NormMap uniqueValues = new NormMap();
-
int count = 0;
for (Number nv : values) {
@@ -101,45 +97,76 @@ class Lucene50NormsConsumer extends Norm
throw new IllegalStateException("illegal norms data for field " + field.name + ", got null for value: " + count);
}
final long v = nv.longValue();
-
- minValue = Math.min(minValue, v);
- maxValue = Math.max(maxValue, v);
-
+
if (uniqueValues != null) {
- if (uniqueValues.add(v)) {
- if (uniqueValues.size > 256) {
- uniqueValues = null;
+ if (v >= Byte.MIN_VALUE && v <= Byte.MAX_VALUE) {
+ if (uniqueValues.add((byte) v)) {
+ if (uniqueValues.size > 256) {
+ uniqueValues = null;
+ }
}
+ } else {
+ // anything outside an 8 bit float comes from a custom scorer, which is an extreme edge case
+ uniqueValues = null;
}
}
count++;
}
- if (uniqueValues != null && uniqueValues.size == 1) {
+
+ if (uniqueValues == null) {
+ addDeltaCompressed(values, count);
+ } else if (uniqueValues.size == 1) {
// 0 bpv
- addConstant(minValue);
- } else if (level == 0 && count > 256 && uniqueValues != null && uniqueValues.maxFreq() > count * INDIRECT_THRESHOLD) {
- long commonValue = uniqueValues.getDecodeTable()[uniqueValues.maxOrd()];
- if (commonValue == 0) {
- // if the common value is missing, don't waste RAM on a bitset, since we won't be searching those docs
- addIndirect(field, values, count, uniqueValues);
- } else {
- // otherwise, write a sparse bitset, where 1 indicates 'uncommon value'.
- addPatched(field, values, count, uniqueValues);
- }
- } else if (uniqueValues != null) {
- // small number of unique values: this is the typical case:
- FormatAndBits compression = fastestFormatAndBits(uniqueValues.size-1);
+ addConstant(uniqueValues.values[0]);
+ } else {
+ // small number of unique values: this is the typical case
+ uniqueValues.optimizeOrdinals();
- if (compression.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
- addUncompressed(values, count);
+ int numCommonValues = -1;
+ int commonValuesCount = 0;
+ if (level == 0 && count > 256) {
+ float threshold_count = count * INDIRECT_THRESHOLD;
+ if (uniqueValues.freqs[0] > threshold_count) {
+ numCommonValues = 1;
+ } else if ((commonValuesCount = sum(uniqueValues.freqs, 0, 3)) > threshold_count && uniqueValues.size > 4) {
+ numCommonValues = 3;
+ } else if ((commonValuesCount = sum(uniqueValues.freqs, 0, 15)) > threshold_count && uniqueValues.size > 16) {
+ numCommonValues = 15;
+ }
+ }
+
+ if (numCommonValues == -1) {
+ // no pattern in values, just find the most efficient way to pack the values
+ FormatAndBits compression = fastestFormatAndBits(uniqueValues.size - 1);
+ if (compression.bitsPerValue == 8) {
+ addUncompressed(values, count);
+ } else {
+ addTableCompressed(values, compression, count, uniqueValues);
+ }
+
+ } else if (numCommonValues == 1) {
+ byte commonValue = uniqueValues.values[0];
+ if (commonValue == 0) {
+ // if the common value is missing, don't waste RAM on a bitset, since we won't be searching those docs
+ addIndirect(field, values, count, uniqueValues, 0);
+ } else {
+ // otherwise, write a sparse bitset, where 1 indicates 'uncommon value'.
+ addPatchedBitset(field, values, count, uniqueValues);
+ }
} else {
- addTableCompressed(values, compression, count, uniqueValues);
+ addPatchedTable(field, values, numCommonValues, commonValuesCount, count, uniqueValues);
}
- } else {
- addDeltaCompressed(values, count);
}
}
+ private int sum(int[] freqs, int start, int end) {
+ int accum = 0;
+ for (int i = start; i < end; ++i) {
+ accum += freqs[i];
+ }
+ return accum;
+ }
+
private FormatAndBits fastestFormatAndBits(int max) {
// we only use bpv=1,2,4,8
PackedInts.Format format = PackedInts.Format.PACKED_SINGLE_BLOCK;
@@ -152,18 +179,18 @@ class Lucene50NormsConsumer extends Norm
return new FormatAndBits(format, bitsPerValue);
}
- private void addConstant(long constant) throws IOException {
+ private void addConstant(byte constant) throws IOException {
meta.writeVInt(0);
meta.writeByte(CONST_COMPRESSED);
meta.writeLong(constant);
}
-
+
private void addUncompressed(Iterable<Number> values, int count) throws IOException {
meta.writeVInt(count);
meta.writeByte(UNCOMPRESSED); // uncompressed byte[]
meta.writeLong(data.getFilePointer());
for (Number nv : values) {
- data.writeByte((byte) nv.longValue());
+ data.writeByte(nv.byteValue());
}
}
@@ -171,25 +198,28 @@ class Lucene50NormsConsumer extends Norm
meta.writeVInt(count);
meta.writeByte(TABLE_COMPRESSED); // table-compressed
meta.writeLong(data.getFilePointer());
- data.writeVInt(PackedInts.VERSION_CURRENT);
-
- long[] decode = uniqueValues.getDecodeTable();
- // upgrade to power of two sized array
- int size = 1 << compression.bitsPerValue;
- data.writeVInt(size);
- for (int i = 0; i < decode.length; i++) {
- data.writeLong(decode[i]);
- }
- for (int i = decode.length; i < size; i++) {
- data.writeLong(0);
- }
+ writeTable(values, compression, count, uniqueValues, uniqueValues.size);
+ }
+
+ private void writeTable(Iterable<Number> values, FormatAndBits compression, int count, NormMap uniqueValues, int numOrds) throws IOException {
+ data.writeVInt(PackedInts.VERSION_CURRENT);
data.writeVInt(compression.format.getId());
data.writeVInt(compression.bitsPerValue);
+
+ data.writeVInt(numOrds);
+ for (int i = 0; i < numOrds; i++) {
+ data.writeByte(uniqueValues.values[i]);
+ }
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, compression.format, count, compression.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
for(Number nv : values) {
- writer.add(uniqueValues.getOrd(nv.longValue()));
+ int ord = uniqueValues.ord(nv.byteValue());
+ if (ord < numOrds) {
+ writer.add(ord);
+ } else {
+ writer.add(numOrds); // collapses all ords >= numOrds into a single value
+ }
}
writer.finish();
}
@@ -211,17 +241,15 @@ class Lucene50NormsConsumer extends Norm
// encodes only uncommon values in a sparse bitset
// access is constant time, and the common case is predictable
// exceptions nest either to CONST (if there are only 2 values), or INDIRECT (if there are > 2 values)
- private void addPatched(FieldInfo field, final Iterable<Number> values, int count, NormMap uniqueValues) throws IOException {
- final long decodeTable[] = uniqueValues.getDecodeTable();
- int commonCount = uniqueValues.maxFreq();
- final long commonValue = decodeTable[uniqueValues.maxOrd()];
+ private void addPatchedBitset(FieldInfo field, final Iterable<Number> values, int count, NormMap uniqueValues) throws IOException {
+ int commonCount = uniqueValues.freqs[0];
meta.writeVInt(count - commonCount);
- meta.writeByte(PATCHED);
+ meta.writeByte(PATCHED_BITSET);
meta.writeLong(data.getFilePointer());
// write docs with value
- writeDocsWithValue(values, commonValue);
+ writeDocsWithValue(values, uniqueValues, 0);
// write exceptions: only two cases make sense
// bpv = 1 (folded into sparse bitset already)
@@ -229,44 +257,58 @@ class Lucene50NormsConsumer extends Norm
meta.writeVInt(field.number);
if (uniqueValues.size == 2) {
// special case: implicit in bitset
- int otherOrd = uniqueValues.maxOrd() == 0 ? 1 : 0;
- addConstant(decodeTable[otherOrd]);
+ addConstant(uniqueValues.values[1]);
} else {
// exception table
- addIndirect(field, values, count, uniqueValues);
+ addIndirect(field, values, count, uniqueValues, 0);
}
}
+
+ // encodes common values in a table, and the rest of the values as exceptions using INDIRECT.
+ // the exceptions should not be accessed very often, since the values are uncommon
+ private void addPatchedTable(FieldInfo field, final Iterable<Number> values, final int numCommonValues, int commonValuesCount, int count, final NormMap uniqueValues) throws IOException {
+ meta.writeVInt(count);
+ meta.writeByte(PATCHED_TABLE);
+ meta.writeLong(data.getFilePointer());
+
+ assert numCommonValues == 3 || numCommonValues == 15;
+ FormatAndBits compression = fastestFormatAndBits(numCommonValues);
+
+ writeTable(values, compression, count, uniqueValues, numCommonValues);
+
+ meta.writeVInt(field.number);
+ addIndirect(field, values, count - commonValuesCount, uniqueValues, numCommonValues);
+ }
// encodes values as sparse array: keys[] and values[]
// access is log(N) where N = keys.length (slow!)
// so this is only appropriate as an exception table for patched, or when common value is 0 (wont be accessed by searching)
- private void addIndirect(FieldInfo field, final Iterable<Number> values, int count, NormMap uniqueValues) throws IOException {
- int commonCount = uniqueValues.maxFreq();
- final long commonValue = uniqueValues.getDecodeTable()[uniqueValues.maxOrd()];
+ private void addIndirect(FieldInfo field, final Iterable<Number> values, int count, final NormMap uniqueValues, final int minOrd) throws IOException {
+ int commonCount = uniqueValues.freqs[minOrd];
meta.writeVInt(count - commonCount);
meta.writeByte(INDIRECT);
meta.writeLong(data.getFilePointer());
// write docs with value
- writeDocsWithValue(values, commonValue);
+ writeDocsWithValue(values, uniqueValues, minOrd);
// write actual values
writeNormsField(field, new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
- return new FilterIterator<Number,Number>(values.iterator()) {
+ return new FilterIterator<Number, Number>(values.iterator()) {
@Override
protected boolean predicateFunction(Number value) {
- return value.longValue() != commonValue;
+ return uniqueValues.ord(value.byteValue()) > minOrd;
}
};
}
}, 1);
}
- private void writeDocsWithValue(final Iterable<Number> values, long commonValue) throws IOException {
- data.writeLong(commonValue);
+ private void writeDocsWithValue(final Iterable<Number> values, NormMap uniqueValues, int minOrd) throws IOException {
+ data.writeLong(uniqueValues.values[minOrd]);
data.writeVInt(PackedInts.VERSION_CURRENT);
data.writeVInt(BLOCK_SIZE);
@@ -274,8 +316,8 @@ class Lucene50NormsConsumer extends Norm
final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
int doc = 0;
for (Number n : values) {
- long v = n.longValue();
- if (v != commonValue) {
+ int ord = uniqueValues.ord(n.byteValue());
+ if (ord > minOrd) {
writer.add(doc);
}
doc++;
@@ -304,103 +346,63 @@ class Lucene50NormsConsumer extends Norm
meta = data = null;
}
}
-
+
// specialized deduplication of long->ord for norms: 99.99999% of the time this will be a single-byte range.
static class NormMap {
// we use short: at most we will add 257 values to this map before its rejected as too big above.
- final short[] singleByteRange = new short[256];
+ private final short[] ords = new short[256];
final int[] freqs = new int[257];
- final Map<Long,Short> other = new HashMap<Long,Short>();
+ final byte[] values = new byte[257];
int size;
-
+
{
- Arrays.fill(singleByteRange, (short)-1);
+ Arrays.fill(ords, (short)-1);
}
- /** adds an item to the mapping. returns true if actually added */
- public boolean add(long l) {
+ // adds an item to the mapping. returns true if actually added
+ public boolean add(byte l) {
assert size <= 256; // once we add > 256 values, we nullify the map in addNumericField and don't use this strategy
- if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
- int index = (int) (l + 128);
- short previous = singleByteRange[index];
- if (previous < 0) {
- short slot = (short) size;
- singleByteRange[index] = slot;
- freqs[slot]++;
- size++;
- return true;
- } else {
- freqs[previous]++;
- return false;
- }
+ int index = (int)l + 128;
+ short previous = ords[index];
+ if (previous < 0) {
+ short slot = (short)size;
+ ords[index] = slot;
+ freqs[slot]++;
+ values[slot] = l;
+ size++;
+ return true;
} else {
- Short previous = other.get(l);
- if (previous == null) {
- freqs[size]++;
- other.put(l, (short)size);
- size++;
- return true;
- } else {
- freqs[previous]++;
- return false;
- }
+ freqs[previous]++;
+ return false;
}
}
-
- /** gets the ordinal for a previously added item */
- public int getOrd(long l) {
- if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
- int index = (int) (l + 128);
- return singleByteRange[index];
- } else {
- // NPE if something is screwed up
- return other.get(l);
- }
+
+ public int ord(byte value) {
+ return ords[(int)value + 128];
}
-
- /** retrieves the ordinal table for previously added items */
- public long[] getDecodeTable() {
- long decode[] = new long[size];
- for (int i = 0; i < singleByteRange.length; i++) {
- short s = singleByteRange[i];
- if (s >= 0) {
- decode[s] = i - 128;
+
+ // reassign ordinals so higher frequencies have lower ordinals
+ public void optimizeOrdinals() {
+ new InPlaceMergeSorter() {
+ @Override
+ protected int compare(int i, int j) {
+ return freqs[j] - freqs[i]; // sort descending
}
- }
- for (Map.Entry<Long,Short> entry : other.entrySet()) {
- decode[entry.getValue()] = entry.getKey();
- }
- return decode;
- }
-
- // TODO: if we need more complicated frequency-driven optos, maybe add 'finish' to this api
- // and sort all ords by frequency. we could then lower BPV and waste a value to represent 'patched',
-
- /** retrieves frequency table for items (indexed by ordinal) */
- public int[] getFreqs() {
- return freqs;
- }
-
- /** sugar: returns max value over getFreqs() */
- public int maxFreq() {
- int max = 0;
- for (int i = 0; i < size; i++) {
- max = Math.max(max, freqs[i]);
- }
- return max;
- }
-
- /** sugar: returns ordinal with maxFreq() */
- public int maxOrd() {
- long max = 0;
- int maxOrd = 0;
- for (int i = 0; i < size; i++) {
- if (freqs[i] > max) {
- max = freqs[i];
- maxOrd = i;
+ @Override
+ protected void swap(int i, int j) {
+ // swap ordinal i with ordinal j
+ ords[(int)values[i] + 128] = (short)j;
+ ords[(int)values[j] + 128] = (short)i;
+
+ int tmpFreq = freqs[i];
+ byte tmpValue = values[i];
+ freqs[i] = freqs[j];
+ values[i] = values[j];
+ freqs[j] = tmpFreq;
+ values[j] = tmpValue;
}
- }
- return maxOrd;
+ }.sort(0, size);
}
}
+
}
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsFormat.java Fri Oct 31 20:25:22 2014
@@ -51,9 +51,12 @@ import org.apache.lucene.util.packed.Pac
* <li>Indirect: when norms are extremely sparse, missing values are omitted.
* Access to an individual value is slower, but missing norm values are never accessed
* by search code.
- * <li>Patched: when a single norm value dominates, a sparse bitset encodes docs with exceptions,
- * so that access to the common value is still very fast. outliers fall thru to an exception
- * handling mechanism (Indirect or Constant).
+ * <li>Patched bitset: when a single norm value dominates, a sparse bitset encodes docs
+ * with exceptions, so that access to the common value is still very fast. outliers
+ * fall through to an exception handling mechanism (Indirect or Constant).
+ * <li>Patched table: when a small number of norm values dominate, a table is used for the
+ * common values to allow fast access. less common values fall through to an exception
+ * handling mechanism (Indirect).
* </ul>
* <p>
* Files:
@@ -87,7 +90,9 @@ import org.apache.lucene.util.packed.Pac
* <li>3 --> uncompressed: Values written as a simple byte[].
* <li>4 --> indirect. Only documents with a value are written with monotonic compression. a nested
* entry for the same field will follow for the exception handler.
- * <li>5 --> patched. Encoded the same as indirect.
+ * <li>5 --> patched bitset. Encoded the same as indirect.
+ * <li>6 --> patched table. Documents with very common values are written with a lookup table.
+ * Other values are written using a nested indirect.
* </ul>
* <li><a name="nvd" id="nvd"></a>
* <p>The Norms data or .nvd file.</p>
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50NormsProducer.java Fri Oct 31 20:25:22 2014
@@ -49,7 +49,8 @@ import static org.apache.lucene.codecs.l
import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.TABLE_COMPRESSED;
import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.UNCOMPRESSED;
import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.INDIRECT;
-import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED_BITSET;
+import static org.apache.lucene.codecs.lucene50.Lucene50NormsConsumer.PATCHED_TABLE;
/**
* Reader for {@link Lucene50NormsFormat}
@@ -151,7 +152,8 @@ class Lucene50NormsProducer extends Norm
case TABLE_COMPRESSED:
case DELTA_COMPRESSED:
break;
- case PATCHED:
+ case PATCHED_BITSET:
+ case PATCHED_TABLE:
case INDIRECT:
if (meta.readVInt() != info.number) {
throw new CorruptIndexException("indirect norms entry for field: " + info.name + " is corrupt", meta);
@@ -237,16 +239,22 @@ class Lucene50NormsProducer extends Norm
case TABLE_COMPRESSED: {
data.seek(entry.offset);
int packedIntsVersion = data.readVInt();
- int size = data.readVInt();
- if (size > 256) {
- throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, got=" + size, data);
- }
- final long decode[] = new long[size];
- for (int i = 0; i < decode.length; i++) {
- decode[i] = data.readLong();
- }
final int formatID = data.readVInt();
final int bitsPerValue = data.readVInt();
+
+ if (bitsPerValue != 1 && bitsPerValue != 2 && bitsPerValue != 4) {
+ throw new CorruptIndexException("TABLE_COMPRESSED only supports bpv=1, bpv=2 and bpv=4, got=" + bitsPerValue, data);
+ }
+ int size = 1 << bitsPerValue;
+ final byte decode[] = new byte[size];
+ final int ordsSize = data.readVInt();
+ for (int i = 0; i < ordsSize; ++i) {
+ decode[i] = data.readByte();
+ }
+ for (int i = ordsSize; i < size; ++i) {
+ decode[i] = 0;
+ }
+
final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), packedIntsVersion, entry.count, bitsPerValue);
instance.info = Accountables.namedAccountable("table compressed", ordsReader);
instance.ramBytesUsed = RamUsageEstimator.sizeOf(decode) + ordsReader.ramBytesUsed();
@@ -291,7 +299,7 @@ class Lucene50NormsProducer extends Norm
};
break;
}
- case PATCHED: {
+ case PATCHED_BITSET: {
data.seek(entry.offset);
final long common = data.readLong();
int packedIntsVersion = data.readVInt();
@@ -304,7 +312,7 @@ class Lucene50NormsProducer extends Norm
}
LoadedNorms nestedInstance = loadNorms(entry.nested);
instance.ramBytesUsed = set.ramBytesUsed() + nestedInstance.ramBytesUsed;
- instance.info = Accountables.namedAccountable("patched -> " + nestedInstance.info, instance.ramBytesUsed);
+ instance.info = Accountables.namedAccountable("patched bitset -> " + nestedInstance.info, instance.ramBytesUsed);
final NumericDocValues values = nestedInstance.norms;
instance.norms = new NumericDocValues() {
@Override
@@ -318,6 +326,42 @@ class Lucene50NormsProducer extends Norm
};
break;
}
+ case PATCHED_TABLE: {
+ data.seek(entry.offset);
+ int packedIntsVersion = data.readVInt();
+ final int formatID = data.readVInt();
+ final int bitsPerValue = data.readVInt();
+
+ if (bitsPerValue != 2 && bitsPerValue != 4) {
+ throw new CorruptIndexException("PATCHED_TABLE only supports bpv=2 and bpv=4, got=" + bitsPerValue, data);
+ }
+ final int size = 1 << bitsPerValue;
+ final int ordsSize = data.readVInt();
+ final byte decode[] = new byte[ordsSize];
+ assert ordsSize + 1 == size;
+ for (int i = 0; i < ordsSize; ++i) {
+ decode[i] = data.readByte();
+ }
+
+ final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), packedIntsVersion, entry.count, bitsPerValue);
+ final LoadedNorms nestedInstance = loadNorms(entry.nested);
+ instance.ramBytesUsed = RamUsageEstimator.sizeOf(decode) + ordsReader.ramBytesUsed() + nestedInstance.ramBytesUsed;
+ instance.info = Accountables.namedAccountable("patched table -> " + nestedInstance.info, instance.ramBytesUsed);
+ final NumericDocValues values = nestedInstance.norms;
+ instance.norms = new NumericDocValues() {
+ @Override
+ public long get(int docID) {
+ int ord = (int)ordsReader.get(docID);
+ try {
+ // doing a try/catch here eliminates a seemingly unavoidable branch in hotspot...
+ return decode[ord];
+ } catch (IndexOutOfBoundsException e) {
+ return values.get(docID);
+ }
+ }
+ };
+ break;
+ }
default:
throw new AssertionError();
}
Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50NormsFormat.java Fri Oct 31 20:25:22 2014
@@ -43,85 +43,88 @@ public class TestLucene50NormsFormat ext
public void testNormMapSimple() {
NormMap map = new NormMap();
- map.add(10);
- map.add(5);
- map.add(4);
- map.add(10);
+ map.add((byte)4);
+ map.add((byte) 10);
+ map.add((byte) 5);
+ map.add((byte)10);
assertEquals(3, map.size);
// first come, first serve ord assignment
-
- // encode
- assertEquals(0, map.getOrd(10));
- assertEquals(1, map.getOrd(5));
- assertEquals(2, map.getOrd(4));
-
- // decode
- long decode[] = map.getDecodeTable();
- assertEquals(10, decode[0]);
- assertEquals(5, decode[1]);
- assertEquals(4, decode[2]);
-
- // freqs
- int freqs[] = map.getFreqs();
- assertEquals(2, freqs[0]);
- assertEquals(1, freqs[1]);
- assertEquals(1, freqs[2]);
-
- assertEquals(2, map.maxFreq());
+ assertEquals(0, map.ord((byte) 4));
+ assertEquals(1, map.ord((byte) 10));
+ assertEquals(2, map.ord((byte) 5));
+
+ assertEquals(4, map.values[0]);
+ assertEquals(10, map.values[1]);
+ assertEquals(5, map.values[2]);
+
+ assertEquals(1, map.freqs[0]);
+ assertEquals(2, map.freqs[1]);
+ assertEquals(1, map.freqs[2]);
+
+ // optimizing reorders the ordinals
+ map.optimizeOrdinals();
+ assertEquals(0, map.ord((byte)10));
+ assertEquals(1, map.ord((byte)4));
+ assertEquals(2, map.ord((byte)5));
+
+ assertEquals(10, map.values[0]);
+ assertEquals(4, map.values[1]);
+ assertEquals(5, map.values[2]);
+
+ assertEquals(2, map.freqs[0]);
+ assertEquals(1, map.freqs[1]);
+ assertEquals(1, map.freqs[2]);
}
public void testNormMapRandom() {
- Map<Long,Integer> freqs = new HashMap<>();
- Map<Long,Integer> ords = new HashMap<>();
-
- Set<Long> uniqueValuesSet = new HashSet<>();
+
+ Set<Byte> uniqueValuesSet = new HashSet<>();
int numUniqValues = TestUtil.nextInt(random(), 1, 256);
for (int i = 0; i < numUniqValues; i++) {
- if (random().nextBoolean()) {
- uniqueValuesSet.add(TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE));
- } else {
- uniqueValuesSet.add(TestUtil.nextLong(random(), Byte.MIN_VALUE, Byte.MAX_VALUE));
- }
+ uniqueValuesSet.add(Byte.valueOf((byte)TestUtil.nextInt(random(), Byte.MIN_VALUE, Byte.MAX_VALUE)));
}
-
- Long uniqueValues[] = uniqueValuesSet.toArray(new Long[uniqueValuesSet.size()]);
-
+ Byte uniqueValues[] = uniqueValuesSet.toArray(new Byte[uniqueValuesSet.size()]);
+
+ Map<Byte,Integer> freqs = new HashMap<>();
NormMap map = new NormMap();
int numdocs = TestUtil.nextInt(random(), 1, 100000);
for (int i = 0; i < numdocs; i++) {
- long value = uniqueValues[random().nextInt(uniqueValues.length)];
+ byte value = uniqueValues[random().nextInt(uniqueValues.length)];
// now add to both expected and actual
map.add(value);
-
- Integer ord = ords.get(value);
- if (ord == null) {
- ord = ords.size();
- ords.put(value, ord);
- freqs.put(value, 1);
+ if (freqs.containsKey(value)) {
+ freqs.put(value, freqs.get(value) + 1);
} else {
- freqs.put(value, freqs.get(value)+1);
+ freqs.put(value, 1);
}
}
-
- // value -> ord
- assertEquals(ords.size(), map.size);
- for (Map.Entry<Long,Integer> kv : ords.entrySet()) {
- assertEquals(kv.getValue().intValue(), map.getOrd(kv.getKey()));
+
+ assertEquals(freqs.size(), map.size);
+ for (Map.Entry<Byte,Integer> kv : freqs.entrySet()) {
+ byte value = kv.getKey();
+ int freq = kv.getValue();
+ int ord = map.ord(value);
+ assertEquals(freq, map.freqs[ord]);
+ assertEquals(value, map.values[ord]);
}
-
- // ord -> value
- Map<Long,Integer> reversed = new HashMap<>();
- long table[] = map.getDecodeTable();
- for (int i = 0; i < map.size; i++) {
- reversed.put(table[i], i);
+
+ // optimizing should reorder ordinals from greatest to least frequency
+ map.optimizeOrdinals();
+ // recheck consistency
+ assertEquals(freqs.size(), map.size);
+ for (Map.Entry<Byte,Integer> kv : freqs.entrySet()) {
+ byte value = kv.getKey();
+ int freq = kv.getValue();
+ int ord = map.ord(value);
+ assertEquals(freq, map.freqs[ord]);
+ assertEquals(value, map.values[ord]);
}
- assertEquals(ords, reversed);
-
- // freqs
- int freqTable[] = map.getFreqs();
- for (int i = 0; i < map.size; i++) {
- assertEquals(freqs.get(table[i]).longValue(), freqTable[i]);
+ // also check descending freq
+ int prevFreq = map.freqs[0];
+ for (int i = 1; i < map.size; ++i) {
+ assertTrue(prevFreq >= map.freqs[i]);
+ prevFreq = map.freqs[i];
}
}
}
Modified: lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java?rev=1635856&r1=1635855&r2=1635856&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java (original)
+++ lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseNormsFormatTestCase.java Fri Oct 31 20:25:22 2014
@@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Random;
+import com.carrotsearch.randomizedtesting.annotations.Seed;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
@@ -43,6 +44,7 @@ import org.apache.lucene.util.TestUtil;
* test passes, then all Lucene/Solr tests should also pass. Ie,
* if there is some bug in a given NormsFormat that this
* test fails to catch then this test needs to be improved! */
+@Seed(value = "AD2222476BCB8800")
public abstract class BaseNormsFormatTestCase extends BaseIndexFileFormatTestCase {
public void testByteRange() throws Exception {
@@ -182,6 +184,32 @@ public abstract class BaseNormsFormatTes
}
}
+ public void testNCommon() throws Exception {
+ final int iterations = atLeast(1);
+ final Random r = random();
+ for (int i = 0; i < iterations; ++i) {
+ // 16 is 4 bpv, the max before we jump to 8bpv
+ for (int n = 2; n < 16; ++n) {
+ final int N = n;
+ final long[] commonValues = new long[N];
+ for (int j = 0; j < N; ++j) {
+ commonValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+ }
+ final int numOtherValues = TestUtil.nextInt(r, 2, 256 - N);
+ final long[] otherValues = new long[numOtherValues];
+ for (int j = 0; j < numOtherValues; ++j) {
+ otherValues[j] = TestUtil.nextLong(r, Byte.MIN_VALUE, Byte.MAX_VALUE);
+ }
+ doTestNormsVersusStoredFields(new LongProducer() {
+ @Override
+ long next() {
+ return r.nextInt(100) == 0 ? otherValues[r.nextInt(numOtherValues - 1)] : commonValues[r.nextInt(N - 1)];
+ }
+ });
+ }
+ }
+ }
+
private void doTestNormsVersusStoredFields(LongProducer longs) throws Exception {
int numDocs = atLeast(500);
long norms[] = new long[numDocs];
@@ -226,7 +254,7 @@ public abstract class BaseNormsFormatTes
NumericDocValues docValues = r.getNormValues("stored");
for (int i = 0; i < r.maxDoc(); i++) {
long storedValue = Long.parseLong(r.document(i).get("stored"));
- assertEquals(storedValue, docValues.get(i));
+ assertEquals("doc " + i, storedValue, docValues.get(i));
}
}
ir.close();