You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/07/01 15:54:41 UTC
svn commit: r1607080 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/CHANGES.txt lucene/core/
lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsConsumer.java
Author: rmuir
Date: Tue Jul 1 13:54:41 2014
New Revision: 1607080
URL: http://svn.apache.org/r1607080
Log:
LUCENE-5797: Optimize norms merging
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsConsumer.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1607080&r1=1607079&r2=1607080&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Tue Jul 1 13:54:41 2014
@@ -26,6 +26,8 @@ Optimizations
* LUCENE-5799: Optimize numeric docvalues merging. (Robert Muir)
+* LUCENE-5797: Optimize norms merging (Adrien Grand, Robert Muir)
+
Test Framework
* LUCENE-5786: Unflushed/ truncated events file (hung testing subprocess).
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsConsumer.java?rev=1607080&r1=1607079&r2=1607080&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsConsumer.java Tue Jul 1 13:54:41 2014
@@ -20,7 +20,7 @@ package org.apache.lucene.codecs.lucene4
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
-import java.util.HashSet;
+import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
@@ -79,8 +79,7 @@ class Lucene49NormsConsumer extends DocV
long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
// TODO: more efficient?
- HashSet<Long> uniqueValues = null;
- uniqueValues = new HashSet<>();
+ NormMap uniqueValues = new NormMap();
long count = 0;
for (Number nv : values) {
@@ -94,7 +93,7 @@ class Lucene49NormsConsumer extends DocV
if (uniqueValues != null) {
if (uniqueValues.add(v)) {
- if (uniqueValues.size() > 256) {
+ if (uniqueValues.size > 256) {
uniqueValues = null;
}
}
@@ -106,7 +105,7 @@ class Lucene49NormsConsumer extends DocV
throw new IllegalStateException("illegal norms data for field " + field.name + ", expected " + maxDoc + " values, got " + count);
}
- if (uniqueValues != null && uniqueValues.size() == 1) {
+ if (uniqueValues != null && uniqueValues.size == 1) {
// 0 bpv
meta.writeByte(CONST_COMPRESSED);
meta.writeLong(minValue);
@@ -114,7 +113,7 @@ class Lucene49NormsConsumer extends DocV
// small number of unique values: this is the typical case:
// we only use bpv=1,2,4,8
PackedInts.Format format = PackedInts.Format.PACKED_SINGLE_BLOCK;
- int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
+ int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size-1);
if (bitsPerValue == 3) {
bitsPerValue = 4;
} else if (bitsPerValue > 4) {
@@ -132,15 +131,12 @@ class Lucene49NormsConsumer extends DocV
meta.writeLong(data.getFilePointer());
data.writeVInt(PackedInts.VERSION_CURRENT);
- Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
- Arrays.sort(decode);
- final HashMap<Long,Integer> encode = new HashMap<>();
+ long[] decode = uniqueValues.getDecodeTable();
// upgrade to power of two sized array
int size = 1 << bitsPerValue;
data.writeVInt(size);
for (int i = 0; i < decode.length; i++) {
data.writeLong(decode[i]);
- encode.put(decode[i], i);
}
for (int i = decode.length; i < size; i++) {
data.writeLong(0);
@@ -151,7 +147,7 @@ class Lucene49NormsConsumer extends DocV
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, format, maxDoc, bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
for(Number nv : values) {
- writer.add(encode.get(nv.longValue()));
+ writer.add(uniqueValues.getOrd(nv.longValue()));
}
writer.finish();
}
@@ -210,4 +206,66 @@ class Lucene49NormsConsumer extends DocV
public void addSortedNumericField(FieldInfo field, Iterable<Number> docToValueCount, Iterable<Number> values) throws IOException {
throw new UnsupportedOperationException();
}
+
+ // specialized deduplication of long->ord for norms: 99.99999% of the time this will be a single-byte range.
+ static class NormMap {
+ // we use short: at most we will add 257 values to this map before its rejected as too big above.
+ // dense table for values in [Byte.MIN_VALUE, Byte.MAX_VALUE]: slot (value+128) holds the
+ // assigned ord, or -1 if that value has not been added yet (see initializer below)
+ short[] singleByteRange = new short[256];
+ // sparse fallback mapping value->ord for the rare values outside the single-byte range
+ Map<Long,Short> other = new HashMap<Long,Short>();
+ // number of distinct values added so far; also the next ord to assign
+ int size;
+
+ // instance initializer: mark every single-byte slot as "not yet assigned"
+ {
+ Arrays.fill(singleByteRange, (short)-1);
+ }
+
+ /** adds an item to the mapping. returns true if actually added */
+ public boolean add(long l) {
+ assert size <= 256; // once we add > 256 values, we nullify the map in addNumericField and don't use this strategy
+ if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
+ int index = (int) (l + 128);
+ short previous = singleByteRange[index];
+ if (previous < 0) {
+ // first time we see this value: assign it the next ord
+ singleByteRange[index] = (short) size;
+ size++;
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ if (!other.containsKey(l)) {
+ // first time we see this out-of-range value: assign it the next ord
+ other.put(l, (short)size);
+ size++;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ /** gets the ordinal for a previously added item */
+ public int getOrd(long l) {
+ if (l >= Byte.MIN_VALUE && l <= Byte.MAX_VALUE) {
+ int index = (int) (l + 128);
+ // returns -1 if this in-range value was never add()ed (slot still at its initial value)
+ return singleByteRange[index];
+ } else {
+ // NPE if something is screwed up
+ return other.get(l);
+ }
+ }
+
+ /** retrieves the ordinal table for previously added items */
+ public long[] getDecodeTable() {
+ // inverse mapping: decode[ord] = value, built from both the dense table and the sparse map
+ long decode[] = new long[size];
+ for (int i = 0; i < singleByteRange.length; i++) {
+ short s = singleByteRange[i];
+ if (s >= 0) {
+ // slot i corresponds to value (i - 128); s is the ord assigned in add()
+ decode[s] = i - 128;
+ }
+ }
+ for (Map.Entry<Long,Short> entry : other.entrySet()) {
+ decode[entry.getValue()] = entry.getKey();
+ }
+ return decode;
+ }
+ }
}