You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2015/08/26 19:24:07 UTC
svn commit: r1697977 - in /lucene/dev/branches/branch_5x: ./ solr/
solr/core/ solr/core/src/java/org/apache/solr/util/hll/
solr/core/src/test/org/apache/solr/util/hll/
Author: hossman
Date: Wed Aug 26 17:24:07 2015
New Revision: 1697977
URL: http://svn.apache.org/r1697977
Log:
SOLR-7954: Fixed an integer overflow bug in the HyperLogLog code used by the 'cardinality' option of stats.field to prevent ArrayIndexOutOfBoundsException in a distributed search when a large precision is selected and a large number of values exist in each shard (merge r1697969)
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/solr/core/ (props changed)
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java
lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java
Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1697977&r1=1697976&r2=1697977&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Wed Aug 26 17:24:07 2015
@@ -79,6 +79,10 @@ Bug Fixes
* SOLR-7929: SimplePostTool (also bin/post) -filetypes "*" now works properly in 'web' mode (Erik Hatcher)
+* SOLR-7954: Fixed an integer overflow bug in the HyperLogLog code used by the 'cardinality' option
+ of stats.field to prevent ArrayIndexOutOfBoundsException in a distributed search when a large precision
+ is selected and a large number of values exist in each shard (hossman)
+
Optimizations
----------------------
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java?rev=1697977&r1=1697976&r2=1697977&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java Wed Aug 26 17:24:07 2015
@@ -99,7 +99,7 @@ class BigEndianAscendingWordDeserializer
}
// First bit of the word
- final long firstBitIndex = (position * wordLength);
+ final long firstBitIndex = ((long)position) * ((long)wordLength);
final int firstByteIndex = (bytePadding + (int)(firstBitIndex / BITS_PER_BYTE));
final int firstByteSkipBits = (int)(firstBitIndex % BITS_PER_BYTE);
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java?rev=1697977&r1=1697976&r2=1697977&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java Wed Aug 26 17:24:07 2015
@@ -85,7 +85,7 @@ class BigEndianAscendingWordSerializer i
this.wordLength = wordLength;
this.wordCount = wordCount;
- final long bitsRequired = (wordLength * wordCount);
+ final long bitsRequired = ((long)wordLength) * ((long)wordCount);
final boolean leftoverBits = ((bitsRequired % BITS_PER_BYTE) != 0);
final int bytesRequired = (int)(bitsRequired / BITS_PER_BYTE) + (leftoverBits ? 1 : 0) + bytePadding;
bytes = new byte[bytesRequired];
Modified: lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java?rev=1697977&r1=1697976&r2=1697977&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java Wed Aug 26 17:24:07 2015
@@ -18,6 +18,8 @@
package org.apache.solr.util.hll;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
import org.junit.Test;
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
@@ -27,6 +29,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Random;
+import java.util.EnumSet;
import static org.apache.solr.util.hll.HLL.*;
@@ -34,55 +37,190 @@ import static org.apache.solr.util.hll.H
* Serialization smoke-tests.
*/
public class HLLSerializationTest extends LuceneTestCase {
- /**
- * A smoke-test that covers serialization/deserialization of an HLL
- * under all possible parameters.
- */
- @Test
- @Slow
- @Nightly
- public void serializationSmokeTest() throws Exception {
- final Random random = new Random(randomLong());
- final int randomCount = 250;
- final List<Long> randoms = new ArrayList<Long>(randomCount);
- for (int i=0; i<randomCount; i++) {
- randoms.add(random.nextLong());
- }
-
- assertCardinality(HLLType.EMPTY, randoms);
- assertCardinality(HLLType.EXPLICIT, randoms);
- assertCardinality(HLLType.SPARSE, randoms);
- assertCardinality(HLLType.FULL, randoms);
+
+ /**
+ * A smoke-test that covers serialization/deserialization of an HLL
+ * under most possible init parameters.
+ */
+ @Test
+ @Slow
+ @Nightly
+ public void serializationSmokeTest() throws Exception {
+ final Random random = new Random(randomLong());
+ final int randomCount = 250;
+ final List<Long> randoms = new ArrayList<Long>(randomCount);
+ for (int i=0; i<randomCount; i++) {
+ randoms.add(random.nextLong());
}
-
+
// NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
// completes in a reasonable amount of time. Not much is gained by
- // testing larger values - there are no more known serialization
- // related edge cases that appear as log2m gets even larger.
- // NOTE: This test completed successfully with log2m<=MAXIMUM_LOG2M_PARAM
- // on 2014-01-30.
- private static void assertCardinality(final HLLType hllType, final Collection<Long> items)
- throws CloneNotSupportedException {
- for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) {
- for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
- for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
- for(final boolean sparse: new boolean[]{true, false}) {
- HLL hll = new HLL(log2m, regw, expthr, sparse, hllType);
- for(final Long item: items) {
- hll.addRaw(item);
- }
- HLL copy = HLL.fromBytes(hll.toBytes());
- assertEquals(copy.cardinality(), hll.cardinality());
- assertEquals(copy.getType(), hll.getType());
- assertTrue(Arrays.equals(copy.toBytes(), hll.toBytes()));
-
- HLL clone = hll.clone();
- assertEquals(clone.cardinality(), hll.cardinality());
- assertEquals(clone.getType(), hll.getType());
- assertTrue(Arrays.equals(clone.toBytes(), hll.toBytes()));
- }
- }
- }
+ // testing larger values
+ final int maxLog2m = 16;
+ for (HLLType type : EnumSet.allOf(HLLType.class)) {
+ assertCardinality(type, maxLog2m, randoms);
+ }
+ }
+
+ /**
+ * A smoke-test that covers serialization/deserialization of HLLs
+ * under the max possible numeric init parameters, iterating over all possible combinations of
+ * the other params.
+ *
+ * @see #manyValuesHLLSerializationTest
+ */
+ @Test
+ @Slow
+ @Monster("needs roughly -Dtests.heapsize=8g because of the (multiple) massive data structs")
+ public void monsterHLLSerializationTest() throws Exception {
+ final Random random = new Random(randomLong());
+ final int randomCount = 250;
+ final List<Long> randoms = new ArrayList<Long>(randomCount);
+ for (int i=0; i<randomCount; i++) {
+ randoms.add(random.nextLong());
+ }
+
+ for (HLLType type : EnumSet.allOf(HLLType.class)) {
+ for (boolean sparse : new boolean[] {true, false} ) {
+ HLL hll = new HLL(MAXIMUM_LOG2M_PARAM, MAXIMUM_REGWIDTH_PARAM, MAXIMUM_EXPTHRESH_PARAM,
+ sparse, type);
+ assertCardinality(hll, randoms);
+ }
+ }
+ }
+
+ /**
+ * A smoke-test that covers serialization/deserialization of a (single) HLL
+ * with random init params with an extremely large number of unique values added to it.
+ *
+ * @see #monsterHLLSerializationTest
+ */
+ @Test
+ @Slow
+ @Monster("may require as much as -Dtests.heapsize=4g depending on random values picked")
+ public void manyValuesHLLSerializationTest() throws Exception {
+
+ final HLLType[] ALL_TYPES = EnumSet.allOf(HLLType.class).toArray(new HLLType[0]);
+ Arrays.sort(ALL_TYPES);
+
+ final int log2m = TestUtil.nextInt(random(), MINIMUM_LOG2M_PARAM, MAXIMUM_LOG2M_PARAM);
+ final int regwidth = TestUtil.nextInt(random(), MINIMUM_REGWIDTH_PARAM, MAXIMUM_REGWIDTH_PARAM);
+ final int expthresh = TestUtil.nextInt(random(), MINIMUM_EXPTHRESH_PARAM, MAXIMUM_EXPTHRESH_PARAM);
+ final boolean sparse = random().nextBoolean();
+ final HLLType type = ALL_TYPES[TestUtil.nextInt(random(), 0, ALL_TYPES.length-1)];
+
+ HLL hll = new HLL(log2m, regwidth, expthresh, sparse, type);
+
+ final long NUM_VALS = TestUtil.nextLong(random(), 150000, 1000000);
+ final long MIN_VAL = TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE-NUM_VALS);
+ final long MAX_VAL = MIN_VAL + NUM_VALS;
+ assert MIN_VAL < MAX_VAL;
+
+ for (long val = MIN_VAL; val < MAX_VAL; val++) {
+ hll.addRaw(val);
+ }
+
+ final long expectedCardinality = hll.cardinality();
+ final HLLType expectedType = hll.getType();
+
+ byte[] serializedData = hll.toBytes();
+ hll = null; // allow some GC
+
+ HLL copy = HLL.fromBytes(serializedData);
+ serializedData = null; // allow some GC
+
+ assertEquals(expectedCardinality, copy.cardinality());
+ assertEquals(expectedType, copy.getType());
+
+ }
+
+ /**
+ * A smoke-test that covers serialization/deserialization of a (single) HLL
+ * with random the max possible numeric init parameters, with randomized values for the other params.
+ *
+ * @see #monsterHLLSerializationTest
+ */
+ @Test
+ @Slow
+ @Monster("can require as much as -Dtests.heapsize=4g because of the massive data structs")
+ public void manyValuesMonsterHLLSerializationTest() throws Exception {
+
+ final HLLType[] ALL_TYPES = EnumSet.allOf(HLLType.class).toArray(new HLLType[0]);
+ Arrays.sort(ALL_TYPES);
+
+ final boolean sparse = random().nextBoolean();
+ final HLLType type = ALL_TYPES[TestUtil.nextInt(random(), 0, ALL_TYPES.length-1)];
+
+ HLL hll = new HLL(MAXIMUM_LOG2M_PARAM, MAXIMUM_REGWIDTH_PARAM, MAXIMUM_EXPTHRESH_PARAM, sparse, type);
+
+ final long NUM_VALS = TestUtil.nextLong(random(), 150000, 1000000);
+ final long MIN_VAL = TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE-NUM_VALS);
+ final long MAX_VAL = MIN_VAL + NUM_VALS;
+ assert MIN_VAL < MAX_VAL;
+
+ for (long val = MIN_VAL; val < MAX_VAL; val++) {
+ hll.addRaw(val);
+ }
+
+ final long expectedCardinality = hll.cardinality();
+ final HLLType expectedType = hll.getType();
+
+ byte[] serializedData = hll.toBytes();
+ hll = null; // allow some GC
+
+ HLL copy = HLL.fromBytes(serializedData);
+ serializedData = null; // allow some GC
+
+ assertEquals(expectedCardinality, copy.cardinality());
+ assertEquals(expectedType, copy.getType());
+
+ }
+
+ /**
+ * Iterates over all possible constructor args, with the exception of log2m,
+ * which is only iterated up to the specified max so the test runs in a
+ * "reasonable" amount of time and ram.
+ */
+ private static void assertCardinality(final HLLType hllType,
+ final int maxLog2m,
+ final Collection<Long> items) throws CloneNotSupportedException {
+ for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
+ for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
+ for(final boolean sparse: new boolean[]{true, false}) {
+ for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=maxLog2m; log2m++) {
+ assertCardinality(new HLL(log2m, regw, expthr, sparse, hllType), items);
+ }
}
+ }
+ }
+ }
+
+ /**
+ * Adds all of the items to the specified hll, then does a round trip serialize/deserialize and confirms
+ * equality of several properties (including the byte serialization). Repeats process with a clone.
+ */
+ private static void assertCardinality(HLL hll, final Collection<Long> items)
+ throws CloneNotSupportedException {
+
+ for (final Long item: items) {
+ hll.addRaw(item);
}
+
+ final long hllCardinality = hll.cardinality();
+ final HLLType hllType = hll.getType();
+ final byte[] hllBytes = hll.toBytes();
+ hll = null; // allow some GC
+
+ HLL copy = HLL.fromBytes(hllBytes);
+ assertEquals(copy.cardinality(), hllCardinality);
+ assertEquals(copy.getType(), hllType);
+ assertTrue(Arrays.equals(copy.toBytes(), hllBytes));
+
+ HLL clone = copy.clone();
+ copy = null; // allow some GC
+
+ assertEquals(clone.cardinality(), hllCardinality);
+ assertEquals(clone.getType(), hllType);
+ assertTrue(Arrays.equals(clone.toBytes(), hllBytes));
+ }
}