You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2015/08/26 18:33:39 UTC

svn commit: r1697969 - in /lucene/dev/trunk/solr: ./ core/src/java/org/apache/solr/util/hll/ core/src/test/org/apache/solr/util/hll/

Author: hossman
Date: Wed Aug 26 16:33:39 2015
New Revision: 1697969

URL: http://svn.apache.org/r1697969
Log:
SOLR-7954: Fixed an integer overflow bug in the HyperLogLog code used by the 'cardinality' option of stats.field to prevent ArrayIndexOutOfBoundsException in a distributed search when a large precision is selected and a large number of values exist in each shard

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1697969&r1=1697968&r2=1697969&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Aug 26 16:33:39 2015
@@ -159,6 +159,10 @@ Bug Fixes
 
 * SOLR-7929: SimplePostTool (also bin/post) -filetypes "*" now works properly in 'web' mode (Erik Hatcher)
 
+* SOLR-7954: Fixed an integer overflow bug in the HyperLogLog code used by the 'cardinality' option
+  of stats.field to prevent ArrayIndexOutOfBoundsException in a distributed search when a large precision
+  is selected and a large number of values exist in each shard (hossman)
+
 Optimizations
 ----------------------
 

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java?rev=1697969&r1=1697968&r2=1697969&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java Wed Aug 26 16:33:39 2015
@@ -99,7 +99,7 @@ class BigEndianAscendingWordDeserializer
         }
 
         // First bit of the word
-        final long firstBitIndex = (position * wordLength);
+        final long firstBitIndex = ((long)position) * ((long)wordLength);
         final int firstByteIndex = (bytePadding + (int)(firstBitIndex / BITS_PER_BYTE));
         final int firstByteSkipBits = (int)(firstBitIndex % BITS_PER_BYTE);
 

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java?rev=1697969&r1=1697968&r2=1697969&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java Wed Aug 26 16:33:39 2015
@@ -85,7 +85,7 @@ class BigEndianAscendingWordSerializer i
         this.wordLength = wordLength;
         this.wordCount = wordCount;
 
-        final long bitsRequired = (wordLength * wordCount);
+        final long bitsRequired = ((long)wordLength) * ((long)wordCount);
         final boolean leftoverBits = ((bitsRequired % BITS_PER_BYTE) != 0);
         final int bytesRequired = (int)(bitsRequired / BITS_PER_BYTE) + (leftoverBits ? 1 : 0) + bytePadding;
         bytes = new byte[bytesRequired];

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java?rev=1697969&r1=1697968&r2=1697969&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java Wed Aug 26 16:33:39 2015
@@ -18,6 +18,8 @@
 package org.apache.solr.util.hll;
 
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
 import org.junit.Test;
 
 import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
@@ -27,6 +29,7 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
 import java.util.Random;
+import java.util.EnumSet;
 
 import static org.apache.solr.util.hll.HLL.*;
 
@@ -34,55 +37,190 @@ import static org.apache.solr.util.hll.H
  * Serialization smoke-tests.
  */
 public class HLLSerializationTest extends LuceneTestCase {
-    /**
-     * A smoke-test that covers serialization/deserialization of an HLL
-     * under all possible parameters.
-     */
-    @Test
-    @Slow
-    @Nightly
-    public void serializationSmokeTest() throws Exception {
-        final Random random = new Random(randomLong());
-        final int randomCount = 250;
-        final List<Long> randoms = new ArrayList<Long>(randomCount);
-        for (int i=0; i<randomCount; i++) {
-          randoms.add(random.nextLong());
-      }
-
-        assertCardinality(HLLType.EMPTY, randoms);
-        assertCardinality(HLLType.EXPLICIT, randoms);
-        assertCardinality(HLLType.SPARSE, randoms);
-        assertCardinality(HLLType.FULL, randoms);
+  
+  /**
+   * A smoke-test that covers serialization/deserialization of an HLL
+   * under most possible init parameters.
+   */
+  @Test
+  @Slow
+  @Nightly
+  public void serializationSmokeTest() throws Exception {
+    final Random random = new Random(randomLong());
+    final int randomCount = 250;
+    final List<Long> randoms = new ArrayList<Long>(randomCount);
+    for (int i=0; i<randomCount; i++) {
+      randoms.add(random.nextLong());
     }
-
+    
     // NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
     //       completes in a reasonable amount of time. Not much is gained by
-    //       testing larger values - there are no more known serialization
-    //       related edge cases that appear as log2m gets even larger.
-    // NOTE: This test completed successfully with log2m<=MAXIMUM_LOG2M_PARAM
-    //       on 2014-01-30.
-    private static void assertCardinality(final HLLType hllType, final Collection<Long> items)
-           throws CloneNotSupportedException {
-        for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) {
-            for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
-                for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
-                    for(final boolean sparse: new boolean[]{true, false}) {
-                        HLL hll = new HLL(log2m, regw, expthr, sparse, hllType);
-                        for(final Long item: items) {
-                            hll.addRaw(item);
-                        }
-                        HLL copy = HLL.fromBytes(hll.toBytes());
-                        assertEquals(copy.cardinality(), hll.cardinality());
-                        assertEquals(copy.getType(), hll.getType());
-                        assertTrue(Arrays.equals(copy.toBytes(), hll.toBytes()));
-
-                        HLL clone = hll.clone();
-                        assertEquals(clone.cardinality(), hll.cardinality());
-                        assertEquals(clone.getType(), hll.getType());
-                        assertTrue(Arrays.equals(clone.toBytes(), hll.toBytes()));
-                    }
-                }
-            }
+    //       testing larger values
+    final int maxLog2m = 16;
+    for (HLLType type : EnumSet.allOf(HLLType.class)) {
+      assertCardinality(type, maxLog2m, randoms);
+    }
+  }
+  
+  /**
+   * A smoke-test that covers serialization/deserialization of HLLs
+   * under the max possible numeric init parameters, iterating over all possible combinations of 
+   * the other params.
+   *
+   * @see #manyValuesHLLSerializationTest
+   */
+  @Test
+  @Slow
+  @Monster("needs roughly -Dtests.heapsize=8g because of the (multiple) massive data structs")
+  public void monsterHLLSerializationTest() throws Exception {
+    final Random random = new Random(randomLong());
+    final int randomCount = 250;
+    final List<Long> randoms = new ArrayList<Long>(randomCount);
+    for (int i=0; i<randomCount; i++) {
+      randoms.add(random.nextLong());
+    }
+
+    for (HLLType type : EnumSet.allOf(HLLType.class)) {
+      for (boolean sparse : new boolean[] {true, false} ) {
+        HLL hll = new HLL(MAXIMUM_LOG2M_PARAM, MAXIMUM_REGWIDTH_PARAM, MAXIMUM_EXPTHRESH_PARAM,
+                          sparse, type);
+        assertCardinality(hll, randoms);
+      }
+    }
+  }
+  
+  /**
+   * A smoke-test that covers serialization/deserialization of a (single) HLL
+   * with random init params with an extremely large number of unique values added to it.
+   *
+   * @see #monsterHLLSerializationTest
+   */
+  @Test
+  @Slow
+  @Monster("may require as much as -Dtests.heapsize=4g depending on random values picked")
+  public void manyValuesHLLSerializationTest() throws Exception {
+
+    final HLLType[] ALL_TYPES = EnumSet.allOf(HLLType.class).toArray(new HLLType[0]);
+    Arrays.sort(ALL_TYPES);
+      
+    final int log2m = TestUtil.nextInt(random(), MINIMUM_LOG2M_PARAM, MAXIMUM_LOG2M_PARAM);
+    final int regwidth = TestUtil.nextInt(random(), MINIMUM_REGWIDTH_PARAM, MAXIMUM_REGWIDTH_PARAM);
+    final int expthresh = TestUtil.nextInt(random(), MINIMUM_EXPTHRESH_PARAM, MAXIMUM_EXPTHRESH_PARAM);
+    final boolean sparse = random().nextBoolean();
+    final HLLType type = ALL_TYPES[TestUtil.nextInt(random(), 0, ALL_TYPES.length-1)];
+    
+    HLL hll = new HLL(log2m, regwidth, expthresh, sparse, type);
+
+    final long NUM_VALS = TestUtil.nextLong(random(), 150000, 1000000);
+    final long MIN_VAL = TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE-NUM_VALS);
+    final long MAX_VAL = MIN_VAL + NUM_VALS;
+    assert MIN_VAL < MAX_VAL;
+    
+    for (long val = MIN_VAL; val < MAX_VAL; val++) {
+      hll.addRaw(val);
+    }
+    
+    final long expectedCardinality = hll.cardinality();
+    final HLLType expectedType = hll.getType();
+
+    byte[] serializedData = hll.toBytes();
+    hll = null; // allow some GC
+    
+    HLL copy = HLL.fromBytes(serializedData);
+    serializedData = null; // allow some GC
+    
+    assertEquals(expectedCardinality, copy.cardinality());
+    assertEquals(expectedType, copy.getType());
+    
+  }
+  
+  /**
+   * A smoke-test that covers serialization/deserialization of a (single) HLL
+   * with random the max possible numeric init parameters, with randomized values for the other params.
+   *
+   * @see #monsterHLLSerializationTest
+   */
+  @Test
+  @Slow
+  @Monster("can require as much as -Dtests.heapsize=4g because of the massive data structs")
+  public void manyValuesMonsterHLLSerializationTest() throws Exception {
+
+    final HLLType[] ALL_TYPES = EnumSet.allOf(HLLType.class).toArray(new HLLType[0]);
+    Arrays.sort(ALL_TYPES);
+      
+    final boolean sparse = random().nextBoolean();
+    final HLLType type = ALL_TYPES[TestUtil.nextInt(random(), 0, ALL_TYPES.length-1)];
+    
+    HLL hll = new HLL(MAXIMUM_LOG2M_PARAM, MAXIMUM_REGWIDTH_PARAM, MAXIMUM_EXPTHRESH_PARAM, sparse, type);
+
+    final long NUM_VALS = TestUtil.nextLong(random(), 150000, 1000000);
+    final long MIN_VAL = TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE-NUM_VALS);
+    final long MAX_VAL = MIN_VAL + NUM_VALS;
+    assert MIN_VAL < MAX_VAL;
+    
+    for (long val = MIN_VAL; val < MAX_VAL; val++) {
+      hll.addRaw(val);
+    }
+    
+    final long expectedCardinality = hll.cardinality();
+    final HLLType expectedType = hll.getType();
+
+    byte[] serializedData = hll.toBytes();
+    hll = null; // allow some GC
+    
+    HLL copy = HLL.fromBytes(serializedData);
+    serializedData = null; // allow some GC
+    
+    assertEquals(expectedCardinality, copy.cardinality());
+    assertEquals(expectedType, copy.getType());
+    
+  }
+
+  /**
+   * Iterates over all possible constructor args, with the exception of log2m, 
+   * which is only iterated up to the specified max so the test runs in a 
+   * "reasonable" amount of time and ram.
+   */
+  private static void assertCardinality(final HLLType hllType,
+                                        final int maxLog2m,
+                                        final Collection<Long> items) throws CloneNotSupportedException {
+    for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
+      for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
+        for(final boolean sparse: new boolean[]{true, false}) {
+          for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=maxLog2m; log2m++) {
+            assertCardinality(new HLL(log2m, regw, expthr, sparse, hllType), items);
+          }
         }
+      }
+    }
+  }
+
+  /**
+   * Adds all of the items to the specified hll, then does a round trip serialize/deserialize and confirms
+   * equality of several properties (including the byte serialization).  Repeats process with a clone.
+   */
+  private static void assertCardinality(HLL hll, final Collection<Long> items)
+    throws CloneNotSupportedException {
+    
+    for (final Long item: items) {
+      hll.addRaw(item);
     }
+    
+    final long hllCardinality = hll.cardinality();
+    final HLLType hllType = hll.getType();
+    final byte[] hllBytes = hll.toBytes();
+    hll = null; // allow some GC
+    
+    HLL copy = HLL.fromBytes(hllBytes);
+    assertEquals(copy.cardinality(), hllCardinality);
+    assertEquals(copy.getType(), hllType);
+    assertTrue(Arrays.equals(copy.toBytes(), hllBytes));
+    
+    HLL clone = copy.clone();
+    copy = null; // allow some GC
+    
+    assertEquals(clone.cardinality(), hllCardinality);
+    assertEquals(clone.getType(), hllType);
+    assertTrue(Arrays.equals(clone.toBytes(), hllBytes));
+  }
 }