You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2015/07/16 12:47:38 UTC

svn commit: r1691351 [2/2] - /lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/

Added: lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java?rev=1691351&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java Thu Jul 16 10:47:37 2015
@@ -0,0 +1,708 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
+import static org.apache.solr.util.hll.ProbabilisticTestUtil.*;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Random;
+
+/**
+ * Generates test files for testing other implementations of HLL
+ * serialization/deserialization, namely the PostgreSQL implementation.
+ */
+public class IntegrationTestGenerator {
+    // ************************************************************************
+    // directory to output the generated tests
+    private static final String OUTPUT_DIRECTORY = "/tmp/hll_test/";
+
+    // ------------------------------------------------------------------------
+    // configurations for HLLs, should mirror settings in PostgreSQL impl. tests
+    private static final int REGWIDTH = 5;
+    private static final int LOG2M = 11;
+    // NOTE:  This differs from the PostgreSQL impl. parameter 'expthresh'. This
+    //        is a literal threshold to use in the promotion hierarchy, implying
+    //        that both EXPLICIT representation should be used and it should
+    //        NOT be automatically computed. This is done to ensure that the
+    //        parameters of the test are very explicitly defined.
+    private static final int EXPLICIT_THRESHOLD = 256;
+    // NOTE:  This is not the PostgreSQL impl. parameter 'sparseon'. 'sparseon'
+    //        is assumed to be true and this is a literal register-count threshold
+    //        to use in the promotion hierarchy. This is done to ensure that the
+    //        parameters of the test are very explicitly defined.
+    private static final int SPARSE_THRESHOLD = 850;
+
+    // ------------------------------------------------------------------------
+    // computed constants
+    private static final int REGISTER_COUNT = (1 << LOG2M); // m = 2^log2m registers
+    private static final int REGISTER_MAX_VALUE = (1 << REGWIDTH) - 1; // largest value a regwidth-bit register holds
+
+    // ========================================================================
+    // Tests
+    /**
+     * Cumulatively adds random values to a FULL HLL through the small range
+     * correction, uncorrected range, and large range correction of the HLL's
+     * cardinality estimator.
+     *
+     * Format: cumulative add
+     * Tests:
+     * - FULL cardinality computation
+     */
+    private static void fullCardinalityCorrectionTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "cardinality_correction", TestType.ADD);
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.FULL);
+        initLineAdd(output, hll, schemaVersion);
+
+        // run through some values in the small range correction
+        for(int i=0; i<((1 << LOG2M) - 1); i++) {
+            final long rawValue = constructHLLValue(LOG2M, i, 1);
+            cumulativeAddLine(output, hll, rawValue, schemaVersion);
+        }
+
+        // run up past some values in the uncorrected range
+        for(int i=0; i<(1 << LOG2M); i++) {
+            final long rawValue = constructHLLValue(LOG2M, i, 7);
+            cumulativeAddLine(output, hll, rawValue, schemaVersion);
+        }
+
+        // run through some values in the large range correction
+        for(int i=0; i<(1 << LOG2M); i++) {
+            final long rawValue = constructHLLValue(LOG2M, i, 30);
+            cumulativeAddLine(output, hll, rawValue, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Cumulatively adds random values to an EMPTY HLL.
+     *
+     * Format: cumulative add
+     * Tests:
+     * - EMPTY, EXPLICIT, SPARSE, PROBABILSTIC addition
+     * - EMPTY to EXPLICIT promotion
+     * - EXPLICIT to SPARSE promotion
+     * - SPARSE to FULL promotion
+     */
+    private static void globalStepTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "comprehensive_promotion", TestType.ADD);
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.EMPTY);
+        initLineAdd(output, hll, schemaVersion);
+
+        for(int i=0; i<10000/*arbitrary*/; i++) {
+            cumulativeAddLine(output, hll, randomLong(), schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Cumulatively unions "underpopulated" FULL HLLs into the
+     * accumulator to verify the correct behavior from the PostgreSQL implementation.
+     * The PostgreSQL implementation's representations of probabilistic HLLs should
+     * depend exclusively on the chosen SPARSE-to-FULL cutoff.
+     *
+     * Format: cumulative union
+     * Tests:
+     * - EMPTY U "underpopulated" FULL => SPARSE
+     * - SPARSE U "underpopulated" FULL => SPARSE
+     * - SPARSE U "barely underpopulated" FULL => FULL
+     */
+    private static void sparseFullRepresentationTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "sparse_full_representation", TestType.UNION);
+
+        final HLL emptyHLL1 = newHLL(HLLType.EMPTY);
+        final HLL emptyHLL2 = newHLL(HLLType.EMPTY);
+
+        cumulativeUnionLine(output, emptyHLL1, emptyHLL2, schemaVersion);
+
+        // NOTE:  In this test the sparseReference will be the "expected" value
+        //        from the C representation, since it doesn't choose representation
+        //        based on original encoding, but rather on the promotion rules
+        //        and the declared type of the "receiving" field.
+        //        It is the manually-constructed union result.
+
+        // "underpopulated" FULL U EMPTY => SPARSE
+        final HLL fullHLL = newHLL(HLLType.FULL);
+        fullHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
+
+        final HLL sparseHLL = newHLL(HLLType.SPARSE);
+        sparseHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
+
+        output.write(stringCardinality(fullHLL) + "," + toByteA(fullHLL, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
+        output.flush();
+
+        // "underpopulated" FULL (small) U SPARSE (small) => SPARSE
+        final HLL fullHLL2 = newHLL(HLLType.FULL);
+        fullHLL2.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
+
+        sparseHLL.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
+
+        output.write(stringCardinality(fullHLL2) + "," + toByteA(fullHLL2, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
+        output.flush();
+
+        // "underpopulated" FULL (just on edge) U SPARSE (small) => FULL
+        final HLL fullHLL3 = newHLL(HLLType.FULL);
+        for(int i=2; i<(SPARSE_THRESHOLD + 1); i++) {
+            fullHLL3.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
+            sparseHLL.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
+        }
+
+        output.write(stringCardinality(fullHLL3) + "," + toByteA(fullHLL3, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
+        output.flush();
+    }
+
+    /**
+     * Cumulatively sets successive registers to:
+     *
+     *     <code>(registerIndex % REGISTER_MAX_VALUE) + 1</code>
+     *
+     * by adding specifically constructed values to a SPARSE HLL.
+     * Does not induce promotion.
+     *
+     * Format: cumulative add
+     * Tests:
+     * - SPARSE addition (predictable)
+     */
+    private static void sparseStepTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "sparse_step", TestType.ADD);
+
+        // the accumulator, starts empty sparse probabilistic
+        final HLL hll = newHLL(HLLType.SPARSE);
+        initLineAdd(output, hll, schemaVersion);
+
+        for(int i=0; i<SPARSE_THRESHOLD; i++) {
+            final long rawValue = constructHLLValue(LOG2M, i, ((i % REGISTER_MAX_VALUE) + 1));
+            cumulativeAddLine(output, hll, rawValue, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Cumulatively sets random registers of a SPARSE HLL to
+     * random values by adding random values. Does not induce promotion.
+     *
+     * Format: cumulative add
+     * Tests:
+     * - SPARSE addition (random)
+     */
+    private static void sparseRandomTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "sparse_random", TestType.ADD);
+
+        final Random random = new Random(randomLong());
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.SPARSE);
+        initLineAdd(output, hll, schemaVersion);
+
+        for(int i=0; i<SPARSE_THRESHOLD; i++) {
+            final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
+            final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
+            final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
+
+            cumulativeAddLine(output, hll, rawValue, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Cumulatively sets the first register (index 0) to value 2, the last
+     * register (index m-1) to value 2, and then sets registers with indices in
+     * the range 2 to (sparseCutoff + 2) to value 1 to trigger promotion.
+     *
+     * This tests for register alignment in the promotion from SPARSE
+     * to FULL.
+     *
+     * Format: cumulative add
+     * Tests:
+     * - SPARSE addition
+     * - SPARSE to FULL promotion
+     */
+    private static void sparseEdgeTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "sparse_edge", TestType.ADD);
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.SPARSE);
+        initLineAdd(output, hll, schemaVersion);
+
+        final long firstValue = constructHLLValue(LOG2M, 0, 2);
+        cumulativeAddLine(output, hll, firstValue, schemaVersion);
+
+        final long lastValue = constructHLLValue(LOG2M, (1 << LOG2M) - 1, 2);
+        cumulativeAddLine(output, hll, lastValue, schemaVersion);
+
+        for(int i=2; i<(SPARSE_THRESHOLD + 2); i++) {
+            final long middleValue = constructHLLValue(LOG2M, i, 1);
+
+            cumulativeAddLine(output, hll, middleValue, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Unions an EMPTY accumulator with EXPLICIT HLLs, each containing a
+     * single random value.
+     *
+     * Format: cumulative union
+     * Tests:
+     * - EMPTY U EXPLICIT
+     * - EXPLICIT U EXPLICIT
+     * - EXPLICIT to SPARSE promotion
+     * - SPARSE U EXPLICIT
+     */
+    private static void explicitPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "explicit_promotion", TestType.UNION);
+
+        final Random random = new Random(randomLong());
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.EMPTY);
+        final HLL emptyHLL = newHLL(HLLType.EMPTY);
+        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+        for(int i=0; i<(EXPLICIT_THRESHOLD+500)/*should be greater than promotion cutoff*/; i++) {
+            // make an EXPLICIT set and populate with cardinality 1
+            final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
+            explicitHLL.addRaw(random.nextLong());
+
+            cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Unions an EMPTY accumulator with SPARSE HLLs, each
+     * having one register set.
+     *
+     * Format: cumulative union
+     * Tests:
+     * - EMPTY U SPARSE
+     * - SPARSE U SPARSE
+     * - SPARSE promotion
+     * - SPARSE U FULL
+     */
+    private static void sparseProbabilisticPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "sparse_promotion", TestType.UNION);
+
+        final Random random = new Random(randomLong());
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.EMPTY);
+        final HLL emptyHLL = newHLL(HLLType.EMPTY);
+        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+
+        for(int i=0; i<(SPARSE_THRESHOLD + 1000)/*should be greater than promotion cutoff*/; i++) {
+            // make a SPARSE set and populate with cardinality 1
+            final HLL sparseHLL = newHLL(HLLType.SPARSE);
+
+            final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
+            final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
+            final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
+            sparseHLL.addRaw(rawValue);
+
+            cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Unions an EMPTY accumulator with EXPLICIT HLLs, each having a single
+     * random value, twice in a row to verify that the set properties are
+     * satisfied.
+     *
+     * Format: cumulative union
+     * Tests:
+     * - EMPTY U EXPLICIT
+     * - EXPLICIT U EXPLICIT
+     */
+    private static void explicitOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "explicit_explicit", TestType.UNION);
+
+        final Random random = new Random(randomLong());
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.EMPTY);
+        final HLL emptyHLL = newHLL(HLLType.EMPTY);
+
+        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+        for(int i=0; i<EXPLICIT_THRESHOLD; i++) {
+            // make an EXPLICIT set and populate with cardinality 1
+            final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
+            explicitHLL.addRaw(random.nextLong());
+
+            // union it into the accumulator twice, to test overlap (cardinality should not change)
+            cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
+            cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Unions an EMPTY accumulator with SPARSE HLLs, each
+     * having a single register set, twice in a row to verify that the set
+     * properties are satisfied.
+     *
+     * Format: cumulative union
+     * Tests:
+     * - EMPTY U SPARSE
+     * - SPARSE U SPARSE
+     */
+    private static void sparseProbabilisticOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "sparse_sparse", TestType.UNION);
+
+        final Random random = new Random(randomLong());
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.EMPTY);
+        final HLL emptyHLL = newHLL(HLLType.EMPTY);
+
+        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+        for(int i=0; i<SPARSE_THRESHOLD; i++) {
+            // make a SPARSE set and populate with cardinality 1
+            final HLL sparseHLL = newHLL(HLLType.SPARSE);
+            final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
+            final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
+            final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
+            sparseHLL.addRaw(rawValue);
+
+            cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Unions an EMPTY accumulator with FULL HLLs, each having
+     * many registers set, twice in a row to verify that the set properties are
+     * satisfied.
+     *
+     * Format: cumulative union
+     * Tests:
+     * - EMPTY U FULL
+     * - FULL U FULL
+     */
+    private static void probabilisticUnionTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "probabilistic_probabilistic", TestType.UNION);
+
+        final Random random = new Random(randomLong());
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.EMPTY);
+        final HLL emptyHLL = newHLL(HLLType.EMPTY);
+        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+        for(int i=0; i<1000/*number of rows to generate*/; i++) {
+            // make a FULL set and populate with
+            final HLL fullHLL = newHLL(HLLType.FULL);
+            final int elementCount = random.nextInt(10000/*arbitrary maximum cardinality*/);
+            for(int j=0;j<elementCount;j++) {
+                fullHLL.addRaw(random.nextLong());
+            }
+
+            cumulativeUnionLine(output, hll, fullHLL, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    /**
+     * Unions an EMPTY accumulator with random HLLs.
+     *
+     * Format: cumulative union
+     * Tests:
+     * - hopefully all union possibilities
+     */
+    private static void globalUnionTest(final ISchemaVersion schemaVersion) throws IOException {
+        final FileWriter output = openOutput(schemaVersion, "comprehensive", TestType.UNION);
+
+        // the accumulator, starts empty
+        final HLL hll = newHLL(HLLType.EMPTY);
+        final HLL emptyHLL = newHLL(HLLType.EMPTY);
+
+        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+        for(int i=0; i<1000/*number of rows to generate*/; i++) {
+            final HLL randomHLL = generateRandomHLL();
+            cumulativeUnionLine(output, hll, randomHLL, schemaVersion);
+        }
+
+        output.flush();
+        output.close();
+    }
+
+    // ========================================================================
+    // Main
+    /**
+     * Runs every test-file generator in this class for the given schema
+     * version, writing one CSV per test into {@link #OUTPUT_DIRECTORY}.
+     *
+     * @param  schemaVersion the schema with which to serialize the HLLs. This
+     *         cannot be <code>null</code>.
+     * @throws IOException if any output file cannot be written.
+     */
+    public static void fullSuite(final ISchemaVersion schemaVersion) throws IOException {
+        fullCardinalityCorrectionTest(schemaVersion);
+        globalUnionTest(schemaVersion);
+        globalStepTest(schemaVersion);
+        probabilisticUnionTest(schemaVersion);
+        explicitPromotionTest(schemaVersion);
+        explicitOverlapTest(schemaVersion);
+        sparseFullRepresentationTest(schemaVersion);
+        sparseStepTest(schemaVersion);
+        sparseRandomTest(schemaVersion);
+        sparseEdgeTest(schemaVersion);
+        sparseProbabilisticPromotionTest(schemaVersion);
+        sparseProbabilisticOverlapTest(schemaVersion);
+    }
+
+    /**
+     * Entry point: generates the full suite of test files for schema
+     * version one.
+     */
+    public static void main(String[] args) throws IOException {
+        fullSuite(SerializationUtil.VERSION_ONE);
+    }
+
+    // ************************************************************************
+    // Helpers
+    /**
+     * Shortcut for the testing constructor, which uses the constants defined
+     * at the top of the file as default parameters.
+     *
+     * @param  type the type of HLL to construct. This cannot be <code>null</code>.
+     * @return a new {@link HLL} of specified type, which uses the parameters
+     *         ({@link #LOG2M}, {@link #REGWIDTH}, {@link #EXPLICIT_THRESHOLD},
+     *         and {@link #SPARSE_THRESHOLD}) specified above.
+     */
+    private static HLL newHLL(final HLLType type) {
+        // The original body was 'return newHLL(type);' -- unconditional
+        // self-recursion that throws StackOverflowError on first use. Delegate
+        // to the HLL testing constructor with the file's constants, as this
+        // method's own javadoc describes.
+        return new HLL(LOG2M, REGWIDTH, EXPLICIT_THRESHOLD, SPARSE_THRESHOLD, type);
+    }
+
+    /**
+     * Returns the algorithm-specific cardinality of the specified {@link HLL}
+     * as a {@link String} appropriate for comparison with the algorithm-specific
+     * cardinality provided by the PostgreSQL implementation.
+     *
+     * @param  hll the HLL whose algorithm-specific cardinality is to be printed.
+     *         This cannot be <code>null</code>.
+     * @return the algorithm-specific cardinality of the instance as a PostgreSQL-
+     *         compatible String. This will never be <code>null</code>
+     */
+    private static String stringCardinality(final HLL hll) {
+        switch(hll.getType()) {
+            case EMPTY:
+                return "0";
+            case EXPLICIT:/*promotion has not yet occurred*/
+                return Long.toString(hll.cardinality());
+            case SPARSE:
+                return Double.toString(hll.sparseProbabilisticAlgorithmCardinality());
+            case FULL:
+                return Double.toString(hll.fullProbabilisticAlgorithmCardinality());
+            default:
+                throw new RuntimeException("Unknown HLL type " + hll.getType());
+        }
+    }
+
+    /**
+     * Generates a random HLL and populates it with random values.
+     *
+     * @return the populated HLL. This will never be <code>null</code>.
+     */
+    public static HLL generateRandomHLL() {
+        // pick a representation type at random via a hand-rolled int mapping
+        final int randomTypeInt = randomIntBetween(0, HLLType.values().length - 1);
+        final HLLType type;
+        switch(randomTypeInt) {
+            case 0:
+                type = HLLType.EMPTY;
+                break;
+            case 1:
+                type = HLLType.EXPLICIT;
+                break;
+            case 2:
+                type = HLLType.FULL;
+                break;
+            case 3:
+                // NOTE(review): duplicates case 0 (EMPTY). If HLLType declares
+                //        exactly four constants, randomTypeInt never reaches 4,
+                //        so SPARSE is never selected here -- confirm whether
+                //        this duplicate is intentional weighting or a typo.
+                type = HLLType.EMPTY;
+                break;
+            case 4:
+                type = HLLType.SPARSE;
+                break;
+            default:
+                throw new RuntimeException("Unassigned type int " + randomTypeInt);
+        }
+
+        // cardinality bounds chosen relative to the promotion thresholds above
+        // so the populated HLL lands in the chosen representation
+        final int cardinalityCap;
+        final int cardinalityBaseline;
+
+        switch(type) {
+            case EMPTY:
+                return newHLL(HLLType.EMPTY);
+            case EXPLICIT:
+                cardinalityCap = EXPLICIT_THRESHOLD;
+                cardinalityBaseline = 1;
+                break;
+            case SPARSE:
+                cardinalityCap = SPARSE_THRESHOLD;
+                cardinalityBaseline = (EXPLICIT_THRESHOLD + 1);
+                break;
+            case FULL:
+                cardinalityCap = 100000;
+                cardinalityBaseline = (SPARSE_THRESHOLD*10);
+                break;
+            default:
+                throw new RuntimeException("We should never be here.");
+        }
+
+        // add the guaranteed minimum, then a random amount up to the cap
+        final HLL hll = newHLL(HLLType.EMPTY);
+        for(int i=0; i<cardinalityBaseline; i++) {
+            hll.addRaw(randomLong());
+        }
+        for(int i=0; i<randomInt(cardinalityCap - cardinalityBaseline); i++) {
+            hll.addRaw(randomLong());
+        }
+
+        return hll;
+    }
+
+    /**
+     * Opens a {@link FileWriter} and writes out an appropriate CSV header.
+     *
+     * @param  schemaVersion Schema version of the output. This cannot be
+     *         <code>null</code>.
+     * @param  description Description string used to build the filename.
+     *         This cannot be <code>null</code>.
+     * @param  type {@link TestType type} of the test file to be written.
+     *         This cannot be <code>null</code>.
+     * @return The opened {@link FileWriter writer}. This will never be <code>null</code>.
+     */
+    private static FileWriter openOutput(final ISchemaVersion schemaVersion, final String description, final TestType type) throws IOException {
+        final String versionPrefix = "v"+ schemaVersion.schemaVersionNumber() + "_";
+
+        // the CSV header and filename are both determined by the test type
+        final String header;
+        final String filename;
+        if(type == TestType.ADD) {
+            header = "cardinality,raw_value,HLL\n";
+            filename = versionPrefix + "cumulative_add_" + description + ".csv";
+        } else if(type == TestType.UNION) {
+            header = "cardinality,HLL,union_cardinality,union_HLL\n";
+            filename = versionPrefix + "cumulative_union_" + description + ".csv";
+        } else {
+            throw new RuntimeException("Unknown test type " + type);
+        }
+
+        final FileWriter writer = new FileWriter(OUTPUT_DIRECTORY + filename);
+        writer.write(header);
+        writer.flush();
+        return writer;
+    }
+
+    /**
+     * Writes out a {@link TestType#ADD}-formatted test line.
+     *
+     * @param  output The output {@link FileWriter writer}. This cannot be <code>null</code>.
+     * @param  hll The "accumulator" HLL instance. This cannot be <code>null</code>.
+     * @param  rawValue The raw value added to the HLL.
+     * @param  schemaVersion the schema with which to serialize the HLLs. This cannot
+     *         be <code>null</code>.
+     */
+    private static void cumulativeAddLine(final FileWriter output, final HLL hll, final long rawValue, final ISchemaVersion schemaVersion) throws IOException {
+        // add first, so the line reflects the accumulator AFTER the value is added
+        hll.addRaw(rawValue);
+        output.write(stringCardinality(hll) + "," + rawValue + "," + toByteA(hll, schemaVersion) + "\n");
+        output.flush();
+    }
+
+    /**
+     * Writes an initial line for a {@link TestType#ADD}-formatted test.
+     *
+     * @param  output The output {@link FileWriter writer}. This cannot be <code>null</code>.
+     * @param  hll The "accumulator" HLL instance. This cannot be <code>null</code>.
+     * @param  schemaVersion the schema with which to serialize the HLLs. This cannot
+     *         be <code>null</code>.
+     */
+    private static void initLineAdd(final FileWriter output, final HLL hll, final ISchemaVersion schemaVersion) throws IOException {
+        // zero cardinality, zero raw value, plus the serialized starting state
+        output.write("0,0," + toByteA(hll, schemaVersion) + "\n");
+        output.flush();
+    }
+
+    /**
+     * Writes out a {@link TestType#UNION}-formatted test line.
+     *
+     * @param  output The output {@link FileWriter writer}. This cannot be <code>null</code>.
+     * @param  hll The "accumulator" HLL instance. This cannot be <code>null</code>.
+     * @param  increment The "increment" HLL instance which will be unioned into
+     *         the accumulator. This cannot be <code>null</code>.
+     * @param  schemaVersion the schema with which to serialize the HLLs. This cannot
+     *         be <code>null</code>.
+     */
+    private static void cumulativeUnionLine(final FileWriter output, final HLL hll, final HLL increment, final ISchemaVersion schemaVersion) throws IOException {
+        // union mutates only the accumulator; the increment is unchanged
+        hll.union(increment);
+
+        output.write(stringCardinality(increment) + "," + toByteA(increment, schemaVersion) + ","
+                   + stringCardinality(hll) + "," + toByteA(hll, schemaVersion) + "\n");
+        output.flush();
+    }
+
+    /**
+     * Serializes a HLL to Postgres 9 'bytea' hex-format, for CSV ingest.
+     *
+     * @param  hll the HLL to serialize. This cannot be <code>null</code>.
+     * @param  schemaVersion the schema with which to serialize the HLLs. This cannot
+     *         be <code>null</code>.
+     * @return a PostgreSQL 'bytea' string representing the HLL.
+     */
+    private static String toByteA(final HLL hll, final ISchemaVersion schemaVersion) {
+        final byte[] serialized = hll.toBytes(schemaVersion);
+        // bytea hex format is "\x" followed by two hex digits per byte
+        return "\\x" + NumberUtil.toHex(serialized, 0, serialized.length);
+    }
+
+    /**
+     * Indicates what kind of test output a test will generate.
+     */
+    private static enum TestType {
+        /**
+         * This type of test is characterized by values being added to an
+         * accumulator HLL whose serialized representation (after the value is added)
+         * is printed to each line along with the cardinality and added value.
+         */
+        ADD,
+        /**
+         * This type of test is characterized by HLLs being unioned into an
+         * accumulator HLL whose serialized representation (after the HLL is
+         * union'd) is printed to each line along with the cardinalities and the
+         * serialized representation of the HLL union'd in.
+         */
+        UNION;
+    }
+}

Added: lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java?rev=1691351&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java Thu Jul 16 10:47:37 2015
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+/**
+ * A collection of test utilities for constructing input values to HLLs and for
+ * computing their serialized size.
+ */
+public class ProbabilisticTestUtil {
+    // static-utility class: prevent instantiation
+    private ProbabilisticTestUtil() { /* no instances */ }
+
+    /**
+     * Constructs a value that when added raw to a HLL will set the register at
+     * <code>registerIndex</code> to <code>registerValue</code>.
+     *
+     * @param  log2m the log-base-2 of the number of registers in the HLL
+     * @param  registerIndex the index of the register to set
+     * @param  registerValue the value to set the register to
+     * @return the value
+     */
+    public static long constructHLLValue(final int log2m, final int registerIndex, final int registerValue) {
+        final long partition = registerIndex;
+        // place a single set bit so that the substream's least-significant
+        // set bit position (1-based) equals registerValue
+        final long substreamValue = (1L << (registerValue - 1));
+        return (substreamValue << log2m) | partition;
+    }
+
+    /**
+     * Extracts the HLL register index from a raw value.
+     */
+    public static short getRegisterIndex(final long rawValue, final int log2m) {
+        // the low log2m bits of the raw value are the register index
+        final long mBitsMask = (1 << log2m) - 1;
+        final short j = (short)(rawValue & mBitsMask);
+        return j;
+    }
+
+    /**
+     * Extracts the HLL register value from a raw value.
+     */
+    public static byte getRegisterValue(final long rawValue, final int log2m) {
+        final long substreamValue = (rawValue >>> log2m);
+        final byte p_w;
+
+        if (substreamValue == 0L) {
+            // The paper does not cover p(0x0), so the special value 0 is used.
+            // 0 is the original initialization value of the registers, so by
+            // doing this the HLL simply ignores it. This is acceptable
+            // because the probability is 1/(2^(2^registerSizeInBits)).
+            p_w = 0;
+        } else {
+            p_w = (byte)Math.min(1 + BitUtil.leastSignificantBit(substreamValue), 31);
+        }
+
+        return p_w;
+    }
+
+    /**
+     * @return the number of bytes required to pack <code>registerCount</code>
+     *         registers of width <code>shortWordLength</code>.
+     */
+    public static int getRequiredBytes(final int shortWordLength, final int registerCount) {
+        // exact integer ceiling-divide; the previous float-based computation
+        // ((count * width) / 8.0f) loses precision once the total bit count
+        // exceeds 2^24, and the int multiply could overflow for large inputs
+        return (int)(((long)registerCount * shortWordLength + 7) / 8);
+    }
+}

Added: lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java?rev=1691351&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java Thu Jul 16 10:47:37 2015
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+import com.carrotsearch.hppc.IntByteOpenHashMap;
+import com.carrotsearch.hppc.cursors.IntByteCursor;
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+
+/**
+ * Tests {@link HLL} of type {@link HLLType#SPARSE}.
+ */
+public class SparseHLLTest extends LuceneTestCase {
+    private static final int log2m = 11;
+
+    /**
+     * Tests {@link HLL#addRaw(long)}.
+     */
+    @Test
+    public void addTest() {
+        { // insert an element with register value 1 (minimum set value)
+            final int registerIndex = 0;
+            final int registerValue = 1;
+            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+            hll.addRaw(rawValue);
+
+            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
+        }
+        { // insert an element with register value 31 (maximum set value)
+            final int registerIndex = 0;
+            final int registerValue = 31;
+            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+            hll.addRaw(rawValue);
+
+            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
+        }
+        { // insert an element that could overflow the register (past 31)
+            final int registerIndex = 0;
+            final int registerValue = 36;
+            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+            hll.addRaw(rawValue);
+
+            assertOneRegisterSet(hll, (short)registerIndex, (byte)31/*register max*/);
+        }
+        { // insert duplicate elements, observe no change
+            final int registerIndex = 0;
+            final int registerValue = 1;
+            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+            hll.addRaw(rawValue);
+            hll.addRaw(rawValue);
+
+            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
+        }
+        { // insert elements that increase a register's value
+            final int registerIndex = 0;
+            final int registerValue = 1;
+            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+            hll.addRaw(rawValue);
+
+            final int registerValue2 = 2;
+            final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
+            hll.addRaw(rawValue2);
+
+            assertOneRegisterSet(hll, registerIndex, (byte)registerValue2);
+        }
+        { // insert elements that have lower register values, observe no change
+            final int registerIndex = 0;
+            final int registerValue = 2;
+            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+            hll.addRaw(rawValue);
+
+            final int registerValue2 = 1;
+            final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
+            hll.addRaw(rawValue2);
+
+            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
+        }
+    }
+
+    /**
+     * Smoke test for {@link HLL#cardinality()} and the proper use of the small
+     * range correction.
+     */
+    @Test
+    public void smallRangeSmokeTest() {
+        final int log2m = 11;
+        final int m = (1 << log2m);
+        final int regwidth = 5;
+
+        // only one register set
+        {
+            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+            hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0, 1));
+
+            final long cardinality = hll.cardinality();
+
+            // Trivially true that small correction conditions hold: one register
+            // set implies zeroes exist, and estimator trivially smaller than 5m/2.
+            // Small range correction: m * log(m/V)
+            final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
+            assertEquals(expected, cardinality);
+        }
+
+        // all but one register set
+        {
+            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+            for(int i=0; i<(m - 1); i++) {
+                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
+            }
+
+            // Trivially true that small correction conditions hold: all but
+            // one register set implies a zero exists, and estimator trivially
+            // smaller than 5m/2 since it's alpha / ((m-1)/2)
+            final long cardinality = hll.cardinality();
+
+            // Small range correction: m * log(m/V)
+            final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
+            assertEquals(expected, cardinality);
+        }
+    }
+
+    /**
+     * Smoke test for {@link HLL#cardinality()} and the proper use of the
+     * uncorrected estimator.
+     */
+    @Test
+    public void normalRangeSmokeTest() {
+        final int log2m = 11;
+        final int m = (1 << log2m);
+        final int regwidth = 5;
+        // regwidth = 5, so hash space is
+        // log2m + (2^5 - 1 - 1), so L = log2m + 30
+        final int l = log2m + 30;
+
+        // all registers at 'medium' value
+        {
+            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
+
+            final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
+            for(int i=0; i<m; i++) {
+                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
+            }
+
+            final long cardinality = hll.cardinality();
+
+            // Simplified estimator when all registers take same value: alpha / (m/2^val)
+            final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
+
+            // Assert conditions for uncorrected range
+            assertTrue(estimator <= Math.pow(2,l)/30);
+            assertTrue(estimator > (5 * m /(double)2));
+
+            final long expected = (long)Math.ceil(estimator);
+            assertEquals(expected, cardinality);
+        }
+    }
+
+    /**
+     * Smoke test for {@link HLL#cardinality()} and the proper use of the large
+     * range correction.
+     */
+    @Test
+    public void largeRangeSmokeTest() {
+        final int log2m = 11;
+        final int m = (1 << log2m);
+        final int regwidth = 5;
+        // regwidth = 5, so hash space is
+        // log2m + (2^5 - 1 - 1), so L = log2m + 30
+        final int l = log2m + 30;
+
+        // all registers at large value
+        {
+            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
+
+            final int registerValue = 31/*chosen to ensure large correction kicks in*/;
+            for(int i=0; i<m; i++) {
+                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
+            }
+
+            final long cardinality = hll.cardinality();
+
+            // Simplified estimator when all registers take same value: alpha / (m/2^val)
+            final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
+
+            // Assert conditions for large range
+            assertTrue(estimator > Math.pow(2, l)/30);
+
+            // Large range correction: -2^32 * log(1 - E/2^32)
+            final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
+            assertEquals(expected, cardinality);
+        }
+    }
+
+    /**
+     * Tests {@link HLL#union(HLL)}.
+     */
+    @Test
+    public void unionTest() {
+        final int log2m = 11/*arbitrary*/;
+        final int sparseThreshold = 256/*arbitrary*/;
+
+        { // two empty multisets should union to an empty set
+            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+            hllA.union(hllB);
+
+            assertEquals(HLLType.SPARSE/*unchanged*/, hllA.getType());
+            assertEquals(0L, hllA.cardinality());
+        }
+        { // two disjoint multisets should union properly
+            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+            hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 1));
+            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+            hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 2, 1));
+
+            hllA.union(hllB);
+
+            assertEquals(HLLType.SPARSE/*unchanged*/, hllA.getType());
+            assertEquals(3L/*precomputed*/, hllA.cardinality());
+            assertRegisterPresent(hllA, 1, (byte)1);
+            assertRegisterPresent(hllA, 2, (byte)1);
+        }
+        { // two exactly overlapping multisets should union properly
+            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+            hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 10));
+            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+            hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 13));
+
+            hllA.union(hllB);
+
+            assertEquals(HLLType.SPARSE/*unchanged*/, hllA.getType());
+            assertEquals(2L/*precomputed*/, hllA.cardinality());
+            assertOneRegisterSet(hllA, 1, (byte)13/*max(10,13)*/);
+        }
+        { // overlapping multisets should union properly
+            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+            // register index = 3
+            final long rawValueA = ProbabilisticTestUtil.constructHLLValue(log2m, 3, 11);
+
+            // register index = 4
+            final long rawValueB = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 13);
+            final long rawValueBPrime = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 21);
+
+            // register index = 5
+            final long rawValueC = ProbabilisticTestUtil.constructHLLValue(log2m, 5, 14);
+
+            hllA.addRaw(rawValueA);
+            hllA.addRaw(rawValueB);
+
+            hllB.addRaw(rawValueBPrime);
+            hllB.addRaw(rawValueC);
+
+            hllA.union(hllB);
+            // union should have three registers set, with partition B set to the
+            // max of the two registers
+            assertRegisterPresent(hllA, 3, (byte)11);
+            assertRegisterPresent(hllA, 4, (byte)21/*max(21,13)*/);
+            assertRegisterPresent(hllA, 5, (byte)14);
+        }
+        { // too-large unions should promote
+            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+            // fill up sets to maxCapacity
+            for(int i=0; i<sparseThreshold; i++) {
+                hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
+                hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, (i + sparseThreshold)/*non-overlapping*/, 1));
+            }
+
+            hllA.union(hllB);
+
+            assertEquals(HLLType.FULL, hllA.getType());
+        }
+    }
+
+    /**
+     * Tests {@link HLL#clear()}.
+     */
+    @Test
+    public void clearTest() {
+        final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.SPARSE);
+        hll.addRaw(1L);
+        hll.clear();
+        assertEquals(0L, hll.cardinality());
+    }
+
+    /**
+     * Tests {@link HLL#toBytes(ISchemaVersion)} and
+     * {@link HLL#fromBytes(byte[])}.
+     */
+    @Test
+    public void toFromBytesTest() {
+        final int log2m = 11/*arbitrary*/;
+        final int regwidth = 5/*arbitrary*/;
+        final int sparseThreshold = 256/*arbitrary*/;
+        final int shortWordLength = 16/*log2m + regwidth = 11 + 5*/;
+
+        final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
+        final HLLType type = HLLType.SPARSE;
+        final int padding = schemaVersion.paddingBytes(type);
+
+        {// Should work on an empty element
+            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+            final byte[] bytes = hll.toBytes(schemaVersion);
+
+            // output should just be padding since no registers are used
+            assertEquals(padding, bytes.length);
+
+            final HLL inHLL = HLL.fromBytes(bytes);
+
+            // assert register values correct
+            assertElementsEqual(hll, inHLL);
+        }
+        {// Should work on a partially filled element
+            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+            for(int i=0; i<3; i++) {
+                final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
+                hll.addRaw(rawValue);
+            }
+
+            final byte[] bytes = hll.toBytes(schemaVersion);
+
+            assertEquals(padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, 3/*registerCount*/), bytes.length);
+
+            final HLL inHLL = HLL.fromBytes(bytes);
+
+            // assert register values correct
+            assertElementsEqual(hll, inHLL);
+        }
+        {// Should work on a full set
+            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+            for(int i=0; i<sparseThreshold; i++) {
+                final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
+                hll.addRaw(rawValue);
+            }
+
+            final byte[] bytes = hll.toBytes(schemaVersion);
+
+            // 'short words' should be 12 bits + 5 bits = 17 bits long
+            assertEquals(padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, sparseThreshold), bytes.length);
+
+            final HLL inHLL = HLL.fromBytes(bytes);
+
+            // assert register values correct
+            assertElementsEqual(hll, inHLL);
+        }
+    }
+
+    /**
+     * Smoke tests the multisets by adding random values.
+     */
+    @Test
+    public void randomValuesTest() {
+        final int log2m = 11/*arbitrary*/;
+        final int regwidth = 5/*arbitrary*/;
+        final int sparseThreshold = 256/*arbitrary*/;
+
+        for(int run=0; run<100; run++) {
+            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+            // model of the expected register state: index -> max register value
+            final IntByteOpenHashMap map = new IntByteOpenHashMap();
+
+            for(int i=0; i<sparseThreshold; i++) {
+                final long rawValue = RandomizedTest.randomLong();
+
+                final short registerIndex = ProbabilisticTestUtil.getRegisterIndex(rawValue, log2m);
+                final byte registerValue = ProbabilisticTestUtil.getRegisterValue(rawValue, log2m);
+                if(map.get(registerIndex) < registerValue) {
+                    map.put(registerIndex, registerValue);
+                }
+
+                hll.addRaw(rawValue);
+            }
+
+            for (IntByteCursor c : map) {
+                // cursor already carries the value; no need to re-query the map
+                assertRegisterPresent(hll, c.key, c.value);
+            }
+        }
+    }
+
+    //*************************************************************************
+    // assertion helpers
+    /**
+     * Asserts that the register at the specified index is set to the specified
+     * value.
+     */
+    private static void assertRegisterPresent(final HLL hll,
+                                              final int registerIndex,
+                                              final int registerValue) {
+        final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
+        assertEquals(registerValue, sparseProbabilisticStorage.get(registerIndex));
+    }
+
+    /**
+     * Asserts that only the specified register is set and has the specified value.
+     */
+    private static void assertOneRegisterSet(final HLL hll,
+                                             final int registerIndex,
+                                             final byte registerValue) {
+        final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
+        assertEquals(1, sparseProbabilisticStorage.size());
+        assertEquals(registerValue, sparseProbabilisticStorage.get(registerIndex));
+    }
+
+    /**
+     * Asserts that all registers in the two {@link HLL} instances are identical.
+     */
+    private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
+        final IntByteOpenHashMap sparseProbabilisticStorageA = hllA.sparseProbabilisticStorage;
+        final IntByteOpenHashMap sparseProbabilisticStorageB = hllB.sparseProbabilisticStorage;
+        assertEquals(sparseProbabilisticStorageA.size(), sparseProbabilisticStorageB.size());
+        for (IntByteCursor c : sparseProbabilisticStorageA) {
+            assertEquals(sparseProbabilisticStorageA.get(c.key), 
+                         sparseProbabilisticStorageB.get(c.key));
+        }
+    }
+}