You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2015/07/16 12:47:38 UTC
svn commit: r1691351 [2/2] -
/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/
Added: lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java?rev=1691351&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java Thu Jul 16 10:47:37 2015
@@ -0,0 +1,708 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
+import static org.apache.solr.util.hll.ProbabilisticTestUtil.*;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Random;
+
+/**
+ * Generates test files for testing other implementations of HLL
+ * serialization/deserialization, namely the PostgreSQL implementation.
+ */
+public class IntegrationTestGenerator {
    // ************************************************************************
    // directory to output the generated tests
    private static final String OUTPUT_DIRECTORY = "/tmp/hll_test/";

    // ------------------------------------------------------------------------
    // configurations for HLLs, should mirror settings in PostgreSQL impl. tests
    // width, in bits, of each HLL register
    private static final int REGWIDTH = 5;
    // log-base-2 of the register count (m = 2^11 = 2048 registers)
    private static final int LOG2M = 11;
    // NOTE: This differs from the PostgreSQL impl. parameter 'expthresh'. This
    //       is a literal threshold to use in the promotion hierarchy, implying
    //       that both EXPLICIT representation should be used and it should
    //       NOT be automatically computed. This is done to ensure that the
    //       parameters of the test are very explicitly defined.
    private static final int EXPLICIT_THRESHOLD = 256;
    // NOTE: This is not the PostgreSQL impl. parameter 'sparseon'. 'sparseon'
    //       is assumed to be true and this is a literal register-count threshold
    //       to use in the promotion hierarchy. This is done to ensure that the
    //       parameters of the test are very explicitly defined.
    private static final int SPARSE_THRESHOLD = 850;

    // ------------------------------------------------------------------------
    // computed constants
    // total number of registers, m
    private static final int REGISTER_COUNT = (1 << LOG2M);
    // maximum value storable in a REGWIDTH-bit register: 2^5 - 1 = 31
    private static final int REGISTER_MAX_VALUE = (1 << REGWIDTH) - 1;
+ // ========================================================================
+ // Tests
+ /**
+ * Cumulatively adds random values to a FULL HLL through the small range
+ * correction, uncorrected range, and large range correction of the HLL's
+ * cardinality estimator.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - FULL cardinality computation
+ */
+ private static void fullCardinalityCorrectionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "cardinality_correction", TestType.ADD);
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.FULL);
+ initLineAdd(output, hll, schemaVersion);
+
+ // run through some values in the small range correction
+ for(int i=0; i<((1 << LOG2M) - 1); i++) {
+ final long rawValue = constructHLLValue(LOG2M, i, 1);
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ // run up past some values in the uncorrected range
+ for(int i=0; i<(1 << LOG2M); i++) {
+ final long rawValue = constructHLLValue(LOG2M, i, 7);
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ // run through some values in the large range correction
+ for(int i=0; i<(1 << LOG2M); i++) {
+ final long rawValue = constructHLLValue(LOG2M, i, 30);
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Cumulatively adds random values to an EMPTY HLL.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - EMPTY, EXPLICIT, SPARSE, PROBABILSTIC addition
+ * - EMPTY to EXPLICIT promotion
+ * - EXPLICIT to SPARSE promotion
+ * - SPARSE to FULL promotion
+ */
+ private static void globalStepTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "comprehensive_promotion", TestType.ADD);
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ initLineAdd(output, hll, schemaVersion);
+
+ for(int i=0; i<10000/*arbitrary*/; i++) {
+ cumulativeAddLine(output, hll, randomLong(), schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Cumulatively unions "underpopulated" FULL HLLs into the
+ * accumulator to verify the correct behavior from the PostgreSQL implementation.
+ * The PostgreSQL implementation's representations of probabilistic HLLs should
+ * depend exclusively on the chosen SPARSE-to-FULL cutoff.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U "underpopulated" FULL => SPARSE
+ * - SPARSE U "underpopulated" FULL => SPARSE
+ * - SPARSE U "barely underpopulated" FULL => FULL
+ */
+ private static void sparseFullRepresentationTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "sparse_full_representation", TestType.UNION);
+
+ final HLL emptyHLL1 = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL2 = newHLL(HLLType.EMPTY);
+
+ cumulativeUnionLine(output, emptyHLL1, emptyHLL2, schemaVersion);
+
+ // NOTE: In this test the sparseReference will be the "expected" value
+ // from the C representation, since it doesn't choose representation
+ // based on original encoding, but rather on the promotion rules
+ // and the declared type of the "receiving" field.
+ // It is the manually-constructed union result.
+
+ // "underpopulated" FULL U EMPTY => SPARSE
+ final HLL fullHLL = newHLL(HLLType.FULL);
+ fullHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
+
+ final HLL sparseHLL = newHLL(HLLType.SPARSE);
+ sparseHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
+
+ output.write(stringCardinality(fullHLL) + "," + toByteA(fullHLL, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
+ output.flush();
+
+ // "underpopulated" FULL (small) U SPARSE (small) => SPARSE
+ final HLL fullHLL2 = newHLL(HLLType.FULL);
+ fullHLL2.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
+
+ sparseHLL.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
+
+ output.write(stringCardinality(fullHLL2) + "," + toByteA(fullHLL2, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
+ output.flush();
+
+ // "underpopulated" FULL (just on edge) U SPARSE (small) => FULL
+ final HLL fullHLL3 = newHLL(HLLType.FULL);
+ for(int i=2; i<(SPARSE_THRESHOLD + 1); i++) {
+ fullHLL3.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
+ sparseHLL.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
+ }
+
+ output.write(stringCardinality(fullHLL3) + "," + toByteA(fullHLL3, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
+ output.flush();
+ }
+
+ /**
+ * Cumulatively sets successive registers to:
+ *
+ * <code>(registerIndex % REGISTER_MAX_VALUE) + 1</code>
+ *
+ * by adding specifically constructed values to a SPARSE HLL.
+ * Does not induce promotion.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - SPARSE addition (predictable)
+ */
+ private static void sparseStepTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "sparse_step", TestType.ADD);
+
+ // the accumulator, starts empty sparse probabilistic
+ final HLL hll = newHLL(HLLType.SPARSE);
+ initLineAdd(output, hll, schemaVersion);
+
+ for(int i=0; i<SPARSE_THRESHOLD; i++) {
+ final long rawValue = constructHLLValue(LOG2M, i, ((i % REGISTER_MAX_VALUE) + 1));
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Cumulatively sets random registers of a SPARSE HLL to
+ * random values by adding random values. Does not induce promotion.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - SPARSE addition (random)
+ */
+ private static void sparseRandomTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "sparse_random", TestType.ADD);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.SPARSE);
+ initLineAdd(output, hll, schemaVersion);
+
+ for(int i=0; i<SPARSE_THRESHOLD; i++) {
+ final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
+ final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
+ final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
+
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Cumulatively sets the first register (index 0) to value 2, the last
+ * register (index m-1) to value 2, and then sets registers with indices in
+ * the range 2 to (sparseCutoff + 2) to value 1 to trigger promotion.
+ *
+ * This tests for register alignment in the promotion from SPARSE
+ * to FULL.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - SPARSE addition
+ * - SPARSE to FULL promotion
+ */
+ private static void sparseEdgeTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "sparse_edge", TestType.ADD);
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.SPARSE);
+ initLineAdd(output, hll, schemaVersion);
+
+ final long firstValue = constructHLLValue(LOG2M, 0, 2);
+ cumulativeAddLine(output, hll, firstValue, schemaVersion);
+
+ final long lastValue = constructHLLValue(LOG2M, (1 << LOG2M) - 1, 2);
+ cumulativeAddLine(output, hll, lastValue, schemaVersion);
+
+ for(int i=2; i<(SPARSE_THRESHOLD + 2); i++) {
+ final long middleValue = constructHLLValue(LOG2M, i, 1);
+
+ cumulativeAddLine(output, hll, middleValue, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with EXPLICIT HLLs, each containing a
+ * single random value.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U EXPLICIT
+ * - EXPLICIT U EXPLICIT
+ * - EXPLICIT to SPARSE promotion
+ * - SPARSE U EXPLICIT
+ */
+ private static void explicitPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "explicit_promotion", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<(EXPLICIT_THRESHOLD+500)/*should be greater than promotion cutoff*/; i++) {
+ // make an EXPLICIT set and populate with cardinality 1
+ final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
+ explicitHLL.addRaw(random.nextLong());
+
+ cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with SPARSE HLLs, each
+ * having one register set.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U SPARSE
+ * - SPARSE U SPARSE
+ * - SPARSE promotion
+ * - SPARSE U FULL
+ */
+ private static void sparseProbabilisticPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "sparse_promotion", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+
+ for(int i=0; i<(SPARSE_THRESHOLD + 1000)/*should be greater than promotion cutoff*/; i++) {
+ // make a SPARSE set and populate with cardinality 1
+ final HLL sparseHLL = newHLL(HLLType.SPARSE);
+
+ final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
+ final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
+ final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
+ sparseHLL.addRaw(rawValue);
+
+ cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with EXPLICIT HLLs, each having a single
+ * random value, twice in a row to verify that the set properties are
+ * satisfied.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U EXPLICIT
+ * - EXPLICIT U EXPLICIT
+ */
+ private static void explicitOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "explicit_explicit", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<EXPLICIT_THRESHOLD; i++) {
+ // make an EXPLICIT set and populate with cardinality 1
+ final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
+ explicitHLL.addRaw(random.nextLong());
+
+ // union it into the accumulator twice, to test overlap (cardinality should not change)
+ cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
+ cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with SPARSE HLLs, each
+ * having a single register set, twice in a row to verify that the set
+ * properties are satisfied.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U SPARSE
+ * - SPARSE U SPARSE
+ */
+ private static void sparseProbabilisticOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "sparse_sparse", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<SPARSE_THRESHOLD; i++) {
+ // make a SPARSE set and populate with cardinality 1
+ final HLL sparseHLL = newHLL(HLLType.SPARSE);
+ final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
+ final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
+ final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
+ sparseHLL.addRaw(rawValue);
+
+ cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with FULL HLLs, each having
+ * many registers set, twice in a row to verify that the set properties are
+ * satisfied.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U FULL
+ * - FULL U FULL
+ */
+ private static void probabilisticUnionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "probabilistic_probabilistic", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<1000/*number of rows to generate*/; i++) {
+ // make a FULL set and populate with
+ final HLL fullHLL = newHLL(HLLType.FULL);
+ final int elementCount = random.nextInt(10000/*arbitrary maximum cardinality*/);
+ for(int j=0;j<elementCount;j++) {
+ fullHLL.addRaw(random.nextLong());
+ }
+
+ cumulativeUnionLine(output, hll, fullHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with random HLLs.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - hopefully all union possibilities
+ */
+ private static void globalUnionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final FileWriter output = openOutput(schemaVersion, "comprehensive", TestType.UNION);
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<1000/*number of rows to generate*/; i++) {
+ final HLL randomHLL = generateRandomHLL();
+ cumulativeUnionLine(output, hll, randomHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ // ========================================================================
+ // Main
    /**
     * Runs every generator in this class, writing the complete suite of CSV
     * test files for the given schema version to {@link #OUTPUT_DIRECTORY}.
     *
     * @param schemaVersion the schema with which to serialize the HLLs. This
     *        cannot be <code>null</code>.
     */
    public static void fullSuite(final ISchemaVersion schemaVersion) throws IOException {
        fullCardinalityCorrectionTest(schemaVersion);
        globalUnionTest(schemaVersion);
        globalStepTest(schemaVersion);
        probabilisticUnionTest(schemaVersion);
        explicitPromotionTest(schemaVersion);
        explicitOverlapTest(schemaVersion);
        sparseFullRepresentationTest(schemaVersion);
        sparseStepTest(schemaVersion);
        sparseRandomTest(schemaVersion);
        sparseEdgeTest(schemaVersion);
        sparseProbabilisticPromotionTest(schemaVersion);
        sparseProbabilisticOverlapTest(schemaVersion);
    }
+
    /**
     * Entry point: generates the full suite of test files under schema
     * version one.
     */
    public static void main(String[] args) throws IOException {
        fullSuite(SerializationUtil.VERSION_ONE);
    }
+
+ // ************************************************************************
+ // Helpers
+ /**
+ * Shortcut for testing constructor, which uses the constants defined at
+ * the top of the file as default parameters.
+ *
+ * @return a new {@link HLL} of specified type, which uses the parameters
+ * ({@link #LOG2M}, {@link #REGWIDTH}, {@link #EXPLICIT_THRESHOLD},
+ * and {@link #SPARSE_THRESHOLD}) specified above.
+ */
+ private static HLL newHLL(final HLLType type) {
+ return newHLL(type);
+ }
+
+ /**
+ * Returns the algorithm-specific cardinality of the specified {@link HLL}
+ * as a {@link String} appropriate for comparison with the algorithm-specific
+ * cardinality provided by the PostgreSQL implementation.
+ *
+ * @param hll the HLL whose algorithm-specific cardinality is to be printed.
+ * This cannot be <code>null</code>.
+ * @return the algorithm-specific cardinality of the instance as a PostgreSQL-
+ * compatible String. This will never be <code>null</code>
+ */
+ private static String stringCardinality(final HLL hll) {
+ switch(hll.getType()) {
+ case EMPTY:
+ return "0";
+ case EXPLICIT:/*promotion has not yet occurred*/
+ return Long.toString(hll.cardinality());
+ case SPARSE:
+ return Double.toString(hll.sparseProbabilisticAlgorithmCardinality());
+ case FULL:
+ return Double.toString(hll.fullProbabilisticAlgorithmCardinality());
+ default:
+ throw new RuntimeException("Unknown HLL type " + hll.getType());
+ }
+ }
+
    /**
     * Generates a random HLL and populates it with random values.
     *
     * @return the populated HLL. This will never be <code>null</code>.
     */
    public static HLL generateRandomHLL() {
        // pick one of the case labels below, uniformly at random
        final int randomTypeInt = randomIntBetween(0, HLLType.values().length - 1);
        final HLLType type;
        switch(randomTypeInt) {
            case 0:
                type = HLLType.EMPTY;
                break;
            case 1:
                type = HLLType.EXPLICIT;
                break;
            case 2:
                type = HLLType.FULL;
                break;
            case 3:
                // NOTE(review): EMPTY appears twice (cases 0 and 3), which doubles
                // its weight in the distribution -- looks intentional, but confirm
                // against the ordering/size of HLLType's values.
                type = HLLType.EMPTY;
                break;
            case 4:
                type = HLLType.SPARSE;
                break;
            default:
                throw new RuntimeException("Unassigned type int " + randomTypeInt);
        }

        // minimum number of values added (forces promotion into the desired
        // representation) and the exclusive upper bound on the total count
        final int cardinalityCap;
        final int cardinalityBaseline;

        switch(type) {
            case EMPTY:
                return newHLL(HLLType.EMPTY);
            case EXPLICIT:
                cardinalityCap = EXPLICIT_THRESHOLD;
                cardinalityBaseline = 1;
                break;
            case SPARSE:
                cardinalityCap = SPARSE_THRESHOLD;
                cardinalityBaseline = (EXPLICIT_THRESHOLD + 1);
                break;
            case FULL:
                cardinalityCap = 100000;
                cardinalityBaseline = (SPARSE_THRESHOLD*10);
                break;
            default:
                throw new RuntimeException("We should never be here.");
        }

        // start EMPTY and add values: first the baseline count, then a random
        // surplus -- promotion to the target representation happens automatically
        final HLL hll = newHLL(HLLType.EMPTY);
        for(int i=0; i<cardinalityBaseline; i++) {
            hll.addRaw(randomLong());
        }
        for(int i=0; i<randomInt(cardinalityCap - cardinalityBaseline); i++) {
            hll.addRaw(randomLong());
        }

        return hll;
    }
+
+ /**
+ * Opens a {@link FileWriter} and writes out an appropriate CSV header.
+ *
+ * @param schemaVersion Schema version of the output. This cannot be
+ * <code>null</code>.
+ * @param description Description string used to build the filename.
+ * This cannot be <code>null</code>.
+ * @param type {@link TestType type} of the test file to be written.
+ * This cannot be <code>null</code>.
+ * @return The opened {@link FileWriter writer}. This will never be <code>null</code>.
+ */
+ private static FileWriter openOutput(final ISchemaVersion schemaVersion, final String description, final TestType type) throws IOException {
+ final String schemaVersionPrefix = "v"+ schemaVersion.schemaVersionNumber() + "_";
+ final String header;
+ final String filename;
+ switch(type) {
+ case ADD:
+ header = "cardinality,raw_value,HLL\n";
+ filename = schemaVersionPrefix + "cumulative_add_" + description + ".csv";
+ break;
+ case UNION:
+ header = "cardinality,HLL,union_cardinality,union_HLL\n";
+ filename = schemaVersionPrefix + "cumulative_union_" + description + ".csv";
+ break;
+ default:
+ throw new RuntimeException("Unknown test type " + type);
+ }
+
+ final FileWriter output = new FileWriter(OUTPUT_DIRECTORY + filename);
+ output.write(header);
+ output.flush();
+ return output;
+ }
+
+ /**
+ * Writes out a {@link TestType#ADD}-formatted test line.
+ *
+ * @param output The output {@link FileWriter writer}. This cannot be <code>null</code>.
+ * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
+ * @param rawValue The raw value added to the HLL.
+ * @param schemaVersion the schema with which to serialize the HLLs. This cannot
+ * be <code>null</code>.
+ */
+ private static void cumulativeAddLine(final FileWriter output, final HLL hll, final long rawValue, final ISchemaVersion schemaVersion) throws IOException {
+ hll.addRaw(rawValue);
+ final String accumulatorCardinality = stringCardinality(hll);
+
+ output.write(accumulatorCardinality + "," + rawValue + "," + toByteA(hll, schemaVersion) + "\n");
+ output.flush();
+ }
+
+ /**
+ * Writes an initial line for a {@link TestType#ADD}-formatted test.
+ *
+ * @param output The output {@link FileWriter writer}. This cannot be <code>null</code>.
+ * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
+ * @param schemaVersion the schema with which to serialize the HLLs. This cannot
+ * be <code>null</code>.
+ */
+ private static void initLineAdd(final FileWriter output, final HLL hll, final ISchemaVersion schemaVersion) throws IOException {
+ output.write(0 + "," + 0 + "," + toByteA(hll, schemaVersion) + "\n");
+ output.flush();
+ }
+
+ /**
+ * Writes out a {@link TestType#UNION}-formatted test line.
+ *
+ * @param output The output {@link FileWriter writer}. This cannot be <code>null</code>.
+ * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
+ * @param increment The "increment" HLL instance which will be unioned into
+ * the accumulator. This cannot be <code>null</code>.
+ * @param schemaVersion the schema with which to serialize the HLLs. This cannot
+ * be <code>null</code>.
+ */
+ private static void cumulativeUnionLine(final FileWriter output, final HLL hll, final HLL increment, final ISchemaVersion schemaVersion) throws IOException {
+ hll.union(increment);
+
+ final String incrementCardinality = stringCardinality(increment);
+ final String accumulatorCardinality = stringCardinality(hll);
+ output.write(incrementCardinality + "," + toByteA(increment, schemaVersion) + "," + accumulatorCardinality + "," + toByteA(hll, schemaVersion) + "\n");
+ output.flush();
+ }
+
+ /**
+ * Serializes a HLL to Postgres 9 'bytea' hex-format, for CSV ingest.
+ *
+ * @param hll the HLL to serialize. This cannot be <code>null</code>.
+ * @param schemaVersion the schema with which to serialize the HLLs. This cannot
+ * be <code>null</code>.
+ * @return a PostgreSQL 'bytea' string representing the HLL.
+ */
+ private static String toByteA(final HLL hll, final ISchemaVersion schemaVersion) {
+ final byte[] bytes = hll.toBytes(schemaVersion);
+ return ("\\x" + NumberUtil.toHex(bytes, 0, bytes.length));
+ }
+
+ /**
+ * Indicates what kind of test output a test will generate.
+ */
+ private static enum TestType {
+ /**
+ * This type of test is characterized by values being added to an
+ * accumulator HLL whose serialized representation (after the value is added)
+ * is printed to each line along with the cardinality and added value.
+ */
+ ADD,
+ /**
+ * This type of test is characterized by HLLs being unioned into an
+ * accumulator HLL whose serialized representation (after the HLL is
+ * union'd) is printed to each line along with the cardinalities and the
+ * serialized representation of the HLL union'd in.
+ */
+ UNION;
+ }
+}
Added: lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java?rev=1691351&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java Thu Jul 16 10:47:37 2015
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+/**
+ * A collection of test utilities for constructing input values to HLLs and for
+ * computing their serialized size.
+ */
+public class ProbabilisticTestUtil {
+ /**
+ * Constructs a value that when added raw to a HLL will set the register at
+ * <code>registerIndex</code> to <code>registerValue</code>.
+ *
+ * @param log2m the log-base-2 of the number of registers in the HLL
+ * @param registerIndex the index of the register to set
+ * @param registerValue the value to set the register to
+ * @return the value
+ */
+ public static long constructHLLValue(final int log2m, final int registerIndex, final int registerValue) {
+ final long partition = registerIndex;
+ final long substreamValue = (1L << (registerValue - 1));
+ return (substreamValue << log2m) | partition;
+ }
+
+ /**
+ * Extracts the HLL register index from a raw value.
+ */
+ public static short getRegisterIndex(final long rawValue, final int log2m) {
+ final long mBitsMask = (1 << log2m) - 1;
+ final short j = (short)(rawValue & mBitsMask);
+ return j;
+ }
+
+ /**
+ * Extracts the HLL register value from a raw value.
+ */
+ public static byte getRegisterValue(final long rawValue, final int log2m) {
+ final long substreamValue = (rawValue >>> log2m);
+ final byte p_w;
+
+ if (substreamValue == 0L) {
+ // The paper does not cover p(0x0), so the special value 0 is used.
+ // 0 is the original initialization value of the registers, so by
+ // doing this the HLL simply ignores it. This is acceptable
+ // because the probability is 1/(2^(2^registerSizeInBits)).
+ p_w = 0;
+ } else {
+ p_w = (byte)Math.min(1 + BitUtil.leastSignificantBit(substreamValue), 31);
+ }
+
+ return p_w;
+ }
+
+ /**
+ * @return the number of bytes required to pack <code>registerCount</code>
+ * registers of width <code>shortWordLength</code>.
+ */
+ public static int getRequiredBytes(final int shortWordLength, final int registerCount) {
+ return (int)Math.ceil((registerCount * shortWordLength)/(float)8);
+ }
+}
Added: lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java?rev=1691351&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java Thu Jul 16 10:47:37 2015
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+import com.carrotsearch.hppc.IntByteOpenHashMap;
+import com.carrotsearch.hppc.cursors.IntByteCursor;
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+
/**
 * Tests {@link HLL} of type {@link HLLType#SPARSE}.
 */
public class SparseHLLTest extends LuceneTestCase {
    // log-base-2 of the register count used by most tests (2^11 = 2048 registers)
    private static final int log2m = 11;

    /**
     * Tests {@link HLL#addRaw(long)}.
     */
    @Test
    public void addTest() {
        { // insert an element with register value 1 (minimum set value)
            final int registerIndex = 0;
            final int registerValue = 1;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
        }
        { // insert an element with register value 31 (maximum set value)
            final int registerIndex = 0;
            final int registerValue = 31;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
        }
        { // insert an element that could overflow the register (past 31)
            final int registerIndex = 0;
            final int registerValue = 36;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            // value must be clamped to the 5-bit register maximum
            assertOneRegisterSet(hll, (short)registerIndex, (byte)31/*register max*/);
        }
        { // insert duplicate elements, observe no change
            final int registerIndex = 0;
            final int registerValue = 1;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);
            hll.addRaw(rawValue);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
        }
        { // insert elements that increase a register's value
            final int registerIndex = 0;
            final int registerValue = 1;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            final int registerValue2 = 2;
            final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
            hll.addRaw(rawValue2);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue2);
        }
        { // insert elements that have lower register values, observe no change
            final int registerIndex = 0;
            final int registerValue = 2;
            final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

            final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(rawValue);

            final int registerValue2 = 1;
            final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
            hll.addRaw(rawValue2);

            assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
        }
    }

    /**
     * Smoke test for {@link HLL#cardinality()} and the proper use of the small
     * range correction.
     */
    @Test
    public void smallRangeSmokeTest() {
        final int log2m = 11;
        final int m = (1 << log2m);
        final int regwidth = 5;

        // only one register set
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0, 1));

            final long cardinality = hll.cardinality();

            // Trivially true that small correction conditions hold: one register
            // set implies zeroes exist, and estimator trivially smaller than 5m/2.
            // Small range correction: m * log(m/V), where V = # of zero registers
            final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
            assertEquals(cardinality, expected);
        }

        // all but one register set
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
            for(int i=0; i<(m - 1); i++) {
                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
            }

            // Trivially true that small correction conditions hold: all but
            // one register set implies a zero exists, and estimator trivially
            // smaller than 5m/2 since it's alpha / ((m-1)/2)
            final long cardinality = hll.cardinality();

            // Small range correction: m * log(m/V)
            final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
            assertEquals(cardinality, expected);
        }
    }

    /**
     * Smoke test for {@link HLL#cardinality()} and the proper use of the
     * uncorrected estimator.
     */
    @Test
    public void normalRangeSmokeTest() {
        final int log2m = 11;
        final int m = (1 << log2m);
        final int regwidth = 5;
        // regwidth = 5, so hash space is
        // log2m + (2^5 - 1 - 1), so L = log2m + 30
        final int l = log2m + 30;

        // all registers at 'medium' value
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);

            final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
            for(int i=0; i<m; i++) {
                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
            }

            final long cardinality = hll.cardinality();

            // Simplified estimator when all registers take same value: alpha / (m/2^val)
            final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));

            // Assert conditions for uncorrected range: below the large-range
            // threshold (2^L/30) and above the small-range threshold (5m/2)
            assertTrue(estimator <= Math.pow(2,l)/30);
            assertTrue(estimator > (5 * m /(double)2));

            final long expected = (long)Math.ceil(estimator);
            assertEquals(cardinality, expected);
        }
    }

    /**
     * Smoke test for {@link HLL#cardinality()} and the proper use of the large
     * range correction.
     */
    @Test
    public void largeRangeSmokeTest() {
        final int log2m = 11;
        final int m = (1 << log2m);
        final int regwidth = 5;
        // regwidth = 5, so hash space is
        // log2m + (2^5 - 1 - 1), so L = log2m + 30
        final int l = log2m + 30;

        // all registers at large value
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);

            final int registerValue = 31/*chosen to ensure large correction kicks in*/;
            for(int i=0; i<m; i++) {
                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
            }

            final long cardinality = hll.cardinality();


            // Simplified estimator when all registers take same value: alpha / (m/2^val)
            final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));

            // Assert conditions for large range: estimator above 2^L/30
            assertTrue(estimator > Math.pow(2, l)/30);

            // Large range correction: -2^L * log(1 - E/2^L)
            final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
            assertEquals(cardinality, expected);
        }
    }

    /**
     * Tests {@link HLL#union(HLL)}.
     */
    @Test
    public void unionTest() {
        final int log2m = 11/*arbitrary*/;
        final int sparseThreshold = 256/*arbitrary*/;

        { // two empty multisets should union to an empty set
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            hllA.union(hllB);

            assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
            assertEquals(hllA.cardinality(), 0L);
        }
        { // two disjoint multisets should union properly
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 1));
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 2, 1));


            hllA.union(hllB);

            assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
            assertEquals(hllA.cardinality(), 3L/*precomputed*/);
            assertRegisterPresent(hllA, 1, (byte)1);
            assertRegisterPresent(hllA, 2, (byte)1);
        }
        { // two exactly overlapping multisets should union properly
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 10));
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 13));

            hllA.union(hllB);

            assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
            assertEquals(hllA.cardinality(), 2L/*precomputed*/);
            assertOneRegisterSet(hllA, 1, (byte)13/*max(10,13)*/);
        }
        { // overlapping multisets should union properly
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            // register index = 3
            final long rawValueA = ProbabilisticTestUtil.constructHLLValue(log2m, 3, 11);

            // register index = 4
            final long rawValueB = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 13);
            final long rawValueBPrime = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 21);

            // register index = 5
            final long rawValueC = ProbabilisticTestUtil.constructHLLValue(log2m, 5, 14);

            hllA.addRaw(rawValueA);
            hllA.addRaw(rawValueB);

            hllB.addRaw(rawValueBPrime);
            hllB.addRaw(rawValueC);

            hllA.union(hllB);
            // union should have three registers set, with partition B set to the
            // max of the two registers
            assertRegisterPresent(hllA, 3, (byte)11);
            assertRegisterPresent(hllA, 4, (byte)21/*max(21,13)*/);
            assertRegisterPresent(hllA, 5, (byte)14);
        }
        { // too-large unions should promote
            final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            // fill up sets to maxCapacity
            for(int i=0; i<sparseThreshold; i++) {
                hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
                hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, (i + sparseThreshold)/*non-overlapping*/, 1));
            }

            hllA.union(hllB);

            // the union exceeds the sparse threshold, forcing FULL representation
            assertEquals(hllA.getType(), HLLType.FULL);
        }
    }

    /**
     * Tests {@link HLL#clear()}.
     */
    @Test
    public void clearTest() {
        final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.SPARSE);
        hll.addRaw(1L);
        hll.clear();
        assertEquals(hll.cardinality(), 0L);
    }

    /**
     * Tests {@link HLL#toBytes(ISchemaVersion)} and
     * {@link HLL#fromBytes(byte[])}.
     */
    @Test
    public void toFromBytesTest() {
        final int log2m = 11/*arbitrary*/;
        final int regwidth = 5/*arbitrary*/;
        final int sparseThreshold = 256/*arbitrary*/;
        final int shortWordLength = 16/*log2m + regwidth = 11 + 5*/;

        final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
        final HLLType type = HLLType.SPARSE;
        final int padding = schemaVersion.paddingBytes(type);

        {// Should work on an empty element
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
            final byte[] bytes = hll.toBytes(schemaVersion);

            // output should just be padding since no registers are used
            assertEquals(bytes.length, padding);

            final HLL inHLL = HLL.fromBytes(bytes);

            // assert register values correct
            assertElementsEqual(hll, inHLL);
        }
        {// Should work on a partially filled element
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            for(int i=0; i<3; i++) {
                final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
                hll.addRaw(rawValue);
            }

            final byte[] bytes = hll.toBytes(schemaVersion);

            assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, 3/*registerCount*/));

            final HLL inHLL = HLL.fromBytes(bytes);

            // assert register values correct
            assertElementsEqual(hll, inHLL);
        }
        {// Should work on a full set
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            for(int i=0; i<sparseThreshold; i++) {
                final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
                hll.addRaw(rawValue);
            }

            final byte[] bytes = hll.toBytes(schemaVersion);

            // 'short words' should be 11 bits + 5 bits = 16 bits long
            assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, sparseThreshold));

            final HLL inHLL = HLL.fromBytes(bytes);

            // assert register values correct
            assertElementsEqual(hll, inHLL);
        }
    }

    /**
     * Smoke tests the multisets by adding random values.
     */
    @Test
    public void randomValuesTest() {
        final int log2m = 11/*arbitrary*/;
        final int regwidth = 5/*arbitrary*/;
        final int sparseThreshold = 256/*arbitrary*/;

        for(int run=0; run<100; run++) {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

            // reference model: max register value seen per register index
            final IntByteOpenHashMap map = new IntByteOpenHashMap();

            for(int i=0; i<sparseThreshold; i++) {
                final long rawValue = RandomizedTest.randomLong();

                final short registerIndex = ProbabilisticTestUtil.getRegisterIndex(rawValue, log2m);
                final byte registerValue = ProbabilisticTestUtil.getRegisterValue(rawValue, log2m);
                // relies on hppc's default of 0 for absent keys to keep the max
                if(map.get(registerIndex) < registerValue) {
                    map.put(registerIndex, registerValue);
                }

                hll.addRaw(rawValue);
            }

            // every register in the reference model must match the HLL's storage
            for (IntByteCursor c : map) {
                final byte expectedRegisterValue = map.get(c.key);
                assertRegisterPresent(hll, c.key, expectedRegisterValue);
            }
        }
    }

    //*************************************************************************
    // assertion helpers
    /**
     * Asserts that the register at the specified index is set to the specified
     * value.
     *
     * @param hll           the HLL whose sparse storage is inspected.
     * @param registerIndex the register index to check.
     * @param registerValue the expected register value.
     */
    private static void assertRegisterPresent(final HLL hll,
                                              final int registerIndex,
                                              final int registerValue) {
        final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
        assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
    }

    /**
     * Asserts that only the specified register is set and has the specified value.
     *
     * @param hll           the HLL whose sparse storage is inspected.
     * @param registerIndex the sole register index expected to be set.
     * @param registerValue the expected register value.
     */
    private static void assertOneRegisterSet(final HLL hll,
                                             final int registerIndex,
                                             final byte registerValue) {
        final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
        assertEquals(sparseProbabilisticStorage.size(), 1);
        assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
    }

    /**
     * Asserts that all registers in the two {@link HLL} instances are identical.
     */
    private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
        final IntByteOpenHashMap sparseProbabilisticStorageA = hllA.sparseProbabilisticStorage;
        final IntByteOpenHashMap sparseProbabilisticStorageB = hllB.sparseProbabilisticStorage;
        assertEquals(sparseProbabilisticStorageA.size(), sparseProbabilisticStorageB.size());
        for (IntByteCursor c : sparseProbabilisticStorageA) {
            assertEquals(sparseProbabilisticStorageA.get(c.key),
                         sparseProbabilisticStorageB.get(c.key));
        }
    }
}