You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2015/07/16 12:32:07 UTC

svn commit: r1691350 [2/3] - in /lucene/dev/branches/solr7787: ./ lucene/ solr/ solr/core/ solr/core/src/java/org/apache/solr/handler/component/ solr/core/src/java/org/apache/solr/search/facet/ solr/core/src/java/org/apache/solr/util/hll/ solr/core/src...

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLL.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLL.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLL.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLL.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,1088 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+import com.carrotsearch.hppc.IntByteOpenHashMap;
+import com.carrotsearch.hppc.LongOpenHashSet;
+import com.carrotsearch.hppc.cursors.IntByteCursor;
+import com.carrotsearch.hppc.cursors.LongCursor;
+
+/**
+ * A probabilistic set of hashed <code>long</code> elements. Useful for computing
+ * the approximate cardinality of a stream of data in very small storage.<p/>
+ *
+ * A modified version of the <a href="http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf">
+ * 'HyperLogLog' data structure and algorithm</a> is used, which combines both
+ * probabilistic and non-probabilistic techniques to improve the accuracy and
+ * storage requirements of the original algorithm.<p/>
+ *
+ * More specifically, initializing and storing a new {@link HLL} will
+ * allocate a sentinel value symbolizing the empty set ({@link HLLType#EMPTY}).
+ * After adding the first few values, a sorted list of unique integers is
+ * stored in a {@link HLLType#EXPLICIT} hash set. When configured, accuracy can
+ * be sacrificed for memory footprint: the values in the sorted list are
+ * "promoted" to a "{@link HLLType#SPARSE}" map-based HyperLogLog structure.
+ * Finally, when enough registers are set, the map-based HLL will be converted
+ * to a bit-packed "{@link HLLType#FULL}" HyperLogLog structure.<p/>
+ *
+ * This data structure is interoperable with the implementations found at:
+ * <ul>
+ *   <li><a href="https://github.com/aggregateknowledge/postgresql-hll">postgresql-hll</a>, and</li>
+ *   <li><a href="https://github.com/aggregateknowledge/js-hll">js-hll</a></li>
+ * </ul>
+ * when <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/STORAGE.markdown">properly serialized</a>.
+ */
+public class HLL implements Cloneable {
+    // minimum and maximum values for the log-base-2 of the number of registers
+    // in the HLL
+    public static final int MINIMUM_LOG2M_PARAM = 4;
+    public static final int MAXIMUM_LOG2M_PARAM = 30;
+
+    // minimum and maximum values for the register width of the HLL
+    public static final int MINIMUM_REGWIDTH_PARAM = 1;
+    public static final int MAXIMUM_REGWIDTH_PARAM = 8;
+
+    // minimum and maximum values for the 'expthresh' parameter of the
+    // constructor that is meant to match the PostgreSQL implementation's
+    // constructor and parameter names
+    public static final int MINIMUM_EXPTHRESH_PARAM = -1;
+    public static final int MAXIMUM_EXPTHRESH_PARAM = 18;
+    public static final int MAXIMUM_EXPLICIT_THRESHOLD = (1 << (MAXIMUM_EXPTHRESH_PARAM - 1)/*per storage spec*/);
+
+    // ************************************************************************
+    // Storage
+    // storage used when #type is EXPLICIT, null otherwise
+    LongOpenHashSet explicitStorage;
+    // storage used when #type is SPARSE, null otherwise
+    IntByteOpenHashMap sparseProbabilisticStorage;
+    // storage used when #type is FULL, null otherwise
+    BitVector probabilisticStorage;
+
+    // current type of this HLL instance, if this changes then so should the
+    // storage used (see above)
+    private HLLType type;
+
+    // ------------------------------------------------------------------------
+    // Characteristic parameters
+    // NOTE:  These members are named to match the PostgreSQL implementation's
+    //        parameters.
+    // log2(the number of probabilistic HLL registers)
+    private final int log2m;
+    // the size (width) each register in bits
+    private final int regwidth;
+
+    // ------------------------------------------------------------------------
+    // Computed constants
+    // ........................................................................
+    // EXPLICIT-specific constants
+    // flag indicating if the EXPLICIT representation should NOT be used
+    private final boolean explicitOff;
+    // flag indicating that the promotion threshold from EXPLICIT should be
+    // computed automatically
+    // NOTE:  this only has meaning when 'explicitOff' is false
+    private final boolean explicitAuto;
+    // threshold (in element count) at which a EXPLICIT HLL is converted to a
+    // SPARSE or FULL HLL, always greater than or equal to zero and always a
+    // power of two OR simply zero
+    // NOTE:  this only has meaning when 'explicitOff' is false
+    private final int explicitThreshold;
+
+    // ........................................................................
+    // SPARSE-specific constants
+    // the computed width of the short words
+    private final int shortWordLength;
+    // flag indicating if the SPARSE representation should not be used
+    private final boolean sparseOff;
+    // threshold (in register count) at which a SPARSE HLL is converted to a
+    // FULL HLL, always greater than zero
+    private final int sparseThreshold;
+
+    // ........................................................................
+    // Probabilistic algorithm constants
+    // the number of registers, will always be a power of 2
+    private final int m;
+    // a mask of the log2m bits set to one and the rest to zero
+    private final int mBitsMask;
+    // a mask as wide as a register (see #fromBytes())
+    private final int valueMask;
+    // mask used to ensure that p(w) does not overflow register (see #Constructor() and #addRaw())
+    private final long pwMaxMask;
+    // alpha * m^2 (the constant in the "'raw' HyperLogLog estimator")
+    private final double alphaMSquared;
+    // the cutoff value of the estimator for using the "small" range cardinality
+    // correction formula
+    private final double smallEstimatorCutoff;
+    // the cutoff value of the estimator for using the "large" range cardinality
+    // correction formula
+    private final double largeEstimatorCutoff;
+
+    // ========================================================================
+    /**
+     * NOTE: Arguments here are named and structured identically to those in the
+     *       PostgreSQL implementation, which can be found
+     *       <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning">here</a>.
+     *
+     * @param log2m log-base-2 of the number of registers used in the HyperLogLog
+     *        algorithm. Must be at least 4 and at most 30.
+     * @param regwidth number of bits used per register in the HyperLogLog
+     *        algorithm. Must be at least 1 and at most 8.
+     * @param expthresh tunes when the {@link HLLType#EXPLICIT} to
+     *        {@link HLLType#SPARSE} promotion occurs,
+     *        based on the set's cardinality. Must be at least -1 and at most 18.
+     *        <table>
+     *        <thead><tr><th><code>expthresh</code> value</th><th>Meaning</th></tr></thead>
+     *        <tbody>
+     *        <tr>
+     *            <td>-1</td>
+     *            <td>Promote at whatever cutoff makes sense for optimal memory usage. ('auto' mode)</td>
+     *        </tr>
+     *        <tr>
+     *            <td>0</td>
+     *            <td>Skip <code>EXPLICIT</code> representation in hierarchy.</td>
+     *        </tr>
+     *        <tr>
+     *            <td>1-18</td>
+     *            <td>Promote at 2<sup>expthresh - 1</sup> cardinality</td>
+     *        </tr>
+     *        </tbody>
+     *        </table>
+     * @param sparseon Flag indicating if the {@link HLLType#SPARSE}
+     *        representation should be used.
+     * @param type the type in the promotion hierarchy which this instance should
+     *        start at. This cannot be <code>null</code>.
+     */
+    public HLL(final int log2m, final int regwidth, final int expthresh, final boolean sparseon, final HLLType type) {
+        this.log2m = log2m;
+        if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
+            throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
+        }
+
+        this.regwidth = regwidth;
+        if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
+            throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
+        }
+
+        this.m = (1 << log2m);
+        this.mBitsMask = m - 1;
+        this.valueMask = (1 << regwidth) - 1;
+        this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
+        this.alphaMSquared = HLLUtil.alphaMSquared(m);
+        this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
+        this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);
+
+        if(expthresh == -1) {
+            this.explicitAuto = true;
+            this.explicitOff = false;
+
+            // NOTE:  This math matches the size calculation in the PostgreSQL impl.
+            final long fullRepresentationSize = (this.regwidth * (long)this.m + 7/*round up to next whole byte*/)/Byte.SIZE;
+            final int numLongs = (int)(fullRepresentationSize / 8/*integer division to round down*/);
+
+            if(numLongs > MAXIMUM_EXPLICIT_THRESHOLD) {
+                this.explicitThreshold = MAXIMUM_EXPLICIT_THRESHOLD;
+            } else {
+                this.explicitThreshold = numLongs;
+            }
+        } else if(expthresh == 0) {
+            this.explicitAuto = false;
+            this.explicitOff = true;
+            this.explicitThreshold = 0;
+        } else if((expthresh > 0) && (expthresh <= MAXIMUM_EXPTHRESH_PARAM)){
+            this.explicitAuto = false;
+            this.explicitOff = false;
+            this.explicitThreshold = (1 << (expthresh - 1));
+        } else {
+            throw new IllegalArgumentException("'expthresh' must be at least " + MINIMUM_EXPTHRESH_PARAM + " and at most " + MAXIMUM_EXPTHRESH_PARAM + " (was: " + expthresh + ")");
+        }
+
+        this.shortWordLength = (regwidth + log2m);
+        this.sparseOff = !sparseon;
+        if(this.sparseOff) {
+            this.sparseThreshold = 0;
+        } else {
+            // TODO improve this cutoff to include the cost overhead of Java
+            //      members/objects
+            final int largestPow2LessThanCutoff =
+                    (int)NumberUtil.log2((this.m * this.regwidth) / this.shortWordLength);
+            this.sparseThreshold = (1 << largestPow2LessThanCutoff);
+        }
+
+        initializeStorage(type);
+    }
+
+    /**
+     *  Construct an empty HLL with the given {@code log2m} and {@code regwidth}.<p/>
+     *
+     *  This is equivalent to calling <code>HLL(log2m, regwidth, -1, true, HLLType.EMPTY)</code>.
+     *
+     * @param log2m log-base-2 of the number of registers used in the HyperLogLog
+     *        algorithm. Must be at least 4 and at most 30.
+     * @param regwidth number of bits used per register in the HyperLogLog
+     *        algorithm. Must be at least 1 and at most 8.
+     *
+     * @see #HLL(int, int, int, boolean, HLLType)
+     */
+    public HLL(final int log2m, final int regwidth) {
+        this(log2m, regwidth, -1, true, HLLType.EMPTY);
+    }
+
+    // -------------------------------------------------------------------------
+    /**
+     * Convenience constructor for testing. Assumes that both {@link HLLType#EXPLICIT}
+     * and {@link HLLType#SPARSE} representations should be enabled.
+     *
+     * @param log2m log-base-2 of the number of registers used in the HyperLogLog
+     *        algorithm. Must be at least 4 and at most 30.
+     * @param regwidth number of bits used per register in the HyperLogLog
+     *        algorithm. Must be at least 1 and at most 8.
+     * @param explicitThreshold cardinality threshold at which the {@link HLLType#EXPLICIT}
+     *        representation should be promoted to {@link HLLType#SPARSE}.
+     *        This must be greater than zero and less than or equal to {@value #MAXIMUM_EXPLICIT_THRESHOLD}.
+     * @param sparseThreshold register count threshold at which the {@link HLLType#SPARSE}
+     *        representation should be promoted to {@link HLLType#FULL}.
+     *        This must be greater than zero.
+     * @param type the type in the promotion hierarchy which this instance should
+     *        start at. This cannot be <code>null</code>.
+     */
+    /*package, for testing*/ HLL(final int log2m, final int regwidth, final int explicitThreshold, final int sparseThreshold, final HLLType type) {
+        this.log2m = log2m;
+        if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
+            throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
+        }
+
+        this.regwidth = regwidth;
+        if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
+            throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
+        }
+
+        this.m = (1 << log2m);
+        this.mBitsMask = m - 1;
+        this.valueMask = (1 << regwidth) - 1;
+        this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
+        this.alphaMSquared = HLLUtil.alphaMSquared(m);
+        this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
+        this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);
+
+        this.explicitAuto = false;
+        this.explicitOff = false;
+        this.explicitThreshold = explicitThreshold;
+        if((explicitThreshold < 1) || (explicitThreshold > MAXIMUM_EXPLICIT_THRESHOLD)) {
+            throw new IllegalArgumentException("'explicitThreshold' must be at least 1 and at most " + MAXIMUM_EXPLICIT_THRESHOLD + " (was: " + explicitThreshold + ")");
+        }
+
+        this.shortWordLength = (regwidth + log2m);
+        this.sparseOff = false;
+        this.sparseThreshold = sparseThreshold;
+
+        initializeStorage(type);
+    }
+
+    /**
+     * @return the type in the promotion hierarchy of this instance. This will
+     *         never be <code>null</code>.
+     */
+    public HLLType getType() { return type; }
+
+    // ========================================================================
+    // Add
+    /**
+     * Adds <code>rawValue</code> directly to the HLL.
+     *
+     * @param  rawValue the value to be added. It is very important that this
+     *         value <em>already be hashed</em> with a strong (but not
+     *         necessarily cryptographic) hash function. For instance, the
+     *         Murmur3 implementation in
+     *         <a href="http://guava-libraries.googlecode.com/git/guava/src/com/google/common/hash/Murmur3_128HashFunction.java">
+     *         Google's Guava</a> library is an excellent hash function for this
+     *         purpose and, for seeds greater than zero, matches the output
+     *         of the hash provided in the PostgreSQL implementation.
+     */
+    public void addRaw(final long rawValue) {
+        switch(type) {
+            case EMPTY: {
+                // NOTE:  EMPTY type is always promoted on #addRaw()
+                if(explicitThreshold > 0) {
+                    initializeStorage(HLLType.EXPLICIT);
+                    explicitStorage.add(rawValue);
+                } else if(!sparseOff) {
+                    initializeStorage(HLLType.SPARSE);
+                    addRawSparseProbabilistic(rawValue);
+                } else {
+                    initializeStorage(HLLType.FULL);
+                    addRawProbabilistic(rawValue);
+                }
+                return;
+            }
+            case EXPLICIT: {
+                explicitStorage.add(rawValue);
+
+                // promotion, if necessary
+                if(explicitStorage.size() > explicitThreshold) {
+                    if(!sparseOff) {
+                        initializeStorage(HLLType.SPARSE);
+                        for (LongCursor c : explicitStorage) {
+                            addRawSparseProbabilistic(c.value);
+                        }
+                    } else {
+                        initializeStorage(HLLType.FULL);
+                        for (LongCursor c : explicitStorage) {
+                            addRawProbabilistic(c.value);
+                        }
+                    }
+                    explicitStorage = null;
+                }
+                return;
+            }
+            case SPARSE: {
+                addRawSparseProbabilistic(rawValue);
+
+                // promotion, if necessary
+                if(sparseProbabilisticStorage.size() > sparseThreshold) {
+                    initializeStorage(HLLType.FULL);
+                    for(IntByteCursor c : sparseProbabilisticStorage) {
+                        final int registerIndex = c.key;
+                        final byte registerValue = c.value;
+                        probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                    }
+                    sparseProbabilisticStorage = null;
+                }
+                return;
+            }
+            case FULL:
+                addRawProbabilistic(rawValue);
+                return;
+            default:
+                throw new RuntimeException("Unsupported HLL type " + type);
+        }
+    }
+
+    // ------------------------------------------------------------------------
+    // #addRaw(..) helpers
+    /**
+     * Adds the raw value to the {@link #sparseProbabilisticStorage}.
+     * {@link #type} must be {@link HLLType#SPARSE}.
+     *
+     * @param rawValue the raw value to add to the sparse storage.
+     */
+    private void addRawSparseProbabilistic(final long rawValue) {
+        // p(w): position of the least significant set bit (one-indexed)
+        // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
+        //
+        // By construction of pwMaxMask (see #Constructor()),
+        //      lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
+        // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
+        // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
+        final long substreamValue = (rawValue >>> log2m);
+        final byte p_w;
+
+        if(substreamValue == 0L) {
+            // The paper does not cover p(0x0), so the special value 0 is used.
+            // 0 is the original initialization value of the registers, so by
+            // doing this the multiset simply ignores it. This is acceptable
+            // because the probability is 1/(2^(2^registerSizeInBits)).
+            p_w = 0;
+        } else {
+            p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue| pwMaxMask));
+        }
+
+        // Short-circuit if the register is being set to zero, since algorithmically
+        // this corresponds to an "unset" register, and "unset" registers aren't
+        // stored to save memory. (The very reason this sparse implementation
+        // exists.) If a register is set to zero it will break the #algorithmCardinality
+        // code.
+        if(p_w == 0) {
+            return;
+        }
+
+        // NOTE:  no +1 as in paper since 0-based indexing
+        final int j = (int)(rawValue & mBitsMask);
+
+        final byte currentValue;
+        if (sparseProbabilisticStorage.containsKey(j)) {
+          currentValue = sparseProbabilisticStorage.lget();
+        } else {
+          currentValue = 0;
+        }
+
+        if(p_w > currentValue) {
+            sparseProbabilisticStorage.put(j, p_w);
+        }
+    }
+
+    /**
+     * Adds the raw value to the {@link #probabilisticStorage}.
+     * {@link #type} must be {@link HLLType#FULL}.
+     *
+     * @param rawValue the raw value to add to the full probabilistic storage.
+     */
+    private void addRawProbabilistic(final long rawValue) {
+        // p(w): position of the least significant set bit (one-indexed)
+        // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
+        //
+        // By construction of pwMaxMask (see #Constructor()),
+        //      lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
+        // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
+        // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
+        final long substreamValue = (rawValue >>> log2m);
+        final byte p_w;
+
+        if (substreamValue == 0L) {
+            // The paper does not cover p(0x0), so the special value 0 is used.
+            // 0 is the original initialization value of the registers, so by
+            // doing this the multiset simply ignores it. This is acceptable
+            // because the probability is 1/(2^(2^registerSizeInBits)).
+            p_w = 0;
+        } else {
+            p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue| pwMaxMask));
+        }
+
+        // Short-circuit if the register is being set to zero, since algorithmically
+        // this corresponds to an "unset" register, and "unset" registers aren't
+        // stored to save memory. (The very reason this sparse implementation
+        // exists.) If a register is set to zero it will break the #algorithmCardinality
+        // code.
+        if(p_w == 0) {
+            return;
+        }
+
+        // NOTE:  no +1 as in paper since 0-based indexing
+        final int j = (int)(rawValue & mBitsMask);
+
+        probabilisticStorage.setMaxRegister(j, p_w);
+    }
+
+    // ------------------------------------------------------------------------
+    // Storage helper
+    /**
+     * Initializes storage for the specified {@link HLLType} and changes the
+     * instance's {@link #type}.
+     *
+     * @param type the {@link HLLType} to initialize storage for. This cannot be
+     *        <code>null</code> and must be an instantiable type.
+     */
+    private void initializeStorage(final HLLType type) {
+        this.type = type;
+        switch(type) {
+            case EMPTY:
+                // nothing to be done
+                break;
+            case EXPLICIT:
+                this.explicitStorage = new LongOpenHashSet();
+                break;
+            case SPARSE:
+                this.sparseProbabilisticStorage = new IntByteOpenHashMap();
+                break;
+            case FULL:
+                this.probabilisticStorage = new BitVector(regwidth, m);
+                break;
+            default:
+                throw new RuntimeException("Unsupported HLL type " + type);
+        }
+    }
+
+    // ========================================================================
+    // Cardinality
+    /**
+     * Computes the cardinality of the HLL.
+     *
+     * @return the cardinality of HLL. This will never be negative.
+     */
+    public long cardinality() {
+        switch(type) {
+            case EMPTY:
+                return 0/*by definition*/;
+            case EXPLICIT:
+                return explicitStorage.size();
+            case SPARSE:
+                return (long)Math.ceil(sparseProbabilisticAlgorithmCardinality());
+            case FULL:
+                return (long)Math.ceil(fullProbabilisticAlgorithmCardinality());
+            default:
+                throw new RuntimeException("Unsupported HLL type " + type);
+        }
+    }
+
+    // ------------------------------------------------------------------------
+    // Cardinality helpers
+    /**
+     * Computes the exact cardinality value returned by the HLL algorithm when
+     * represented as a {@link HLLType#SPARSE} HLL. Kept
+     * separate from {@link #cardinality()} for testing purposes. {@link #type}
+     * must be {@link HLLType#SPARSE}.
+     *
+     * @return the exact, unrounded cardinality given by the HLL algorithm
+     */
+    /*package, for testing*/ double sparseProbabilisticAlgorithmCardinality() {
+        final int m = this.m/*for performance*/;
+
+        // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
+        // 'j'th register value
+        double sum = 0;
+        int numberOfZeroes = 0/*"V" in the paper*/;
+        for(int j=0; j<m; j++) {
+            final long register;
+            if (sparseProbabilisticStorage.containsKey(j)) {
+              register = sparseProbabilisticStorage.lget();
+            } else {
+              register = 0;
+            }
+
+            sum += 1.0 / (1L << register);
+            if(register == 0L) numberOfZeroes++;
+        }
+
+        // apply the estimate and correction to the indicator function
+        final double estimator = alphaMSquared / sum;
+        if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
+            return HLLUtil.smallEstimator(m, numberOfZeroes);
+        } else if(estimator <= largeEstimatorCutoff) {
+            return estimator;
+        } else {
+            return HLLUtil.largeEstimator(log2m, regwidth, estimator);
+        }
+    }
+
+    /**
+     * Computes the exact cardinality value returned by the HLL algorithm when
+     * represented as a {@link HLLType#FULL} HLL. Kept
+     * separate from {@link #cardinality()} for testing purposes. {@link #type}
+     * must be {@link HLLType#FULL}.
+     *
+     * @return the exact, unrounded cardinality given by the HLL algorithm
+     */
+    /*package, for testing*/ double fullProbabilisticAlgorithmCardinality() {
+        final int m = this.m/*for performance*/;
+
+        // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
+        // 'j'th register value
+        double sum = 0;
+        int numberOfZeroes = 0/*"V" in the paper*/;
+        final LongIterator iterator = probabilisticStorage.registerIterator();
+        while(iterator.hasNext()) {
+            final long register = iterator.next();
+
+            sum += 1.0 / (1L << register);
+            if(register == 0L) numberOfZeroes++;
+        }
+
+        // apply the estimate and correction to the indicator function
+        final double estimator = alphaMSquared / sum;
+        if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
+            return HLLUtil.smallEstimator(m, numberOfZeroes);
+        } else if(estimator <= largeEstimatorCutoff) {
+            return estimator;
+        } else {
+            return HLLUtil.largeEstimator(log2m, regwidth, estimator);
+        }
+    }
+
+    // ========================================================================
+    // Clear
+    /**
+     * Clears the HLL. The HLL will have cardinality zero and will act as if no
+     * elements have been added.<p/>
+     *
+     * NOTE: Unlike {@link #addRaw(long)}, <code>clear</code> does NOT handle
+     * transitions between {@link HLLType}s - a probabilistic type will remain
+     * probabilistic after being cleared.
+     */
+    public void clear() {
+        switch(type) {
+            case EMPTY:
+                return /*do nothing*/;
+            case EXPLICIT:
+                explicitStorage.clear();
+                return;
+            case SPARSE:
+                sparseProbabilisticStorage.clear();
+                return;
+            case FULL:
+                probabilisticStorage.fill(0);
+                return;
+            default:
+                throw new RuntimeException("Unsupported HLL type " + type);
+        }
+    }
+
+    // ========================================================================
+    // Union
+    /**
+     * Computes the union of HLLs and stores the result in this instance.
+     *
+     * @param other the other {@link HLL} instance to union into this one. This
+     *        cannot be <code>null</code>.
+     */
+    public void union(final HLL other) {
+        // TODO: verify HLLs are compatible
+        final HLLType otherType = other.getType();
+
+        if(type.equals(otherType)) {
+            homogeneousUnion(other);
+            return;
+        } else {
+            heterogenousUnion(other);
+            return;
+        }
+    }
+
+    // ------------------------------------------------------------------------
+    // Union helpers
+    /**
+     * Computes the union of two HLLs, of different types, and stores the
+     * result in this instance.
+     *
+     * @param other the other {@link HLL} instance to union into this one. This
+     *        cannot be <code>null</code>.
+     */
+    /*package, for testing*/ void heterogenousUnion(final HLL other) {
+        /*
+         * The logic here is divided into two sections: unions with an EMPTY
+         * HLL, and unions between EXPLICIT/SPARSE/FULL
+         * HLL.
+         *
+         * Between those two sections, all possible heterogeneous unions are
+         * covered. Should another type be added to HLLType whose unions
+         * are not easily reduced (say, as EMPTY's are below) this may be more
+         * easily implemented as Strategies. However, that is unnecessary as it
+         * stands.
+         */
+
+        // ....................................................................
+        // Union with an EMPTY
+        if(HLLType.EMPTY.equals(type)) {
+            // NOTE:  The union of empty with non-empty HLL is just a
+            //        clone of the non-empty.
+
+            switch(other.getType()) {
+                case EXPLICIT: {
+                    // src:  EXPLICIT
+                    // dest: EMPTY
+
+                    if(other.explicitStorage.size() <= explicitThreshold) {
+                        type = HLLType.EXPLICIT;
+                        explicitStorage = other.explicitStorage.clone();
+                    } else {
+                        if(!sparseOff) {
+                            initializeStorage(HLLType.SPARSE);
+                        } else {
+                            initializeStorage(HLLType.FULL);
+                        }
+                        for(LongCursor c : other.explicitStorage) {
+                            addRaw(c.value);
+                        }
+                    }
+                    return;
+                }
+                case SPARSE: {
+                    // src:  SPARSE
+                    // dest: EMPTY
+
+                    if(!sparseOff) {
+                        type = HLLType.SPARSE;
+                        sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
+                    } else {
+                        initializeStorage(HLLType.FULL);
+                        for(IntByteCursor c : other.sparseProbabilisticStorage) {
+                          final int registerIndex = c.key;
+                          final byte registerValue = c.value;
+                          probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                        }
+                    }
+                    return;
+                }
+                default/*case FULL*/: {
+                    // src:  FULL
+                    // dest: EMPTY
+
+                    type = HLLType.FULL;
+                    probabilisticStorage = other.probabilisticStorage.clone();
+                    return;
+                }
+            }
+        } else if (HLLType.EMPTY.equals(other.getType())) {
+            // source is empty, so just return destination since it is unchanged
+            return;
+        } /* else -- both of the sets are not empty */
+
+        // ....................................................................
+        // NOTE: Since EMPTY is handled above, the HLLs are non-EMPTY below
+        switch(type) {
+            case EXPLICIT: {
+                // src:  FULL/SPARSE
+                // dest: EXPLICIT
+                // "Storing into destination" cannot be done (since destination
+                // is by definition of smaller capacity than source), so a clone
+                // of source is made and values from destination are inserted
+                // into that.
+
+                // Determine source and destination storage.
+                // NOTE:  destination storage may change through promotion if
+                //        source is SPARSE.
+                if(HLLType.SPARSE.equals(other.getType())) {
+                    if(!sparseOff) {
+                        type = HLLType.SPARSE;
+                        sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
+                    } else {
+                        initializeStorage(HLLType.FULL);
+                        for(IntByteCursor c : other.sparseProbabilisticStorage) {
+                          final int registerIndex = c.key;
+                          final byte registerValue = c.value;
+                          probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                        }
+                    }
+                } else /*source is HLLType.FULL*/ {
+                    type = HLLType.FULL;
+                    probabilisticStorage = other.probabilisticStorage.clone();
+                }
+                for(LongCursor c : explicitStorage) {
+                    addRaw(c.value);
+                }
+                explicitStorage = null;
+                return;
+            }
+            case SPARSE: {
+                if(HLLType.EXPLICIT.equals(other.getType())) {
+                    // src:  EXPLICIT
+                    // dest: SPARSE
+                    // Add the raw values from the source to the destination.
+
+                    for(LongCursor c : other.explicitStorage) {
+                        addRaw(c.value);
+                    }
+                    // NOTE:  addRaw will handle promotion cleanup
+                } else /*source is HLLType.FULL*/ {
+                    // src:  FULL
+                    // dest: SPARSE
+                    // "Storing into destination" cannot be done (since destination
+                    // is by definition of smaller capacity than source), so a
+                    // clone of source is made and registers from the destination
+                    // are merged into the clone.
+
+                    type = HLLType.FULL;
+                    probabilisticStorage = other.probabilisticStorage.clone();
+                    for(IntByteCursor c : sparseProbabilisticStorage) {
+                      final int registerIndex = c.key;
+                      final byte registerValue = c.value;
+                      probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                    }
+                    sparseProbabilisticStorage = null;
+                }
+                return;
+            }
+            default/*destination is HLLType.FULL*/: {
+                if(HLLType.EXPLICIT.equals(other.getType())) {
+                    // src:  EXPLICIT
+                    // dest: FULL
+                    // Add the raw values from the source to the destination.
+                    // Promotion is not possible, so don't bother checking.
+
+                    for(LongCursor c : other.explicitStorage) {
+                        addRaw(c.value);
+                    }
+                } else /*source is HLLType.SPARSE*/ {
+                    // src:  SPARSE
+                    // dest: FULL
+                    // Merge the registers from the source into the destination.
+                    // Promotion is not possible, so don't bother checking.
+
+                    for(IntByteCursor c : other.sparseProbabilisticStorage) {
+                      final int registerIndex = c.key;
+                      final byte registerValue = c.value;
+                      probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Computes the union of two HLLs of the same type, and stores the
+     * result in this instance.
+     *
+     * @param other the other {@link HLL} instance to union into this one. This
+     *        cannot be <code>null</code>.
+     */
+    private void homogeneousUnion(final HLL other) {
+        switch(type) {
+            case EMPTY:
+                // union of empty and empty is empty
+                return;
+        case EXPLICIT:
+            for(LongCursor c : other.explicitStorage) {
+                addRaw(c.value);
+            }
+            // NOTE:  #addRaw() will handle promotion, if necessary
+            return;
+        case SPARSE:
+            for(IntByteCursor c : other.sparseProbabilisticStorage) {
+              final int registerIndex = c.key;
+              final byte registerValue = c.value;
+              final byte currentRegisterValue = sparseProbabilisticStorage.get(registerIndex);
+              if(registerValue > currentRegisterValue) {
+                sparseProbabilisticStorage.put(registerIndex, registerValue);
+              }
+            }
+
+            // promotion, if necessary
+            if(sparseProbabilisticStorage.size() > sparseThreshold) {
+                initializeStorage(HLLType.FULL);
+                for(IntByteCursor c : sparseProbabilisticStorage) {
+                  final int registerIndex = c.key;
+                  final byte registerValue = c.value;
+                  probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                }
+                sparseProbabilisticStorage = null;
+            }
+            return;
+        case FULL:
+            for(int i=0; i<m; i++) {
+                final long registerValue = other.probabilisticStorage.getRegister(i);
+                probabilisticStorage.setMaxRegister(i, registerValue);
+            }
+            return;
+        default:
+                throw new RuntimeException("Unsupported HLL type " + type);
+        }
+    }
+
+    // ========================================================================
+    // Serialization
+    /**
+     * Serializes the HLL to an array of bytes in correspondence with the format
+     * of the default schema version, {@link SerializationUtil#DEFAULT_SCHEMA_VERSION}.
+     *
+     * @return the array of bytes representing the HLL. This will never be
+     *         <code>null</code> or empty.
+     */
+    public byte[] toBytes() {
+        return toBytes(SerializationUtil.DEFAULT_SCHEMA_VERSION);
+    }
+
+    /**
+     * Serializes the HLL to an array of bytes in correspondence with the format
+     * of the specified schema version.
+     *
+     * @param  schemaVersion the schema version dictating the serialization format
+     * @return the array of bytes representing the HLL. This will never be
+     *         <code>null</code> or empty.
+     */
+    public byte[] toBytes(final ISchemaVersion schemaVersion) {
+        final byte[] bytes;
+        switch(type) {
+            case EMPTY:
+                bytes = new byte[schemaVersion.paddingBytes(type)];
+                break;
+            case EXPLICIT: {
+                final IWordSerializer serializer =
+                    schemaVersion.getSerializer(type, Long.SIZE, explicitStorage.size());
+
+                final long[] values = explicitStorage.toArray();
+                Arrays.sort(values);
+                for(final long value : values) {
+                    serializer.writeWord(value);
+                }
+
+                bytes = serializer.getBytes();
+                break;
+            }
+            case SPARSE: {
+                final IWordSerializer serializer =
+                        schemaVersion.getSerializer(type, shortWordLength, sparseProbabilisticStorage.size());
+
+                final int[] indices = sparseProbabilisticStorage.keys().toArray();
+                Arrays.sort(indices);
+                for(final int registerIndex : indices) {
+                    assert sparseProbabilisticStorage.containsKey(registerIndex);
+                    final long registerValue = sparseProbabilisticStorage.get(registerIndex);
+                    // pack index and value into "short word"
+                    final long shortWord = ((registerIndex << regwidth) | registerValue);
+                    serializer.writeWord(shortWord);
+                }
+
+                bytes = serializer.getBytes();
+                break;
+            }
+            case FULL: {
+                final IWordSerializer serializer = schemaVersion.getSerializer(type, regwidth, m);
+                probabilisticStorage.getRegisterContents(serializer);
+
+                bytes = serializer.getBytes();
+                break;
+            }
+            default:
+                throw new RuntimeException("Unsupported HLL type " + type);
+        }
+
+        final IHLLMetadata metadata = new HLLMetadata(schemaVersion.schemaVersionNumber(),
+                                                      type,
+                                                      log2m,
+                                                      regwidth,
+                                                      (int)NumberUtil.log2(explicitThreshold),
+                                                      explicitOff,
+                                                      explicitAuto,
+                                                      !sparseOff);
+        schemaVersion.writeMetadata(bytes, metadata);
+
+        return bytes;
+    }
+
+    /**
+     * Deserializes the HLL (in {@link #toBytes(ISchemaVersion)} format) serialized
+     * into <code>bytes</code>.<p/>
+     *
+     * @param  bytes the serialized bytes of new HLL
+     * @return the deserialized HLL. This will never be <code>null</code>.
+     *
+     * @see #toBytes(ISchemaVersion)
+     */
+    public static HLL fromBytes(final byte[] bytes) {
+        final ISchemaVersion schemaVersion = SerializationUtil.getSchemaVersion(bytes);
+        final IHLLMetadata metadata = schemaVersion.readMetadata(bytes);
+
+        final HLLType type = metadata.HLLType();
+        final int regwidth = metadata.registerWidth();
+        final int log2m = metadata.registerCountLog2();
+        final boolean sparseon = metadata.sparseEnabled();
+
+        final int expthresh;
+        if(metadata.explicitAuto()) {
+            expthresh = -1;
+        } else if(metadata.explicitOff()) {
+            expthresh = 0;
+        } else {
+            // NOTE: take into account that the postgres-compatible constructor
+            //       subtracts one before taking a power of two.
+            expthresh = metadata.log2ExplicitCutoff() + 1;
+        }
+
+        final HLL hll = new HLL(log2m, regwidth, expthresh, sparseon, type);
+
+        // Short-circuit on empty, which needs no other deserialization.
+        if(HLLType.EMPTY.equals(type)) {
+            return hll;
+        }
+
+        final int wordLength;
+        switch(type) {
+            case EXPLICIT:
+                wordLength = Long.SIZE;
+                break;
+            case SPARSE:
+                wordLength = hll.shortWordLength;
+                break;
+            case FULL:
+                wordLength = hll.regwidth;
+                break;
+            default:
+                throw new RuntimeException("Unsupported HLL type " + type);
+        }
+
+        final IWordDeserializer deserializer =
+                schemaVersion.getDeserializer(type, wordLength, bytes);
+        switch(type) {
+            case EXPLICIT:
+                // NOTE:  This should not exceed expthresh and this will always
+                //        be exactly the number of words that were encoded,
+                //        because the word length is at least a byte wide.
+                // SEE:   IWordDeserializer#totalWordCount()
+                for(int i=0; i<deserializer.totalWordCount(); i++) {
+                    hll.explicitStorage.add(deserializer.readWord());
+                }
+                break;
+            case SPARSE:
+                // NOTE:  If the shortWordLength were smaller than 8 bits
+                //        (1 byte) there would be a possibility (because of
+                //        padding arithmetic) of having one or more extra
+                //        registers read. However, this is not relevant as the
+                //        extra registers will be all zeroes, which are ignored
+                //        in the sparse representation.
+                for(int i=0; i<deserializer.totalWordCount(); i++) {
+                    final long shortWord = deserializer.readWord();
+                    final byte registerValue = (byte)(shortWord & hll.valueMask);
+                    // Only set non-zero registers.
+                    if (registerValue != 0) {
+                        hll.sparseProbabilisticStorage.put((int)(shortWord >>> hll.regwidth), registerValue);
+                    }
+                }
+                break;
+            case FULL:
+                // NOTE:  Iteration is done using m (register count) and NOT
+                //        deserializer#totalWordCount() because regwidth may be
+                //        less than 8 and as such the padding on the 'last' byte
+                //        may be larger than regwidth, causing an extra register
+                //        to be read.
+                // SEE: IWordDeserializer#totalWordCount()
+                for(long i=0; i<hll.m; i++) {
+                    hll.probabilisticStorage.setRegister(i, deserializer.readWord());
+                }
+                break;
+            default:
+                throw new RuntimeException("Unsupported HLL type " + type);
+        }
+
+        return hll;
+    }
+    
+    /**
+     * Create a deep copy of this HLL.
+     *
+     * @see java.lang.Object#clone()
+     */
+    @Override
+    public HLL clone() throws CloneNotSupportedException {
+        // NOTE: Since the package-only constructor assumes both explicit and
+        //       sparse are enabled, the easiest thing to do here is to re-derive
+        //       the expthresh parameter and create a new HLL with the public
+        //       constructor.
+        // TODO: add a more sensible constructor to make this less obfuscated
+        final int copyExpthresh;
+        if(explicitAuto) {
+            copyExpthresh = -1;
+        } else if(explicitOff) {
+            copyExpthresh = 0;
+        } else {
+            // explicitThreshold is defined as:
+            //
+            //      this.explicitThreshold = (1 << (expthresh - 1));
+            //
+            // Since explicitThreshold is a power of two and only has a single
+            // bit set, finding the LSB is the same as finding the inverse
+            copyExpthresh = BitUtil.leastSignificantBit(explicitThreshold) + 1;
+        }
+        final HLL copy = new HLL(log2m, regwidth, copyExpthresh, !sparseOff/*sparseOn*/, type);
+        switch(type) {
+            case EMPTY:
+                // nothing to be done
+                break;
+            case EXPLICIT:
+                copy.explicitStorage = this.explicitStorage.clone();
+                break;
+            case SPARSE:
+                copy.sparseProbabilisticStorage = this.sparseProbabilisticStorage.clone();
+                break;
+            case FULL:
+                copy.probabilisticStorage = this.probabilisticStorage.clone();
+                break;
+            default:
+                throw new RuntimeException("Unsupported HLL type " + type);
+        }
+        return copy;
+    }
+}

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,138 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A concrete {@link IHLLMetadata} implemented as a simple struct.
+ *
+ * @author timon
+ */
+class HLLMetadata implements IHLLMetadata {
+    private final int schemaVersion;
+    private final HLLType type;
+    private final int registerCountLog2;
+    private final int registerWidth;
+    private final int log2ExplicitCutoff;
+    private final boolean explicitOff;
+    private final boolean explicitAuto;
+    private final boolean sparseEnabled;
+
+    /**
+     * @param schemaVersion the schema version number of the HLL. This must
+     *        be greater than or equal to zero.
+     * @param type the {@link HLLType type} of the HLL. This cannot
+     *        be <code>null</code>.
+     * @param registerCountLog2 the log-base-2 register count parameter for
+     *        probabilistic HLLs. This must be greater than or equal to zero.
+     * @param registerWidth the register width parameter for probabilistic
+     *        HLLs. This must be greater than or equal to zero.
+     * @param log2ExplicitCutoff the log-base-2 of the explicit cardinality cutoff,
+     *        if it is explicitly defined. (If <code>explicitOff</code> or
+     *        <code>explicitAuto</code> is <code>true</code> then this has no
+     *        meaning.)
+     * @param explicitOff the flag for 'explicit off'-mode, where the
+     *        {@link HLLType#EXPLICIT} representation is not used. Both this and
+     *        <code>explicitAuto</code> cannot be <code>true</code> at the same
+     *        time.
+     * @param explicitAuto the flag for 'explicit auto'-mode, where the
+     *        {@link HLLType#EXPLICIT} representation's promotion cutoff is
+     *        determined based on in-memory size automatically. Both this and
+     *        <code>explicitOff</code> cannot be <code>true</code> at the same
+     *        time.
+     * @param sparseEnabled the flag for 'sparse-enabled'-mode, where the
+     *        {@link HLLType#SPARSE} representation is used.
+     */
+    public HLLMetadata(final int schemaVersion,
+                       final HLLType type,
+                       final int registerCountLog2,
+                       final int registerWidth,
+                       final int log2ExplicitCutoff,
+                       final boolean explicitOff,
+                       final boolean explicitAuto,
+                       final boolean sparseEnabled) {
+        this.schemaVersion = schemaVersion;
+        this.type = type;
+        this.registerCountLog2 = registerCountLog2;
+        this.registerWidth = registerWidth;
+        this.log2ExplicitCutoff = log2ExplicitCutoff;
+        this.explicitOff = explicitOff;
+        this.explicitAuto = explicitAuto;
+        this.sparseEnabled = sparseEnabled;
+    }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.IHLLMetadata#schemaVersion()
+     */
+    @Override
+    public int schemaVersion() { return schemaVersion; }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.IHLLMetadata#HLLType()
+     */
+    @Override
+    public HLLType HLLType() { return type; }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.IHLLMetadata#registerCountLog2()
+     */
+    @Override
+    public int registerCountLog2() { return registerCountLog2; }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.IHLLMetadata#registerWidth()
+     */
+    @Override
+    public int registerWidth() { return registerWidth; }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff()
+     */
+    @Override
+    public int log2ExplicitCutoff() { return log2ExplicitCutoff; }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.IHLLMetadata#explicitOff()
+     */
+    @Override
+    public boolean explicitOff() {
+        return explicitOff;
+    }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.IHLLMetadata#explicitAuto()
+     * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff()
+     */
+    @Override
+    public boolean explicitAuto() {
+        return explicitAuto;
+    }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.IHLLMetadata#sparseEnabled()
+     */
+    @Override
+    public boolean sparseEnabled() { return sparseEnabled; }
+
+    /* (non-Javadoc)
+     * @see java.lang.Object#toString()
+     */
+    @Override
+    public String toString() {
+        return "<HLLMetadata schemaVersion: " + this.schemaVersion + ", type: " + this.type.toString() + ", registerCountLog2: " + this.registerCountLog2 + ", registerWidth: " + this.registerWidth + ", log2ExplicitCutoff: " + this.log2ExplicitCutoff + ", explicitOff: " + this.explicitOff + ", explicitAuto: " +this.explicitAuto + ">";
+    }
+}

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLType.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLType.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLType.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLType.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,29 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The types of algorithm/data structure that {@link HLL} can utilize. For more
+ * information, see the Javadoc for {@link HLL}.
+ */
+public enum HLLType {
+    EMPTY,
+    EXPLICIT,
+    SPARSE,
+    FULL;
+}
\ No newline at end of file

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,199 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Static functions for computing constants and parameters used in the HLL
+ * algorithm.
+ */
+final class HLLUtil {
+    /**
+     * Precomputed <code>pwMaxMask</code> values indexed by <code>registerSizeInBits</code>.
+     * Calculated with this formula:
+     * <pre>
+     *     int maxRegisterValue = (1 << registerSizeInBits) - 1;
+     *     // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
+     *     return ~((1L << (maxRegisterValue - 1)) - 1);
+     * </pre>
+     *
+     * @see #pwMaxMask(int)
+     */
+    private static final long[] PW_MASK = {
+            ~((1L << (((1 << 0) - 1) - 1)) - 1),
+            ~((1L << (((1 << 1) - 1) - 1)) - 1),
+            ~((1L << (((1 << 2) - 1) - 1)) - 1),
+            ~((1L << (((1 << 3) - 1) - 1)) - 1),
+            ~((1L << (((1 << 4) - 1) - 1)) - 1),
+            ~((1L << (((1 << 5) - 1) - 1)) - 1),
+            ~((1L << (((1 << 6) - 1) - 1)) - 1),
+            ~((1L << (((1 << 7) - 1) - 1)) - 1),
+            ~((1L << (((1 << 8) - 1) - 1)) - 1)
+    };
+
+    /**
+     * Precomputed <code>twoToL</code> values indexed by a linear combination of
+     * <code>regWidth</code> and <code>log2m</code>.
+     *
+     * The array is one-dimensional and can be accessed by using index
+     * <code>(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m</code>
+     * for <code>regWidth</code> and <code>log2m</code> between the specified
+     * <code>HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM</code> constants.
+     *
+     * @see #largeEstimator(int, int, double)
+     * @see #largeEstimatorCutoff(int, int)
+     * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 2^L</a>"
+     */
+    private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];
+
+    /**
+     * Spacing constant used to compute offsets into {@link #TWO_TO_L}.
+     */
+    private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1;
+
+    static {
+        for(int regWidth = HLL.MINIMUM_REGWIDTH_PARAM; regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
+            for(int log2m = HLL.MINIMUM_LOG2M_PARAM ; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
+                int maxRegisterValue = (1 << regWidth) - 1;
+
+                // Since 1 is added to p(w) in the insertion algorithm, only
+                // (maxRegisterValue - 1) bits are inspected hence the hash
+                // space is one power of two smaller.
+                final int pwBits = (maxRegisterValue - 1);
+                final int totalBits = (pwBits + log2m);
+                final double twoToL = Math.pow(2, totalBits);
+                TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m] = twoToL;
+            }
+        }
+    }
+
+    // ************************************************************************
+    /**
+     * Computes the bit-width of HLL registers necessary to estimate a set of
+     * the specified cardinality.
+     *
+     * @param  expectedUniqueElements an upper bound on the number of unique
+     *         elements that are expected.  This must be greater than zero.
+     * @return a register size in bits (i.e. <code>log2(log2(n))</code>)
+     */
+    public static int registerBitSize(final long expectedUniqueElements) {
+        return Math.max(HLL.MINIMUM_REGWIDTH_PARAM,
+                        (int)Math.ceil(NumberUtil.log2(NumberUtil.log2(expectedUniqueElements))));
+    }
+
+    // ========================================================================
+    /**
+     * Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
+     *
+     * @param  m this must be a power of two, cannot be less than
+     *         16 (2<sup>4</sup>), and cannot be greater than 65536 (2<sup>16</sup>).
+     * @return gamma times <code>registerCount</code> squared where gamma is
+     *         based on the value of <code>registerCount</code>.
+     * @throws IllegalArgumentException if <code>registerCount</code> is less
+     *         than 16.
+     */
+    public static double alphaMSquared(final int m) {
+        switch(m) {
+            case 1/*2^0*/:
+            case 2/*2^1*/:
+            case 4/*2^2*/:
+            case 8/*2^3*/:
+                throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16).");
+
+            case 16/*2^4*/:
+                return 0.673 * m * m;
+
+            case 32/*2^5*/:
+                return 0.697 * m * m;
+
+            case 64/*2^6*/:
+                return 0.709 * m * m;
+
+            default/*>2^6*/:
+                return (0.7213 / (1.0 + 1.079 / m)) * m * m;
+        }
+    }
+
+    // ========================================================================
+    /**
+     * Computes a mask that prevents overflow of HyperLogLog registers.
+     *
+     * @param  registerSizeInBits the size of the HLL registers, in bits.
+     * @return mask a <code>long</code> mask to prevent overflow of the registers
+     * @see #registerBitSize(long)
+     */
+    public static long pwMaxMask(final int registerSizeInBits) {
+        return PW_MASK[registerSizeInBits];
+    }
+
+    // ========================================================================
+    /**
+     * The cutoff for using the "small range correction" formula, in the
+     * HyperLogLog algorithm.
+     *
+     * @param  m the number of registers in the HLL. <em>m<em> in the paper.
+     * @return the cutoff for the small range correction.
+     * @see #smallEstimator(int, int)
+     */
+    public static double smallEstimatorCutoff(final int m) {
+        return ((double)m * 5) / 2;
+    }
+
+    /**
+     * The "small range correction" formula from the HyperLogLog algorithm. Only
+     * appropriate if both the estimator is smaller than <pre>(5/2) * m</pre> and
+     * there are still registers that have the zero value.
+     *
+     * @param  m the number of registers in the HLL. <em>m<em> in the paper.
+     * @param  numberOfZeroes the number of registers with value zero. <em>V</em>
+     *         in the paper.
+     * @return a corrected cardinality estimate.
+     */
+    public static double smallEstimator(final int m, final int numberOfZeroes) {
+        return m * Math.log((double)m / numberOfZeroes);
+    }
+
+    /**
+     * The cutoff for using the "large range correction" formula, from the
+     * HyperLogLog algorithm, adapted for 64 bit hashes.
+     *
+     * @param  log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
+     * @param  registerSizeInBits the size of the HLL registers, in bits.
+     * @return the cutoff for the large range correction.
+     * @see #largeEstimator(int, int, double)
+     * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction' cutoff</a>"
+     */
+    public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
+        return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0;
+    }
+
+    /**
+     * The "large range correction" formula from the HyperLogLog algorithm, adapted
+     * for 64 bit hashes. Only appropriate for estimators whose value exceeds
+     * the return of {@link #largeEstimatorCutoff(int, int)}.
+     *
+     * @param  log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
+     * @param  registerSizeInBits the size of the HLL registers, in bits.
+     * @param  estimator the original estimator ("E" in the paper).
+     * @return a corrected cardinality estimate.
+     * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction'</a>"
+     */
+    public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) {
+        final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];
+        return -1 * twoToL * Math.log(1.0 - (estimator/twoToL));
+    }
+}

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,71 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The metadata and parameters associated with a HLL.
+ */
+interface IHLLMetadata {
+    /**
+     * @return the schema version of the HLL. This will never be <code>null</code>.
+     */
+    int schemaVersion();
+
+    /**
+     * @return the type of the HLL. This will never be <code>null</code>.
+     */
+    HLLType HLLType();
+
+    /**
+     * @return the log-base-2 of the register count parameter of the HLL. This
+     *         will always be greater than or equal to 4 and less than or equal
+     *         to 31.
+     */
+    int registerCountLog2();
+
+    /**
+     * @return the register width parameter of the HLL. This will always be
+     *         greater than or equal to 1 and less than or equal to 8.
+     */
+    int registerWidth();
+
+    /**
+     * @return the log-base-2 of the explicit cutoff cardinality. This will always
+     *         be greater than or equal to zero and less than 31, per the specification.
+     */
+    int log2ExplicitCutoff();
+
+    /**
+     * @return <code>true</code> if the {@link HLLType#EXPLICIT} representation
+     *         has been disabled. <code>false</code> otherwise.
+     */
+    boolean explicitOff();
+
+    /**
+     * @return <code>true</code> if the {@link HLLType#EXPLICIT} representation
+     *         cutoff cardinality is set to be automatically chosen,
+     *         <code>false</code> otherwise.
+     */
+    boolean explicitAuto();
+
+    /**
+     * @return <code>true</code> if the {@link HLLType#SPARSE} representation
+     *         is enabled.
+     */
+    boolean sparseEnabled();
+}
\ No newline at end of file

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,87 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A serialization schema for HLLs. Reads and writes HLL metadata to
+ * and from <code>byte[]</code> representations.
+ *
+ * @author timon
+ */
+interface ISchemaVersion {
+    /**
+     * The number of metadata bytes required for a serialized HLL of the
+     * specified type.
+     *
+     * @param  type the type of the serialized HLL
+     * @return the number of padding bytes needed in order to fully accommodate
+     *         the needed metadata.
+     */
+    int paddingBytes(HLLType type);
+
+    /**
+     * Writes metadata bytes to serialized HLL.
+     *
+     * @param bytes the padded data bytes of the HLL
+     * @param metadata the metadata to write to the padding bytes
+     */
+    void writeMetadata(byte[] bytes, IHLLMetadata metadata);
+
+    /**
+     * Reads the metadata bytes of the serialized HLL.
+     *
+     * @param  bytes the serialized HLL
+     * @return the HLL metadata
+     */
+    IHLLMetadata readMetadata(byte[] bytes);
+
+    /**
+     * Builds an HLL serializer that matches this schema version.
+     *
+     * @param  type the HLL type that will be serialized. This cannot be
+     *         <code>null</code>.
+     * @param  wordLength the length of the 'words' that comprise the data of the
+     *         HLL. Words must be at least 5 bits and at most 64 bits long.
+     * @param  wordCount the number of 'words' in the HLL's data.
+     * @return a byte array serializer used to serialize a HLL according
+     *         to this schema version's specification.
+     * @see #paddingBytes(HLLType)
+     * @see IWordSerializer
+     */
+    IWordSerializer getSerializer(HLLType type, int wordLength, int wordCount);
+
+    /**
+     * Builds an HLL deserializer that matches this schema version.
+     *
+     * @param  type the HLL type that will be deserialized. This cannot be
+     *         <code>null</code>.
+     * @param  wordLength the length of the 'words' that comprise the data of the
+     *         serialized HLL. Words must be at least 5 bits and at most 64
+     *         bits long.
+     * @param  bytes the serialized HLL to deserialize. This cannot be
+     *         <code>null</code>.
+     * @return a byte array deserializer used to deserialize a HLL serialized
+     *         according to this schema version's specification.
+     */
+    IWordDeserializer getDeserializer(HLLType type, int wordLength, byte[] bytes);
+
+    /**
+     * @return the schema version number.
+     */
+    int schemaVersionNumber();
+}

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,41 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Reads 'words' of a fixed width, in sequence, from a byte array.
+ */
+public interface IWordDeserializer {
+    /**
+     * @return the next word in the sequence. Should not be called more than
+     * {@link #totalWordCount()} times.
+     */
+    long readWord();
+
+    /**
+     * Returns the number of words that could be encoded in the sequence.<p/>
+     *
+     * NOTE:  the sequence that was encoded may be shorter than the value this
+     *        method returns due to padding issues within bytes. This guarantees
+     *        only an upper bound on the number of times {@link #readWord()}
+     *        can be called.
+     *
+     * @return the maximum number of words that could be read from the sequence.
+     */
+    int totalWordCount();
+}
\ No newline at end of file

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,39 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Writes 'words' of fixed width, in sequence, to a byte array.
+ */
+interface IWordSerializer {
+
+    /**
+     * Writes the word to the backing array.
+     *
+     * @param  word the word to write.
+     */
+    void writeWord(final long word);
+
+    /**
+     * Returns the backing array of <code>byte</code>s that contain the serialized
+     * words.
+     * @return the serialized words as a <code>byte[]</code>.
+     */
+    byte[] getBytes();
+
+}
\ No newline at end of file

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,35 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A <code>long</code>-based iterator.  This is not <i>is-a</i> {@link java.util.Iterator}
+ * to prevent autoboxing between <code>Long</code> and <code>long</code>.
+ */
+interface LongIterator {
+    /**
+     * @return <code>true</code> if and only if there are more elements to
+     *         iterate over.  <code>false</code> otherwise.
+     */
+    boolean hasNext();
+
+    /**
+     * @return the next <code>long</code> in the collection.
+     */
+    long next();
+}
\ No newline at end of file

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,172 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A collection of utilities to work with numbers.
+ */
+class NumberUtil {
+    // loge(2) (log-base e of 2)
+    public static final double LOGE_2 = 0.6931471805599453;
+
+    // ************************************************************************
+    /**
+     * Computes the <code>log2</code> (log-base-two) of the specified value.
+     *
+     * @param  value the <code>double</code> for which the <code>log2</code> is
+     *         desired.
+     * @return the <code>log2</code> of the specified value
+     */
+    public static double log2(final double value) {
+        // REF:  http://en.wikipedia.org/wiki/Logarithmic_scale (conversion of bases)
+        return Math.log(value) / LOGE_2;
+    }
+
+    // ========================================================================
+    // the hex characters
+    private static final char[] HEX = { '0', '1', '2', '3', '4', '5', '6', '7',
+                                        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+
+    // ------------------------------------------------------------------------
+    /**
+     * Converts the specified array of <code>byte</code>s into a string of
+     * hex characters (low <code>byte</code> first).
+     *
+     * @param  bytes the array of <code>byte</code>s that are to be converted.
+     *         This cannot be <code>null</code> though it may be empty.
+     * @param  offset the offset in <code>bytes</code> at which the bytes will
+     *         be taken.  This cannot be negative and must be less than
+     *         <code>bytes.length - 1</code>.
+     * @param  count the number of bytes to be retrieved from the specified array.
+     *         This cannot be negative.  If greater than <code>bytes.length - offset</code>
+     *         then that value is used.
+     * @return a string of at most <code>count</code> characters that represents
+     *         the specified byte array in hex.  This will never be <code>null</code>
+     *         though it may be empty if <code>bytes</code> is empty or <code>count</code>
+     *         is zero.
+     * @throws IllegalArgumentException if <code>offset</code> is greater than
+     *         or equal to <code>bytes.length</code>.
+     * @see #fromHex(String, int, int)
+     */
+    public static String toHex(final byte[] bytes, final int offset, final int count) {
+        if(offset >= bytes.length) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + bytes.length + ").")/*by contract*/;
+        final int byteCount = Math.min( (bytes.length - offset), count);
+        final int upperBound = byteCount + offset;
+
+        final char[] chars = new char[byteCount * 2/*two chars per byte*/];
+        int charIndex = 0;
+        for(int i=offset; i<upperBound; i++) {
+            final byte value = bytes[i];
+            chars[charIndex++] = HEX[(value >>> 4) & 0x0F];
+            chars[charIndex++] = HEX[value & 0x0F];
+        }
+
+        return new String(chars);
+    }
+
+    /**
+     * Converts the specified array of hex characters into an array of <code>byte</code>s
+     * (low <code>byte</code> first).
+     *
+     * @param  string the string of hex characters to be converted into <code>byte</code>s.
+     *         This cannot be <code>null</code> though it may be blank.
+     * @param  offset the offset in the string at which the characters will be
+     *         taken.  This cannot be negative and must be less than <code>string.length() - 1</code>.
+     * @param  count the number of characters to be retrieved from the specified
+     *         string.  This cannot be negative and must be divisible by two
+     *         (since there are two characters per <code>byte</code>).
+     * @return the array of <code>byte</code>s that were converted from the
+     *         specified string (in the specified range).  This will never be
+     *         <code>null</code> though it may be empty if <code>string</code>
+     *         is empty or <code>count</code> is zero.
+     * @throws IllegalArgumentException if <code>offset</code> is greater than
+     *         or equal to <code>string.length()</code> or if <code>count</code>
+     *         is not divisible by two.
+     * @see #toHex(byte[], int, int)
+     */
+    public static byte[] fromHex(final String string, final int offset, final int count) {
+        if(offset >= string.length()) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + string.length() + ").")/*by contract*/;
+        if( (count & 0x01) != 0) throw new IllegalArgumentException("Count is not divisible by two (" + count + ").")/*by contract*/;
+        final int charCount = Math.min((string.length() - offset), count);
+        final int upperBound = offset + charCount;
+
+        final byte[] bytes = new byte[charCount >>> 1/*aka /2*/];
+        int byteIndex = 0/*beginning*/;
+        for(int i=offset; i<upperBound; i+=2) {
+            bytes[byteIndex++] = (byte)(( (digit(string.charAt(i)) << 4)
+                                         | digit(string.charAt(i + 1))) & 0xFF);
+        }
+
+        return bytes;
+    }
+
+    // ------------------------------------------------------------------------
+    /**
+     * @param  character a hex character to be converted to a <code>byte</code>.
+     *         This cannot be a character other than [a-fA-F0-9].
+     * @return the value of the specified character.  This will be a value <code>0</code>
+     *         through <code>15</code>.
+     * @throws IllegalArgumentException if the specified character is not in
+     *         [a-fA-F0-9]
+     */
+    private static final int digit(final char character) {
+        switch(character) {
+            case '0':
+                return 0;
+            case '1':
+                return 1;
+            case '2':
+                return 2;
+            case '3':
+                return 3;
+            case '4':
+                return 4;
+            case '5':
+                return 5;
+            case '6':
+                return 6;
+            case '7':
+                return 7;
+            case '8':
+                return 8;
+            case '9':
+                return 9;
+            case 'a':
+            case 'A':
+                return 10;
+            case 'b':
+            case 'B':
+                return 11;
+            case 'c':
+            case 'C':
+                return 12;
+            case 'd':
+            case 'D':
+                return 13;
+            case 'e':
+            case 'E':
+                return 14;
+            case 'f':
+            case 'F':
+                return 15;
+
+            default:
+                throw new IllegalArgumentException("Character is not in [a-fA-F0-9] ('" + character + "').");
+        }
+    }
+}
\ No newline at end of file