You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2015/07/16 12:32:07 UTC
svn commit: r1691350 [2/3] - in /lucene/dev/branches/solr7787: ./ lucene/
solr/ solr/core/ solr/core/src/java/org/apache/solr/handler/component/
solr/core/src/java/org/apache/solr/search/facet/
solr/core/src/java/org/apache/solr/util/hll/ solr/core/src...
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLL.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLL.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLL.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLL.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,1088 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+import com.carrotsearch.hppc.IntByteOpenHashMap;
+import com.carrotsearch.hppc.LongOpenHashSet;
+import com.carrotsearch.hppc.cursors.IntByteCursor;
+import com.carrotsearch.hppc.cursors.LongCursor;
+
+/**
+ * A probabilistic set of hashed <code>long</code> elements. Useful for computing
+ * the approximate cardinality of a stream of data in very small storage.<p/>
+ *
+ * A modified version of the <a href="http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf">
+ * 'HyperLogLog' data structure and algorithm</a> is used, which combines both
+ * probabilistic and non-probabilistic techniques to improve the accuracy and
+ * storage requirements of the original algorithm.<p/>
+ *
+ * More specifically, initializing and storing a new {@link HLL} will
+ * allocate a sentinel value symbolizing the empty set ({@link HLLType#EMPTY}).
+ * After adding the first few values, a sorted list of unique integers is
+ * stored in a {@link HLLType#EXPLICIT} hash set. When configured, accuracy can
+ * be sacrificed for memory footprint: the values in the sorted list are
+ * "promoted" to a "{@link HLLType#SPARSE}" map-based HyperLogLog structure.
+ * Finally, when enough registers are set, the map-based HLL will be converted
+ * to a bit-packed "{@link HLLType#FULL}" HyperLogLog structure.<p/>
+ *
+ * This data structure is interoperable with the implementations found at:
+ * <ul>
+ * <li><a href="https://github.com/aggregateknowledge/postgresql-hll">postgresql-hll</a>, and</li>
+ * <li><a href="https://github.com/aggregateknowledge/js-hll">js-hll</a></li>
+ * </ul>
+ * when <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/STORAGE.markdown">properly serialized</a>.
+ */
+public class HLL implements Cloneable {
+ // minimum and maximum values for the log-base-2 of the number of registers
+ // in the HLL
+ public static final int MINIMUM_LOG2M_PARAM = 4;
+ public static final int MAXIMUM_LOG2M_PARAM = 30;
+
+ // minimum and maximum values for the register width of the HLL
+ public static final int MINIMUM_REGWIDTH_PARAM = 1;
+ public static final int MAXIMUM_REGWIDTH_PARAM = 8;
+
+ // minimum and maximum values for the 'expthresh' parameter of the
+ // constructor that is meant to match the PostgreSQL implementation's
+ // constructor and parameter names
+ public static final int MINIMUM_EXPTHRESH_PARAM = -1;
+ public static final int MAXIMUM_EXPTHRESH_PARAM = 18;
+ public static final int MAXIMUM_EXPLICIT_THRESHOLD = (1 << (MAXIMUM_EXPTHRESH_PARAM - 1)/*per storage spec*/);
+
+ // ************************************************************************
+ // Storage
+ // storage used when #type is EXPLICIT, null otherwise
+ LongOpenHashSet explicitStorage;
+ // storage used when #type is SPARSE, null otherwise
+ IntByteOpenHashMap sparseProbabilisticStorage;
+ // storage used when #type is FULL, null otherwise
+ BitVector probabilisticStorage;
+
+ // current type of this HLL instance, if this changes then so should the
+ // storage used (see above)
+ private HLLType type;
+
+ // ------------------------------------------------------------------------
+ // Characteristic parameters
+ // NOTE: These members are named to match the PostgreSQL implementation's
+ // parameters.
+ // log2(the number of probabilistic HLL registers)
+ private final int log2m;
+ // the size (width) each register in bits
+ private final int regwidth;
+
+ // ------------------------------------------------------------------------
+ // Computed constants
+ // ........................................................................
+ // EXPLICIT-specific constants
+ // flag indicating if the EXPLICIT representation should NOT be used
+ private final boolean explicitOff;
+ // flag indicating that the promotion threshold from EXPLICIT should be
+ // computed automatically
+ // NOTE: this only has meaning when 'explicitOff' is false
+ private final boolean explicitAuto;
+ // threshold (in element count) at which a EXPLICIT HLL is converted to a
+ // SPARSE or FULL HLL, always greater than or equal to zero and always a
+ // power of two OR simply zero
+ // NOTE: this only has meaning when 'explicitOff' is false
+ private final int explicitThreshold;
+
+ // ........................................................................
+ // SPARSE-specific constants
+ // the computed width of the short words
+ private final int shortWordLength;
+ // flag indicating if the SPARSE representation should not be used
+ private final boolean sparseOff;
+ // threshold (in register count) at which a SPARSE HLL is converted to a
+ // FULL HLL, always greater than zero
+ private final int sparseThreshold;
+
+ // ........................................................................
+ // Probabilistic algorithm constants
+ // the number of registers, will always be a power of 2
+ private final int m;
+ // a mask of the log2m bits set to one and the rest to zero
+ private final int mBitsMask;
+ // a mask as wide as a register (see #fromBytes())
+ private final int valueMask;
+ // mask used to ensure that p(w) does not overflow register (see #Constructor() and #addRaw())
+ private final long pwMaxMask;
+ // alpha * m^2 (the constant in the "'raw' HyperLogLog estimator")
+ private final double alphaMSquared;
+ // the cutoff value of the estimator for using the "small" range cardinality
+ // correction formula
+ private final double smallEstimatorCutoff;
+ // the cutoff value of the estimator for using the "large" range cardinality
+ // correction formula
+ private final double largeEstimatorCutoff;
+
+ // ========================================================================
+ /**
+ * NOTE: Arguments here are named and structured identically to those in the
+ * PostgreSQL implementation, which can be found
+ * <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning">here</a>.
+ *
+ * @param log2m log-base-2 of the number of registers used in the HyperLogLog
+ * algorithm. Must be at least 4 and at most 30.
+ * @param regwidth number of bits used per register in the HyperLogLog
+ * algorithm. Must be at least 1 and at most 8.
+ * @param expthresh tunes when the {@link HLLType#EXPLICIT} to
+ * {@link HLLType#SPARSE} promotion occurs,
+ * based on the set's cardinality. Must be at least -1 and at most 18.
+ * <table>
+ * <thead><tr><th><code>expthresh</code> value</th><th>Meaning</th></tr></thead>
+ * <tbody>
+ * <tr>
+ * <td>-1</td>
+ * <td>Promote at whatever cutoff makes sense for optimal memory usage. ('auto' mode)</td>
+ * </tr>
+ * <tr>
+ * <td>0</td>
+ * <td>Skip <code>EXPLICIT</code> representation in hierarchy.</td>
+ * </tr>
+ * <tr>
+ * <td>1-18</td>
+ * <td>Promote at 2<sup>expthresh - 1</sup> cardinality</td>
+ * </tr>
+ * </tbody>
+ * </table>
+ * @param sparseon Flag indicating if the {@link HLLType#SPARSE}
+ * representation should be used.
+ * @param type the type in the promotion hierarchy which this instance should
+ * start at. This cannot be <code>null</code>.
+ */
+ public HLL(final int log2m, final int regwidth, final int expthresh, final boolean sparseon, final HLLType type) {
+ this.log2m = log2m;
+ if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
+ throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
+ }
+
+ this.regwidth = regwidth;
+ if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
+ throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
+ }
+
+ this.m = (1 << log2m);
+ this.mBitsMask = m - 1;
+ this.valueMask = (1 << regwidth) - 1;
+ this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
+ this.alphaMSquared = HLLUtil.alphaMSquared(m);
+ this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
+ this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);
+
+ if(expthresh == -1) {
+ this.explicitAuto = true;
+ this.explicitOff = false;
+
+ // NOTE: This math matches the size calculation in the PostgreSQL impl.
+ final long fullRepresentationSize = (this.regwidth * (long)this.m + 7/*round up to next whole byte*/)/Byte.SIZE;
+ final int numLongs = (int)(fullRepresentationSize / 8/*integer division to round down*/);
+
+ if(numLongs > MAXIMUM_EXPLICIT_THRESHOLD) {
+ this.explicitThreshold = MAXIMUM_EXPLICIT_THRESHOLD;
+ } else {
+ this.explicitThreshold = numLongs;
+ }
+ } else if(expthresh == 0) {
+ this.explicitAuto = false;
+ this.explicitOff = true;
+ this.explicitThreshold = 0;
+ } else if((expthresh > 0) && (expthresh <= MAXIMUM_EXPTHRESH_PARAM)){
+ this.explicitAuto = false;
+ this.explicitOff = false;
+ this.explicitThreshold = (1 << (expthresh - 1));
+ } else {
+ throw new IllegalArgumentException("'expthresh' must be at least " + MINIMUM_EXPTHRESH_PARAM + " and at most " + MAXIMUM_EXPTHRESH_PARAM + " (was: " + expthresh + ")");
+ }
+
+ this.shortWordLength = (regwidth + log2m);
+ this.sparseOff = !sparseon;
+ if(this.sparseOff) {
+ this.sparseThreshold = 0;
+ } else {
+ // TODO improve this cutoff to include the cost overhead of Java
+ // members/objects
+ final int largestPow2LessThanCutoff =
+ (int)NumberUtil.log2((this.m * this.regwidth) / this.shortWordLength);
+ this.sparseThreshold = (1 << largestPow2LessThanCutoff);
+ }
+
+ initializeStorage(type);
+ }
+
+ /**
+ * Construct an empty HLL with the given {@code log2m} and {@code regwidth}.<p/>
+ *
+ * This is equivalent to calling <code>HLL(log2m, regwidth, -1, true, HLLType.EMPTY)</code>.
+ *
+ * @param log2m log-base-2 of the number of registers used in the HyperLogLog
+ * algorithm. Must be at least 4 and at most 30.
+ * @param regwidth number of bits used per register in the HyperLogLog
+ * algorithm. Must be at least 1 and at most 8.
+ *
+ * @see #HLL(int, int, int, boolean, HLLType)
+ */
+ public HLL(final int log2m, final int regwidth) {
+ this(log2m, regwidth, -1, true, HLLType.EMPTY);
+ }
+
+ // -------------------------------------------------------------------------
+ /**
+ * Convenience constructor for testing. Assumes that both {@link HLLType#EXPLICIT}
+ * and {@link HLLType#SPARSE} representations should be enabled.
+ *
+ * @param log2m log-base-2 of the number of registers used in the HyperLogLog
+ * algorithm. Must be at least 4 and at most 30.
+ * @param regwidth number of bits used per register in the HyperLogLog
+ * algorithm. Must be at least 1 and at most 8.
+ * @param explicitThreshold cardinality threshold at which the {@link HLLType#EXPLICIT}
+ * representation should be promoted to {@link HLLType#SPARSE}.
+ * This must be greater than zero and less than or equal to {@value #MAXIMUM_EXPLICIT_THRESHOLD}.
+ * @param sparseThreshold register count threshold at which the {@link HLLType#SPARSE}
+ * representation should be promoted to {@link HLLType#FULL}.
+ * This must be greater than zero.
+ * @param type the type in the promotion hierarchy which this instance should
+ * start at. This cannot be <code>null</code>.
+ */
+ /*package, for testing*/ HLL(final int log2m, final int regwidth, final int explicitThreshold, final int sparseThreshold, final HLLType type) {
+ this.log2m = log2m;
+ if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
+ throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
+ }
+
+ this.regwidth = regwidth;
+ if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
+ throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
+ }
+
+ this.m = (1 << log2m);
+ this.mBitsMask = m - 1;
+ this.valueMask = (1 << regwidth) - 1;
+ this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
+ this.alphaMSquared = HLLUtil.alphaMSquared(m);
+ this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
+ this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);
+
+ this.explicitAuto = false;
+ this.explicitOff = false;
+ this.explicitThreshold = explicitThreshold;
+ if((explicitThreshold < 1) || (explicitThreshold > MAXIMUM_EXPLICIT_THRESHOLD)) {
+ throw new IllegalArgumentException("'explicitThreshold' must be at least 1 and at most " + MAXIMUM_EXPLICIT_THRESHOLD + " (was: " + explicitThreshold + ")");
+ }
+
+ this.shortWordLength = (regwidth + log2m);
+ this.sparseOff = false;
+ this.sparseThreshold = sparseThreshold;
+
+ initializeStorage(type);
+ }
+
+ /**
+ * @return the type in the promotion hierarchy of this instance. This will
+ * never be <code>null</code>.
+ */
+ public HLLType getType() { return type; }
+
+ // ========================================================================
+ // Add
+ /**
+ * Adds <code>rawValue</code> directly to the HLL.
+ *
+ * @param rawValue the value to be added. It is very important that this
+ * value <em>already be hashed</em> with a strong (but not
+ * necessarily cryptographic) hash function. For instance, the
+ * Murmur3 implementation in
+ * <a href="http://guava-libraries.googlecode.com/git/guava/src/com/google/common/hash/Murmur3_128HashFunction.java">
+ * Google's Guava</a> library is an excellent hash function for this
+ * purpose and, for seeds greater than zero, matches the output
+ * of the hash provided in the PostgreSQL implementation.
+ */
+ public void addRaw(final long rawValue) {
+ switch(type) {
+ case EMPTY: {
+ // NOTE: EMPTY type is always promoted on #addRaw()
+ if(explicitThreshold > 0) {
+ initializeStorage(HLLType.EXPLICIT);
+ explicitStorage.add(rawValue);
+ } else if(!sparseOff) {
+ initializeStorage(HLLType.SPARSE);
+ addRawSparseProbabilistic(rawValue);
+ } else {
+ initializeStorage(HLLType.FULL);
+ addRawProbabilistic(rawValue);
+ }
+ return;
+ }
+ case EXPLICIT: {
+ explicitStorage.add(rawValue);
+
+ // promotion, if necessary
+ if(explicitStorage.size() > explicitThreshold) {
+ if(!sparseOff) {
+ initializeStorage(HLLType.SPARSE);
+ for (LongCursor c : explicitStorage) {
+ addRawSparseProbabilistic(c.value);
+ }
+ } else {
+ initializeStorage(HLLType.FULL);
+ for (LongCursor c : explicitStorage) {
+ addRawProbabilistic(c.value);
+ }
+ }
+ explicitStorage = null;
+ }
+ return;
+ }
+ case SPARSE: {
+ addRawSparseProbabilistic(rawValue);
+
+ // promotion, if necessary
+ if(sparseProbabilisticStorage.size() > sparseThreshold) {
+ initializeStorage(HLLType.FULL);
+ for(IntByteCursor c : sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ sparseProbabilisticStorage = null;
+ }
+ return;
+ }
+ case FULL:
+ addRawProbabilistic(rawValue);
+ return;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // #addRaw(..) helpers
+ /**
+ * Adds the raw value to the {@link #sparseProbabilisticStorage}.
+ * {@link #type} must be {@link HLLType#SPARSE}.
+ *
+ * @param rawValue the raw value to add to the sparse storage.
+ */
+ private void addRawSparseProbabilistic(final long rawValue) {
+ // p(w): position of the least significant set bit (one-indexed)
+ // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
+ //
+ // By construction of pwMaxMask (see #Constructor()),
+ // lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
+ // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
+ // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
+ final long substreamValue = (rawValue >>> log2m);
+ final byte p_w;
+
+ if(substreamValue == 0L) {
+ // The paper does not cover p(0x0), so the special value 0 is used.
+ // 0 is the original initialization value of the registers, so by
+ // doing this the multiset simply ignores it. This is acceptable
+ // because the probability is 1/(2^(2^registerSizeInBits)).
+ p_w = 0;
+ } else {
+ p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue| pwMaxMask));
+ }
+
+ // Short-circuit if the register is being set to zero, since algorithmically
+ // this corresponds to an "unset" register, and "unset" registers aren't
+ // stored to save memory. (The very reason this sparse implementation
+ // exists.) If a register is set to zero it will break the #algorithmCardinality
+ // code.
+ if(p_w == 0) {
+ return;
+ }
+
+ // NOTE: no +1 as in paper since 0-based indexing
+ final int j = (int)(rawValue & mBitsMask);
+
+ final byte currentValue;
+ if (sparseProbabilisticStorage.containsKey(j)) {
+ currentValue = sparseProbabilisticStorage.lget();
+ } else {
+ currentValue = 0;
+ }
+
+ if(p_w > currentValue) {
+ sparseProbabilisticStorage.put(j, p_w);
+ }
+ }
+
+ /**
+ * Adds the raw value to the {@link #probabilisticStorage}.
+ * {@link #type} must be {@link HLLType#FULL}.
+ *
+ * @param rawValue the raw value to add to the full probabilistic storage.
+ */
+ private void addRawProbabilistic(final long rawValue) {
+ // p(w): position of the least significant set bit (one-indexed)
+ // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
+ //
+ // By construction of pwMaxMask (see #Constructor()),
+ // lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
+ // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
+ // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
+ final long substreamValue = (rawValue >>> log2m);
+ final byte p_w;
+
+ if (substreamValue == 0L) {
+ // The paper does not cover p(0x0), so the special value 0 is used.
+ // 0 is the original initialization value of the registers, so by
+ // doing this the multiset simply ignores it. This is acceptable
+ // because the probability is 1/(2^(2^registerSizeInBits)).
+ p_w = 0;
+ } else {
+ p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue| pwMaxMask));
+ }
+
+ // Short-circuit if the register is being set to zero, since algorithmically
+ // this corresponds to an "unset" register, and "unset" registers aren't
+ // stored to save memory. (The very reason this sparse implementation
+ // exists.) If a register is set to zero it will break the #algorithmCardinality
+ // code.
+ if(p_w == 0) {
+ return;
+ }
+
+ // NOTE: no +1 as in paper since 0-based indexing
+ final int j = (int)(rawValue & mBitsMask);
+
+ probabilisticStorage.setMaxRegister(j, p_w);
+ }
+
+ // ------------------------------------------------------------------------
+ // Storage helper
+ /**
+ * Initializes storage for the specified {@link HLLType} and changes the
+ * instance's {@link #type}.
+ *
+ * @param type the {@link HLLType} to initialize storage for. This cannot be
+ * <code>null</code> and must be an instantiable type.
+ */
+ private void initializeStorage(final HLLType type) {
+ this.type = type;
+ switch(type) {
+ case EMPTY:
+ // nothing to be done
+ break;
+ case EXPLICIT:
+ this.explicitStorage = new LongOpenHashSet();
+ break;
+ case SPARSE:
+ this.sparseProbabilisticStorage = new IntByteOpenHashMap();
+ break;
+ case FULL:
+ this.probabilisticStorage = new BitVector(regwidth, m);
+ break;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ========================================================================
+ // Cardinality
+ /**
+ * Computes the cardinality of the HLL.
+ *
+ * @return the cardinality of HLL. This will never be negative.
+ */
+ public long cardinality() {
+ switch(type) {
+ case EMPTY:
+ return 0/*by definition*/;
+ case EXPLICIT:
+ return explicitStorage.size();
+ case SPARSE:
+ return (long)Math.ceil(sparseProbabilisticAlgorithmCardinality());
+ case FULL:
+ return (long)Math.ceil(fullProbabilisticAlgorithmCardinality());
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // Cardinality helpers
+ /**
+ * Computes the exact cardinality value returned by the HLL algorithm when
+ * represented as a {@link HLLType#SPARSE} HLL. Kept
+ * separate from {@link #cardinality()} for testing purposes. {@link #type}
+ * must be {@link HLLType#SPARSE}.
+ *
+ * @return the exact, unrounded cardinality given by the HLL algorithm
+ */
+ /*package, for testing*/ double sparseProbabilisticAlgorithmCardinality() {
+ final int m = this.m/*for performance*/;
+
+ // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
+ // 'j'th register value
+ double sum = 0;
+ int numberOfZeroes = 0/*"V" in the paper*/;
+ for(int j=0; j<m; j++) {
+ final long register;
+ if (sparseProbabilisticStorage.containsKey(j)) {
+ register = sparseProbabilisticStorage.lget();
+ } else {
+ register = 0;
+ }
+
+ sum += 1.0 / (1L << register);
+ if(register == 0L) numberOfZeroes++;
+ }
+
+ // apply the estimate and correction to the indicator function
+ final double estimator = alphaMSquared / sum;
+ if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
+ return HLLUtil.smallEstimator(m, numberOfZeroes);
+ } else if(estimator <= largeEstimatorCutoff) {
+ return estimator;
+ } else {
+ return HLLUtil.largeEstimator(log2m, regwidth, estimator);
+ }
+ }
+
+ /**
+ * Computes the exact cardinality value returned by the HLL algorithm when
+ * represented as a {@link HLLType#FULL} HLL. Kept
+ * separate from {@link #cardinality()} for testing purposes. {@link #type}
+ * must be {@link HLLType#FULL}.
+ *
+ * @return the exact, unrounded cardinality given by the HLL algorithm
+ */
+ /*package, for testing*/ double fullProbabilisticAlgorithmCardinality() {
+ final int m = this.m/*for performance*/;
+
+ // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
+ // 'j'th register value
+ double sum = 0;
+ int numberOfZeroes = 0/*"V" in the paper*/;
+ final LongIterator iterator = probabilisticStorage.registerIterator();
+ while(iterator.hasNext()) {
+ final long register = iterator.next();
+
+ sum += 1.0 / (1L << register);
+ if(register == 0L) numberOfZeroes++;
+ }
+
+ // apply the estimate and correction to the indicator function
+ final double estimator = alphaMSquared / sum;
+ if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
+ return HLLUtil.smallEstimator(m, numberOfZeroes);
+ } else if(estimator <= largeEstimatorCutoff) {
+ return estimator;
+ } else {
+ return HLLUtil.largeEstimator(log2m, regwidth, estimator);
+ }
+ }
+
+ // ========================================================================
+ // Clear
+ /**
+ * Clears the HLL. The HLL will have cardinality zero and will act as if no
+ * elements have been added.<p/>
+ *
+ * NOTE: Unlike {@link #addRaw(long)}, <code>clear</code> does NOT handle
+ * transitions between {@link HLLType}s - a probabilistic type will remain
+ * probabilistic after being cleared.
+ */
+ public void clear() {
+ switch(type) {
+ case EMPTY:
+ return /*do nothing*/;
+ case EXPLICIT:
+ explicitStorage.clear();
+ return;
+ case SPARSE:
+ sparseProbabilisticStorage.clear();
+ return;
+ case FULL:
+ probabilisticStorage.fill(0);
+ return;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ========================================================================
+ // Union
+ /**
+ * Computes the union of HLLs and stores the result in this instance.
+ *
+ * @param other the other {@link HLL} instance to union into this one. This
+ * cannot be <code>null</code>.
+ */
+ public void union(final HLL other) {
+ // TODO: verify HLLs are compatible
+ final HLLType otherType = other.getType();
+
+ if(type.equals(otherType)) {
+ homogeneousUnion(other);
+ return;
+ } else {
+ heterogenousUnion(other);
+ return;
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // Union helpers
+ /**
+ * Computes the union of two HLLs, of different types, and stores the
+ * result in this instance.
+ *
+ * @param other the other {@link HLL} instance to union into this one. This
+ * cannot be <code>null</code>.
+ */
+ /*package, for testing*/ void heterogenousUnion(final HLL other) {
+ /*
+ * The logic here is divided into two sections: unions with an EMPTY
+ * HLL, and unions between EXPLICIT/SPARSE/FULL
+ * HLL.
+ *
+ * Between those two sections, all possible heterogeneous unions are
+ * covered. Should another type be added to HLLType whose unions
+ * are not easily reduced (say, as EMPTY's are below) this may be more
+ * easily implemented as Strategies. However, that is unnecessary as it
+ * stands.
+ */
+
+ // ....................................................................
+ // Union with an EMPTY
+ if(HLLType.EMPTY.equals(type)) {
+ // NOTE: The union of empty with non-empty HLL is just a
+ // clone of the non-empty.
+
+ switch(other.getType()) {
+ case EXPLICIT: {
+ // src: EXPLICIT
+ // dest: EMPTY
+
+ if(other.explicitStorage.size() <= explicitThreshold) {
+ type = HLLType.EXPLICIT;
+ explicitStorage = other.explicitStorage.clone();
+ } else {
+ if(!sparseOff) {
+ initializeStorage(HLLType.SPARSE);
+ } else {
+ initializeStorage(HLLType.FULL);
+ }
+ for(LongCursor c : other.explicitStorage) {
+ addRaw(c.value);
+ }
+ }
+ return;
+ }
+ case SPARSE: {
+ // src: SPARSE
+ // dest: EMPTY
+
+ if(!sparseOff) {
+ type = HLLType.SPARSE;
+ sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
+ } else {
+ initializeStorage(HLLType.FULL);
+ for(IntByteCursor c : other.sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ }
+ return;
+ }
+ default/*case FULL*/: {
+ // src: FULL
+ // dest: EMPTY
+
+ type = HLLType.FULL;
+ probabilisticStorage = other.probabilisticStorage.clone();
+ return;
+ }
+ }
+ } else if (HLLType.EMPTY.equals(other.getType())) {
+ // source is empty, so just return destination since it is unchanged
+ return;
+ } /* else -- both of the sets are not empty */
+
+ // ....................................................................
+ // NOTE: Since EMPTY is handled above, the HLLs are non-EMPTY below
+ switch(type) {
+ case EXPLICIT: {
+ // src: FULL/SPARSE
+ // dest: EXPLICIT
+ // "Storing into destination" cannot be done (since destination
+ // is by definition of smaller capacity than source), so a clone
+ // of source is made and values from destination are inserted
+ // into that.
+
+ // Determine source and destination storage.
+ // NOTE: destination storage may change through promotion if
+ // source is SPARSE.
+ if(HLLType.SPARSE.equals(other.getType())) {
+ if(!sparseOff) {
+ type = HLLType.SPARSE;
+ sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
+ } else {
+ initializeStorage(HLLType.FULL);
+ for(IntByteCursor c : other.sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ }
+ } else /*source is HLLType.FULL*/ {
+ type = HLLType.FULL;
+ probabilisticStorage = other.probabilisticStorage.clone();
+ }
+ for(LongCursor c : explicitStorage) {
+ addRaw(c.value);
+ }
+ explicitStorage = null;
+ return;
+ }
+ case SPARSE: {
+ if(HLLType.EXPLICIT.equals(other.getType())) {
+ // src: EXPLICIT
+ // dest: SPARSE
+ // Add the raw values from the source to the destination.
+
+ for(LongCursor c : other.explicitStorage) {
+ addRaw(c.value);
+ }
+ // NOTE: addRaw will handle promotion cleanup
+ } else /*source is HLLType.FULL*/ {
+ // src: FULL
+ // dest: SPARSE
+ // "Storing into destination" cannot be done (since destination
+ // is by definition of smaller capacity than source), so a
+ // clone of source is made and registers from the destination
+ // are merged into the clone.
+
+ type = HLLType.FULL;
+ probabilisticStorage = other.probabilisticStorage.clone();
+ for(IntByteCursor c : sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ sparseProbabilisticStorage = null;
+ }
+ return;
+ }
+ default/*destination is HLLType.FULL*/: {
+ if(HLLType.EXPLICIT.equals(other.getType())) {
+ // src: EXPLICIT
+ // dest: FULL
+ // Add the raw values from the source to the destination.
+ // Promotion is not possible, so don't bother checking.
+
+ for(LongCursor c : other.explicitStorage) {
+ addRaw(c.value);
+ }
+ } else /*source is HLLType.SPARSE*/ {
+ // src: SPARSE
+ // dest: FULL
+ // Merge the registers from the source into the destination.
+ // Promotion is not possible, so don't bother checking.
+
+ for(IntByteCursor c : other.sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Computes the union of two HLLs of the same type, and stores the
+ * result in this instance.
+ *
+ * @param other the other {@link HLL} instance to union into this one. This
+ * cannot be <code>null</code>.
+ */
+ private void homogeneousUnion(final HLL other) {
+ switch(type) {
+ case EMPTY:
+ // union of empty and empty is empty
+ return;
+ case EXPLICIT:
+ for(LongCursor c : other.explicitStorage) {
+ addRaw(c.value);
+ }
+ // NOTE: #addRaw() will handle promotion, if necessary
+ return;
+ case SPARSE:
+ for(IntByteCursor c : other.sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ final byte currentRegisterValue = sparseProbabilisticStorage.get(registerIndex);
+ if(registerValue > currentRegisterValue) {
+ sparseProbabilisticStorage.put(registerIndex, registerValue);
+ }
+ }
+
+ // promotion, if necessary
+ if(sparseProbabilisticStorage.size() > sparseThreshold) {
+ initializeStorage(HLLType.FULL);
+ for(IntByteCursor c : sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ sparseProbabilisticStorage = null;
+ }
+ return;
+ case FULL:
+ for(int i=0; i<m; i++) {
+ final long registerValue = other.probabilisticStorage.getRegister(i);
+ probabilisticStorage.setMaxRegister(i, registerValue);
+ }
+ return;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ========================================================================
+ // Serialization
+ /**
+ * Serializes the HLL to an array of bytes in correspondence with the format
+ * of the default schema version, {@link SerializationUtil#DEFAULT_SCHEMA_VERSION}.
+ *
+ * @return the array of bytes representing the HLL. This will never be
+ * <code>null</code> or empty.
+ */
+ public byte[] toBytes() {
+ return toBytes(SerializationUtil.DEFAULT_SCHEMA_VERSION);
+ }
+
+ /**
+ * Serializes the HLL to an array of bytes in correspondence with the format
+ * of the specified schema version.
+ *
+ * @param schemaVersion the schema version dictating the serialization format
+ * @return the array of bytes representing the HLL. This will never be
+ * <code>null</code> or empty.
+ */
+ public byte[] toBytes(final ISchemaVersion schemaVersion) {
+ final byte[] bytes;
+ switch(type) {
+ case EMPTY:
+ bytes = new byte[schemaVersion.paddingBytes(type)];
+ break;
+ case EXPLICIT: {
+ final IWordSerializer serializer =
+ schemaVersion.getSerializer(type, Long.SIZE, explicitStorage.size());
+
+ final long[] values = explicitStorage.toArray();
+ Arrays.sort(values);
+ for(final long value : values) {
+ serializer.writeWord(value);
+ }
+
+ bytes = serializer.getBytes();
+ break;
+ }
+ case SPARSE: {
+ final IWordSerializer serializer =
+ schemaVersion.getSerializer(type, shortWordLength, sparseProbabilisticStorage.size());
+
+ final int[] indices = sparseProbabilisticStorage.keys().toArray();
+ Arrays.sort(indices);
+ for(final int registerIndex : indices) {
+ assert sparseProbabilisticStorage.containsKey(registerIndex);
+ final long registerValue = sparseProbabilisticStorage.get(registerIndex);
+ // pack index and value into "short word"
+ final long shortWord = ((registerIndex << regwidth) | registerValue);
+ serializer.writeWord(shortWord);
+ }
+
+ bytes = serializer.getBytes();
+ break;
+ }
+ case FULL: {
+ final IWordSerializer serializer = schemaVersion.getSerializer(type, regwidth, m);
+ probabilisticStorage.getRegisterContents(serializer);
+
+ bytes = serializer.getBytes();
+ break;
+ }
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+
+ final IHLLMetadata metadata = new HLLMetadata(schemaVersion.schemaVersionNumber(),
+ type,
+ log2m,
+ regwidth,
+ (int)NumberUtil.log2(explicitThreshold),
+ explicitOff,
+ explicitAuto,
+ !sparseOff);
+ schemaVersion.writeMetadata(bytes, metadata);
+
+ return bytes;
+ }
+
+ /**
+ * Deserializes the HLL (in {@link #toBytes(ISchemaVersion)} format) serialized
+ * into <code>bytes</code>.<p/>
+ *
+ * @param bytes the serialized bytes of new HLL
+ * @return the deserialized HLL. This will never be <code>null</code>.
+ *
+ * @see #toBytes(ISchemaVersion)
+ */
+ public static HLL fromBytes(final byte[] bytes) {
+ final ISchemaVersion schemaVersion = SerializationUtil.getSchemaVersion(bytes);
+ final IHLLMetadata metadata = schemaVersion.readMetadata(bytes);
+
+ final HLLType type = metadata.HLLType();
+ final int regwidth = metadata.registerWidth();
+ final int log2m = metadata.registerCountLog2();
+ final boolean sparseon = metadata.sparseEnabled();
+
+ final int expthresh;
+ if(metadata.explicitAuto()) {
+ expthresh = -1;
+ } else if(metadata.explicitOff()) {
+ expthresh = 0;
+ } else {
+ // NOTE: take into account that the postgres-compatible constructor
+ // subtracts one before taking a power of two.
+ expthresh = metadata.log2ExplicitCutoff() + 1;
+ }
+
+ final HLL hll = new HLL(log2m, regwidth, expthresh, sparseon, type);
+
+ // Short-circuit on empty, which needs no other deserialization.
+ if(HLLType.EMPTY.equals(type)) {
+ return hll;
+ }
+
+ final int wordLength;
+ switch(type) {
+ case EXPLICIT:
+ wordLength = Long.SIZE;
+ break;
+ case SPARSE:
+ wordLength = hll.shortWordLength;
+ break;
+ case FULL:
+ wordLength = hll.regwidth;
+ break;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+
+ final IWordDeserializer deserializer =
+ schemaVersion.getDeserializer(type, wordLength, bytes);
+ switch(type) {
+ case EXPLICIT:
+ // NOTE: This should not exceed expthresh and this will always
+ // be exactly the number of words that were encoded,
+ // because the word length is at least a byte wide.
+ // SEE: IWordDeserializer#totalWordCount()
+ for(int i=0; i<deserializer.totalWordCount(); i++) {
+ hll.explicitStorage.add(deserializer.readWord());
+ }
+ break;
+ case SPARSE:
+ // NOTE: If the shortWordLength were smaller than 8 bits
+ // (1 byte) there would be a possibility (because of
+ // padding arithmetic) of having one or more extra
+ // registers read. However, this is not relevant as the
+ // extra registers will be all zeroes, which are ignored
+ // in the sparse representation.
+ for(int i=0; i<deserializer.totalWordCount(); i++) {
+ final long shortWord = deserializer.readWord();
+ final byte registerValue = (byte)(shortWord & hll.valueMask);
+ // Only set non-zero registers.
+ if (registerValue != 0) {
+ hll.sparseProbabilisticStorage.put((int)(shortWord >>> hll.regwidth), registerValue);
+ }
+ }
+ break;
+ case FULL:
+ // NOTE: Iteration is done using m (register count) and NOT
+ // deserializer#totalWordCount() because regwidth may be
+ // less than 8 and as such the padding on the 'last' byte
+ // may be larger than regwidth, causing an extra register
+ // to be read.
+ // SEE: IWordDeserializer#totalWordCount()
+ for(long i=0; i<hll.m; i++) {
+ hll.probabilisticStorage.setRegister(i, deserializer.readWord());
+ }
+ break;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+
+ return hll;
+ }
+
+ /**
+ * Create a deep copy of this HLL.
+ *
+ * @see java.lang.Object#clone()
+ */
+ @Override
+ public HLL clone() throws CloneNotSupportedException {
+ // NOTE: Since the package-only constructor assumes both explicit and
+ // sparse are enabled, the easiest thing to do here is to re-derive
+ // the expthresh parameter and create a new HLL with the public
+ // constructor.
+ // TODO: add a more sensible constructor to make this less obfuscated
+ final int copyExpthresh;
+ if(explicitAuto) {
+ copyExpthresh = -1;
+ } else if(explicitOff) {
+ copyExpthresh = 0;
+ } else {
+ // explicitThreshold is defined as:
+ //
+ // this.explicitThreshold = (1 << (expthresh - 1));
+ //
+ // Since explicitThreshold is a power of two and only has a single
+ // bit set, finding the LSB is the same as finding the inverse
+ copyExpthresh = BitUtil.leastSignificantBit(explicitThreshold) + 1;
+ }
+ final HLL copy = new HLL(log2m, regwidth, copyExpthresh, !sparseOff/*sparseOn*/, type);
+ switch(type) {
+ case EMPTY:
+ // nothing to be done
+ break;
+ case EXPLICIT:
+ copy.explicitStorage = this.explicitStorage.clone();
+ break;
+ case SPARSE:
+ copy.sparseProbabilisticStorage = this.sparseProbabilisticStorage.clone();
+ break;
+ case FULL:
+ copy.probabilisticStorage = this.probabilisticStorage.clone();
+ break;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ return copy;
+ }
+}
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,138 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A concrete {@link IHLLMetadata} implemented as a simple struct.
+ *
+ * @author timon
+ */
+class HLLMetadata implements IHLLMetadata {
+ private final int schemaVersion;
+ private final HLLType type;
+ private final int registerCountLog2;
+ private final int registerWidth;
+ private final int log2ExplicitCutoff;
+ private final boolean explicitOff;
+ private final boolean explicitAuto;
+ private final boolean sparseEnabled;
+
+ /**
+ * @param schemaVersion the schema version number of the HLL. This must
+ * be greater than or equal to zero.
+ * @param type the {@link HLLType type} of the HLL. This cannot
+ * be <code>null</code>.
+ * @param registerCountLog2 the log-base-2 register count parameter for
+ * probabilistic HLLs. This must be greater than or equal to zero.
+ * @param registerWidth the register width parameter for probabilistic
+ * HLLs. This must be greater than or equal to zero.
+ * @param log2ExplicitCutoff the log-base-2 of the explicit cardinality cutoff,
+ * if it is explicitly defined. (If <code>explicitOff</code> or
+ * <code>explicitAuto</code> is <code>true</code> then this has no
+ * meaning.)
+ * @param explicitOff the flag for 'explicit off'-mode, where the
+ * {@link HLLType#EXPLICIT} representation is not used. Both this and
+ * <code>explicitAuto</code> cannot be <code>true</code> at the same
+ * time.
+ * @param explicitAuto the flag for 'explicit auto'-mode, where the
+ * {@link HLLType#EXPLICIT} representation's promotion cutoff is
+ * determined based on in-memory size automatically. Both this and
+ * <code>explicitOff</code> cannot be <code>true</code> at the same
+ * time.
+ * @param sparseEnabled the flag for 'sparse-enabled'-mode, where the
+ * {@link HLLType#SPARSE} representation is used.
+ */
+ public HLLMetadata(final int schemaVersion,
+ final HLLType type,
+ final int registerCountLog2,
+ final int registerWidth,
+ final int log2ExplicitCutoff,
+ final boolean explicitOff,
+ final boolean explicitAuto,
+ final boolean sparseEnabled) {
+ this.schemaVersion = schemaVersion;
+ this.type = type;
+ this.registerCountLog2 = registerCountLog2;
+ this.registerWidth = registerWidth;
+ this.log2ExplicitCutoff = log2ExplicitCutoff;
+ this.explicitOff = explicitOff;
+ this.explicitAuto = explicitAuto;
+ this.sparseEnabled = sparseEnabled;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#schemaVersion()
+ */
+ @Override
+ public int schemaVersion() { return schemaVersion; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#HLLType()
+ */
+ @Override
+ public HLLType HLLType() { return type; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#registerCountLog2()
+ */
+ @Override
+ public int registerCountLog2() { return registerCountLog2; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#registerWidth()
+ */
+ @Override
+ public int registerWidth() { return registerWidth; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff()
+ */
+ @Override
+ public int log2ExplicitCutoff() { return log2ExplicitCutoff; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#explicitOff()
+ */
+ @Override
+ public boolean explicitOff() {
+ return explicitOff;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#explicitAuto()
+ * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff()
+ */
+ @Override
+ public boolean explicitAuto() {
+ return explicitAuto;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#sparseEnabled()
+ */
+ @Override
+ public boolean sparseEnabled() { return sparseEnabled; }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ return "<HLLMetadata schemaVersion: " + this.schemaVersion + ", type: " + this.type.toString() + ", registerCountLog2: " + this.registerCountLog2 + ", registerWidth: " + this.registerWidth + ", log2ExplicitCutoff: " + this.log2ExplicitCutoff + ", explicitOff: " + this.explicitOff + ", explicitAuto: " +this.explicitAuto + ">";
+ }
+}
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLType.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLType.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLType.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLType.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,29 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The types of algorithm/data structure that {@link HLL} can utilize. For more
+ * information, see the Javadoc for {@link HLL}.
+ */
+public enum HLLType {
+ EMPTY,
+ EXPLICIT,
+ SPARSE,
+ FULL;
+}
\ No newline at end of file
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,199 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Static functions for computing constants and parameters used in the HLL
+ * algorithm.
+ */
+final class HLLUtil {
+ /**
+ * Precomputed <code>pwMaxMask</code> values indexed by <code>registerSizeInBits</code>.
+ * Calculated with this formula:
+ * <pre>
+ * int maxRegisterValue = (1 << registerSizeInBits) - 1;
+ * // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
+ * return ~((1L << (maxRegisterValue - 1)) - 1);
+ * </pre>
+ *
+ * @see #pwMaxMask(int)
+ */
+ private static final long[] PW_MASK = {
+ ~((1L << (((1 << 0) - 1) - 1)) - 1),
+ ~((1L << (((1 << 1) - 1) - 1)) - 1),
+ ~((1L << (((1 << 2) - 1) - 1)) - 1),
+ ~((1L << (((1 << 3) - 1) - 1)) - 1),
+ ~((1L << (((1 << 4) - 1) - 1)) - 1),
+ ~((1L << (((1 << 5) - 1) - 1)) - 1),
+ ~((1L << (((1 << 6) - 1) - 1)) - 1),
+ ~((1L << (((1 << 7) - 1) - 1)) - 1),
+ ~((1L << (((1 << 8) - 1) - 1)) - 1)
+ };
+
+ /**
+ * Precomputed <code>twoToL</code> values indexed by a linear combination of
+ * <code>regWidth</code> and <code>log2m</code>.
+ *
+ * The array is one-dimensional and can be accessed by using index
+ * <code>(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m</code>
+ * for <code>regWidth</code> and <code>log2m</code> between the specified
+ * <code>HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM</code> constants.
+ *
+ * @see #largeEstimator(int, int, double)
+ * @see #largeEstimatorCutoff(int, int)
+ * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 2^L</a>"
+ */
+ private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];
+
+ /**
+ * Spacing constant used to compute offsets into {@link #TWO_TO_L}.
+ */
+ private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1;
+
+ static {
+ for(int regWidth = HLL.MINIMUM_REGWIDTH_PARAM; regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
+ for(int log2m = HLL.MINIMUM_LOG2M_PARAM ; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
+ int maxRegisterValue = (1 << regWidth) - 1;
+
+ // Since 1 is added to p(w) in the insertion algorithm, only
+ // (maxRegisterValue - 1) bits are inspected hence the hash
+ // space is one power of two smaller.
+ final int pwBits = (maxRegisterValue - 1);
+ final int totalBits = (pwBits + log2m);
+ final double twoToL = Math.pow(2, totalBits);
+ TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m] = twoToL;
+ }
+ }
+ }
+
+ // ************************************************************************
+ /**
+ * Computes the bit-width of HLL registers necessary to estimate a set of
+ * the specified cardinality.
+ *
+ * @param expectedUniqueElements an upper bound on the number of unique
+ * elements that are expected. This must be greater than zero.
+ * @return a register size in bits (i.e. <code>log2(log2(n))</code>)
+ */
+ public static int registerBitSize(final long expectedUniqueElements) {
+ return Math.max(HLL.MINIMUM_REGWIDTH_PARAM,
+ (int)Math.ceil(NumberUtil.log2(NumberUtil.log2(expectedUniqueElements))));
+ }
+
+ // ========================================================================
+ /**
+ * Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
+ *
+ * @param m this must be a power of two, cannot be less than
+ * 16 (2<sup>4</sup>), and cannot be greater than 65536 (2<sup>16</sup>).
+ * @return gamma times <code>registerCount</code> squared where gamma is
+ * based on the value of <code>registerCount</code>.
+ * @throws IllegalArgumentException if <code>registerCount</code> is less
+ * than 16.
+ */
+ public static double alphaMSquared(final int m) {
+ switch(m) {
+ case 1/*2^0*/:
+ case 2/*2^1*/:
+ case 4/*2^2*/:
+ case 8/*2^3*/:
+ throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16).");
+
+ case 16/*2^4*/:
+ return 0.673 * m * m;
+
+ case 32/*2^5*/:
+ return 0.697 * m * m;
+
+ case 64/*2^6*/:
+ return 0.709 * m * m;
+
+ default/*>2^6*/:
+ return (0.7213 / (1.0 + 1.079 / m)) * m * m;
+ }
+ }
+
+ // ========================================================================
+ /**
+ * Computes a mask that prevents overflow of HyperLogLog registers.
+ *
+ * @param registerSizeInBits the size of the HLL registers, in bits.
+ * @return mask a <code>long</code> mask to prevent overflow of the registers
+ * @see #registerBitSize(long)
+ */
+ public static long pwMaxMask(final int registerSizeInBits) {
+ return PW_MASK[registerSizeInBits];
+ }
+
+ // ========================================================================
+ /**
+ * The cutoff for using the "small range correction" formula, in the
+ * HyperLogLog algorithm.
+ *
+ * @param m the number of registers in the HLL. <em>m<em> in the paper.
+ * @return the cutoff for the small range correction.
+ * @see #smallEstimator(int, int)
+ */
+ public static double smallEstimatorCutoff(final int m) {
+ return ((double)m * 5) / 2;
+ }
+
+ /**
+ * The "small range correction" formula from the HyperLogLog algorithm. Only
+ * appropriate if both the estimator is smaller than <pre>(5/2) * m</pre> and
+ * there are still registers that have the zero value.
+ *
+ * @param m the number of registers in the HLL. <em>m<em> in the paper.
+ * @param numberOfZeroes the number of registers with value zero. <em>V</em>
+ * in the paper.
+ * @return a corrected cardinality estimate.
+ */
+ public static double smallEstimator(final int m, final int numberOfZeroes) {
+ return m * Math.log((double)m / numberOfZeroes);
+ }
+
+ /**
+ * The cutoff for using the "large range correction" formula, from the
+ * HyperLogLog algorithm, adapted for 64 bit hashes.
+ *
+ * @param log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
+ * @param registerSizeInBits the size of the HLL registers, in bits.
+ * @return the cutoff for the large range correction.
+ * @see #largeEstimator(int, int, double)
+ * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction' cutoff</a>"
+ */
+ public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
+ return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0;
+ }
+
+ /**
+ * The "large range correction" formula from the HyperLogLog algorithm, adapted
+ * for 64 bit hashes. Only appropriate for estimators whose value exceeds
+ * the return of {@link #largeEstimatorCutoff(int, int)}.
+ *
+ * @param log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
+ * @param registerSizeInBits the size of the HLL registers, in bits.
+ * @param estimator the original estimator ("E" in the paper).
+ * @return a corrected cardinality estimate.
+ * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction'</a>"
+ */
+ public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) {
+ final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];
+ return -1 * twoToL * Math.log(1.0 - (estimator/twoToL));
+ }
+}
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,71 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The metadata and parameters associated with a HLL.
+ */
+interface IHLLMetadata {
+ /**
+ * @return the schema version of the HLL. This will never be <code>null</code>.
+ */
+ int schemaVersion();
+
+ /**
+ * @return the type of the HLL. This will never be <code>null</code>.
+ */
+ HLLType HLLType();
+
+ /**
+ * @return the log-base-2 of the register count parameter of the HLL. This
+ * will always be greater than or equal to 4 and less than or equal
+ * to 31.
+ */
+ int registerCountLog2();
+
+ /**
+ * @return the register width parameter of the HLL. This will always be
+ * greater than or equal to 1 and less than or equal to 8.
+ */
+ int registerWidth();
+
+ /**
+ * @return the log-base-2 of the explicit cutoff cardinality. This will always
+ * be greater than or equal to zero and less than 31, per the specification.
+ */
+ int log2ExplicitCutoff();
+
+ /**
+ * @return <code>true</code> if the {@link HLLType#EXPLICIT} representation
+ * has been disabled. <code>false</code> otherwise.
+ */
+ boolean explicitOff();
+
+ /**
+ * @return <code>true</code> if the {@link HLLType#EXPLICIT} representation
+ * cutoff cardinality is set to be automatically chosen,
+ * <code>false</code> otherwise.
+ */
+ boolean explicitAuto();
+
+ /**
+ * @return <code>true</code> if the {@link HLLType#SPARSE} representation
+ * is enabled.
+ */
+ boolean sparseEnabled();
+}
\ No newline at end of file
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,87 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A serialization schema for HLLs. Reads and writes HLL metadata to
+ * and from <code>byte[]</code> representations.
+ *
+ * @author timon
+ */
+interface ISchemaVersion {
+ /**
+ * The number of metadata bytes required for a serialized HLL of the
+ * specified type.
+ *
+ * @param type the type of the serialized HLL
+ * @return the number of padding bytes needed in order to fully accommodate
+ * the needed metadata.
+ */
+ int paddingBytes(HLLType type);
+
+ /**
+ * Writes metadata bytes to serialized HLL.
+ *
+ * @param bytes the padded data bytes of the HLL
+ * @param metadata the metadata to write to the padding bytes
+ */
+ void writeMetadata(byte[] bytes, IHLLMetadata metadata);
+
+ /**
+ * Reads the metadata bytes of the serialized HLL.
+ *
+ * @param bytes the serialized HLL
+ * @return the HLL metadata
+ */
+ IHLLMetadata readMetadata(byte[] bytes);
+
+ /**
+ * Builds an HLL serializer that matches this schema version.
+ *
+ * @param type the HLL type that will be serialized. This cannot be
+ * <code>null</code>.
+ * @param wordLength the length of the 'words' that comprise the data of the
+ * HLL. Words must be at least 5 bits and at most 64 bits long.
+ * @param wordCount the number of 'words' in the HLL's data.
+ * @return a byte array serializer used to serialize a HLL according
+ * to this schema version's specification.
+ * @see #paddingBytes(HLLType)
+ * @see IWordSerializer
+ */
+ IWordSerializer getSerializer(HLLType type, int wordLength, int wordCount);
+
+ /**
+ * Builds an HLL deserializer that matches this schema version.
+ *
+ * @param type the HLL type that will be deserialized. This cannot be
+ * <code>null</code>.
+ * @param wordLength the length of the 'words' that comprise the data of the
+ * serialized HLL. Words must be at least 5 bits and at most 64
+ * bits long.
+ * @param bytes the serialized HLL to deserialize. This cannot be
+ * <code>null</code>.
+ * @return a byte array deserializer used to deserialize a HLL serialized
+ * according to this schema version's specification.
+ */
+ IWordDeserializer getDeserializer(HLLType type, int wordLength, byte[] bytes);
+
+ /**
+ * @return the schema version number.
+ */
+ int schemaVersionNumber();
+}
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,41 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Reads 'words' of a fixed width, in sequence, from a byte array.
+ */
+public interface IWordDeserializer {
+ /**
+ * @return the next word in the sequence. Should not be called more than
+ * {@link #totalWordCount()} times.
+ */
+ long readWord();
+
+ /**
+ * Returns the number of words that could be encoded in the sequence.<p/>
+ *
+ * NOTE: the sequence that was encoded may be shorter than the value this
+ * method returns due to padding issues within bytes. This guarantees
+ * only an upper bound on the number of times {@link #readWord()}
+ * can be called.
+ *
+ * @return the maximum number of words that could be read from the sequence.
+ */
+ int totalWordCount();
+}
\ No newline at end of file
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,39 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Writes 'words' of fixed width, in sequence, to a byte array.
+ */
+interface IWordSerializer {
+
+ /**
+ * Writes the word to the backing array.
+ *
+ * @param word the word to write.
+ */
+ void writeWord(final long word);
+
+ /**
+ * Returns the backing array of <code>byte</code>s that contain the serialized
+ * words.
+ * @return the serialized words as a <code>byte[]</code>.
+ */
+ byte[] getBytes();
+
+}
\ No newline at end of file
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,35 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A <code>long</code>-based iterator. This is not <i>is-a</i> {@link java.util.Iterator}
+ * to prevent autoboxing between <code>Long</code> and <code>long</code>.
+ */
+interface LongIterator {
+ /**
+ * @return <code>true</code> if and only if there are more elements to
+ * iterate over. <code>false</code> otherwise.
+ */
+ boolean hasNext();
+
+ /**
+ * @return the next <code>long</code> in the collection.
+ */
+ long next();
+}
\ No newline at end of file
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,172 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A collection of utilities to work with numbers.
+ */
+class NumberUtil {
+ // loge(2) (log-base e of 2)
+ public static final double LOGE_2 = 0.6931471805599453;
+
+ // ************************************************************************
+ /**
+ * Computes the <code>log2</code> (log-base-two) of the specified value.
+ *
+ * @param value the <code>double</code> for which the <code>log2</code> is
+ * desired.
+ * @return the <code>log2</code> of the specified value
+ */
+ public static double log2(final double value) {
+ // REF: http://en.wikipedia.org/wiki/Logarithmic_scale (conversion of bases)
+ return Math.log(value) / LOGE_2;
+ }
+
+ // ========================================================================
+ // the hex characters
+ private static final char[] HEX = { '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+
+ // ------------------------------------------------------------------------
+ /**
+ * Converts the specified array of <code>byte</code>s into a string of
+ * hex characters (low <code>byte</code> first).
+ *
+ * @param bytes the array of <code>byte</code>s that are to be converted.
+ * This cannot be <code>null</code> though it may be empty.
+ * @param offset the offset in <code>bytes</code> at which the bytes will
+ * be taken. This cannot be negative and must be less than
+ * <code>bytes.length - 1</code>.
+ * @param count the number of bytes to be retrieved from the specified array.
+ * This cannot be negative. If greater than <code>bytes.length - offset</code>
+ * then that value is used.
+ * @return a string of at most <code>count</code> characters that represents
+ * the specified byte array in hex. This will never be <code>null</code>
+ * though it may be empty if <code>bytes</code> is empty or <code>count</code>
+ * is zero.
+ * @throws IllegalArgumentException if <code>offset</code> is greater than
+ * or equal to <code>bytes.length</code>.
+ * @see #fromHex(String, int, int)
+ */
+ public static String toHex(final byte[] bytes, final int offset, final int count) {
+ if(offset >= bytes.length) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + bytes.length + ").")/*by contract*/;
+ final int byteCount = Math.min( (bytes.length - offset), count);
+ final int upperBound = byteCount + offset;
+
+ final char[] chars = new char[byteCount * 2/*two chars per byte*/];
+ int charIndex = 0;
+ for(int i=offset; i<upperBound; i++) {
+ final byte value = bytes[i];
+ chars[charIndex++] = HEX[(value >>> 4) & 0x0F];
+ chars[charIndex++] = HEX[value & 0x0F];
+ }
+
+ return new String(chars);
+ }
+
+ /**
+ * Converts the specified array of hex characters into an array of <code>byte</code>s
+ * (low <code>byte</code> first).
+ *
+ * @param string the string of hex characters to be converted into <code>byte</code>s.
+ * This cannot be <code>null</code> though it may be blank.
+ * @param offset the offset in the string at which the characters will be
+ * taken. This cannot be negative and must be less than <code>string.length() - 1</code>.
+ * @param count the number of characters to be retrieved from the specified
+ * string. This cannot be negative and must be divisible by two
+ * (since there are two characters per <code>byte</code>).
+ * @return the array of <code>byte</code>s that were converted from the
+ * specified string (in the specified range). This will never be
+ * <code>null</code> though it may be empty if <code>string</code>
+ * is empty or <code>count</code> is zero.
+ * @throws IllegalArgumentException if <code>offset</code> is greater than
+ * or equal to <code>string.length()</code> or if <code>count</code>
+ * is not divisible by two.
+ * @see #toHex(byte[], int, int)
+ */
+ public static byte[] fromHex(final String string, final int offset, final int count) {
+ if(offset >= string.length()) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + string.length() + ").")/*by contract*/;
+ if( (count & 0x01) != 0) throw new IllegalArgumentException("Count is not divisible by two (" + count + ").")/*by contract*/;
+ final int charCount = Math.min((string.length() - offset), count);
+ final int upperBound = offset + charCount;
+
+ final byte[] bytes = new byte[charCount >>> 1/*aka /2*/];
+ int byteIndex = 0/*beginning*/;
+ for(int i=offset; i<upperBound; i+=2) {
+ bytes[byteIndex++] = (byte)(( (digit(string.charAt(i)) << 4)
+ | digit(string.charAt(i + 1))) & 0xFF);
+ }
+
+ return bytes;
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * @param character a hex character to be converted to a <code>byte</code>.
+ * This cannot be a character other than [a-fA-F0-9].
+ * @return the value of the specified character. This will be a value <code>0</code>
+ * through <code>15</code>.
+ * @throws IllegalArgumentException if the specified character is not in
+ * [a-fA-F0-9]
+ */
+ private static final int digit(final char character) {
+ switch(character) {
+ case '0':
+ return 0;
+ case '1':
+ return 1;
+ case '2':
+ return 2;
+ case '3':
+ return 3;
+ case '4':
+ return 4;
+ case '5':
+ return 5;
+ case '6':
+ return 6;
+ case '7':
+ return 7;
+ case '8':
+ return 8;
+ case '9':
+ return 9;
+ case 'a':
+ case 'A':
+ return 10;
+ case 'b':
+ case 'B':
+ return 11;
+ case 'c':
+ case 'C':
+ return 12;
+ case 'd':
+ case 'D':
+ return 13;
+ case 'e':
+ case 'E':
+ return 14;
+ case 'f':
+ case 'F':
+ return 15;
+
+ default:
+ throw new IllegalArgumentException("Character is not in [a-fA-F0-9] ('" + character + "').");
+ }
+ }
+}
\ No newline at end of file