You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2015/07/16 12:32:07 UTC

svn commit: r1691350 [3/3] - in /lucene/dev/branches/solr7787: ./ lucene/ solr/ solr/core/ solr/core/src/java/org/apache/solr/handler/component/ solr/core/src/java/org/apache/solr/search/facet/ solr/core/src/java/org/apache/solr/util/hll/ solr/core/src...

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SchemaVersionOne.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SchemaVersionOne.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SchemaVersionOne.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SchemaVersionOne.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,156 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A concrete {@link ISchemaVersion} representing schema version one.
+ *
+ * @author timon
+ */
+class SchemaVersionOne implements ISchemaVersion {
+    /**
+     * The schema version number for this instance.
+     */
+    public static final int SCHEMA_VERSION = 1;
+
+    // ------------------------------------------------------------------------
+    // Version-specific ordinals (array position) for each of the HLL types
+    private static final HLLType[] TYPE_ORDINALS = new HLLType[] {
+        HLLType.EMPTY,
+        HLLType.EXPLICIT,
+        HLLType.SPARSE,
+        HLLType.FULL
+    };
+
+    // ------------------------------------------------------------------------
+    // number of header bytes for all HLL types
+    private static final int HEADER_BYTE_COUNT = 3;
+
+    // sentinel values from the spec for explicit off and auto
+    private static final int EXPLICIT_OFF = 0;
+    private static final int EXPLICIT_AUTO = 63;
+
+    // ************************************************************************
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.ISchemaVersion#paddingBytes(HLLType)
+     */
+    @Override
+    public int paddingBytes(final HLLType type) {
+        return HEADER_BYTE_COUNT;
+    }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.ISchemaVersion#writeMetadata(byte[], IHLLMetadata)
+     */
+    @Override
+    public void writeMetadata(final byte[] bytes, final IHLLMetadata metadata) {
+        final HLLType type = metadata.HLLType();
+        final int typeOrdinal = getOrdinal(type);
+
+        final int explicitCutoffValue;
+        if(metadata.explicitOff()) {
+            explicitCutoffValue = EXPLICIT_OFF;
+        } else if(metadata.explicitAuto()) {
+            explicitCutoffValue = EXPLICIT_AUTO;
+        } else {
+            explicitCutoffValue = metadata.log2ExplicitCutoff() + 1/*per spec*/;
+        }
+
+        bytes[0] = SerializationUtil.packVersionByte(SCHEMA_VERSION, typeOrdinal);
+        bytes[1] = SerializationUtil.packParametersByte(metadata.registerWidth(), metadata.registerCountLog2());
+        bytes[2] = SerializationUtil.packCutoffByte(explicitCutoffValue, metadata.sparseEnabled());
+    }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.ISchemaVersion#readMetadata(byte[])
+     */
+    @Override
+    public IHLLMetadata readMetadata(final byte[] bytes) {
+        final byte versionByte = bytes[0];
+        final byte parametersByte = bytes[1];
+        final byte cutoffByte = bytes[2];
+
+        final int typeOrdinal = SerializationUtil.typeOrdinal(versionByte);
+        final int explicitCutoffValue = SerializationUtil.explicitCutoff(cutoffByte);
+        final boolean explicitOff = (explicitCutoffValue == EXPLICIT_OFF);
+        final boolean explicitAuto = (explicitCutoffValue == EXPLICIT_AUTO);
+        final int log2ExplicitCutoff = (explicitOff || explicitAuto) ? -1/*sentinel*/ : (explicitCutoffValue - 1/*per spec*/);
+
+        return new HLLMetadata(SCHEMA_VERSION,
+                                    getType(typeOrdinal),
+                                    SerializationUtil.registerCountLog2(parametersByte),
+                                    SerializationUtil.registerWidth(parametersByte),
+                                    log2ExplicitCutoff,
+                                    explicitOff,
+                                    explicitAuto,
+                                    SerializationUtil.sparseEnabled(cutoffByte));
+    }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.ISchemaVersion#getSerializer(HLLType, int, int)
+     */
+    @Override
+    public IWordSerializer getSerializer(HLLType type, int wordLength, int wordCount) {
+        return new BigEndianAscendingWordSerializer(wordLength, wordCount, paddingBytes(type));
+    }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.ISchemaVersion#getDeserializer(HLLType, int, byte[])
+     */
+    @Override
+    public IWordDeserializer getDeserializer(HLLType type, int wordLength, byte[] bytes) {
+        return new BigEndianAscendingWordDeserializer(wordLength, paddingBytes(type), bytes);
+    }
+
+    /* (non-Javadoc)
+     * @see net.agkn.hll.serialization.ISchemaVersion#schemaVersionNumber()
+     */
+    @Override
+    public int schemaVersionNumber() {
+        return SCHEMA_VERSION;
+    }
+
+    // ========================================================================
+    // Type/Ordinal lookups
+    /**
+     * Gets the ordinal for the specified {@link HLLType}.
+     *
+     * @param  type the type whose ordinal is desired
+     * @return the ordinal for the specified type, to be used in the version byte.
+     *         This will always be non-negative.
+     */
+    private static int getOrdinal(final HLLType type) {
+        for(int i=0; i<TYPE_ORDINALS.length; i++) {
+            if(TYPE_ORDINALS[i].equals(type)) return i;
+        }
+        throw new RuntimeException("Unknown HLL type " + type);
+    }
+
+    /**
+     * Gets the {@link HLLType} for the specified ordinal.
+     *
+     * @param  ordinal the ordinal whose type is desired
+     * @return the type for the specified ordinal. This will never be <code>null</code>.
+     */
+    private static HLLType getType(final int ordinal) {
+        if((ordinal < 0) || (ordinal >= TYPE_ORDINALS.length)) {
+            throw new IllegalArgumentException("Invalid type ordinal '" + ordinal + "'. Only 0-" + (TYPE_ORDINALS.length - 1) + " inclusive allowed.");
+        }
+        return TYPE_ORDINALS[ordinal];
+    }
+}

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SerializationUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SerializationUtil.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SerializationUtil.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SerializationUtil.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,277 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A collection of constants and utilities for serializing and deserializing
+ * HLLs.
+ *
+ * NOTE:  'package' visibility is used for many methods that only need to be
+ *        used by the {@link ISchemaVersion} implementations. The structure of
+ *        a serialized HLL's metadata should be opaque to the rest of the
+ *        library.
+ */
+class SerializationUtil {
+    // The bit counts and masks below are declared final: each mask is derived
+    // from its bit count exactly once, so they must never drift apart.
+
+    /**
+     * The number of bits (of the parameters byte) dedicated to encoding the
+     * width of the registers.
+     */
+    /*package*/ static final int REGISTER_WIDTH_BITS = 3;
+
+    /**
+     * A mask to cap the maximum value of the register width.
+     */
+    /*package*/ static final int REGISTER_WIDTH_MASK = (1 << REGISTER_WIDTH_BITS) - 1;
+
+    /**
+     * The number of bits (of the parameters byte) dedicated to encoding
+     * <code>log2(registerCount)</code>.
+     */
+    /*package*/ static final int LOG2_REGISTER_COUNT_BITS = 5;
+
+    /**
+     * A mask to cap the maximum value of <code>log2(registerCount)</code>.
+     */
+    /*package*/ static final int LOG2_REGISTER_COUNT_MASK = (1 << LOG2_REGISTER_COUNT_BITS) - 1;
+
+    /**
+     * The number of bits (of the cutoff byte) dedicated to encoding the
+     * log-base-2 of the explicit cutoff or sentinel values for
+     * 'explicit-disabled' or 'auto'.
+     */
+    /*package*/ static final int EXPLICIT_CUTOFF_BITS = 6;
+
+    /**
+     * A mask to cap the maximum value of the explicit cutoff choice.
+     */
+    /*package*/ static final int EXPLICIT_CUTOFF_MASK = (1 << EXPLICIT_CUTOFF_BITS) - 1;
+
+    /**
+     * Number of bits in a nibble.
+     */
+    private static final int NIBBLE_BITS = 4;
+
+    /**
+     * A mask to cap the maximum value of a nibble.
+     */
+    private static final int NIBBLE_MASK = (1 << NIBBLE_BITS) - 1;
+
+    // ************************************************************************
+    // Serialization utilities
+
+    // NOTE(review): the three public fields below are not final. Confirm that
+    // no caller reassigns them before tightening their declarations.
+
+    /**
+     * Schema version one (v1).
+     */
+    public static ISchemaVersion VERSION_ONE = new SchemaVersionOne();
+
+    /**
+     * The default schema version for serializing HLLs.
+     */
+    public static ISchemaVersion DEFAULT_SCHEMA_VERSION = VERSION_ONE;
+
+    /**
+     * List of registered schema versions, indexed by their version numbers. If
+     * an entry is <code>null</code>, then no such schema version is registered.
+     * Similarly, registering a new schema version simply entails assigning an
+     * {@link ISchemaVersion} instance to the appropriate index of this array.
+     * <p>
+     * By default, only {@link SchemaVersionOne} is registered. Note that version
+     * zero will always be reserved for internal (e.g. proprietary, legacy) schema
+     * specifications/implementations and will never be assigned by this
+     * library.
+     */
+    public static ISchemaVersion[] REGISTERED_SCHEMA_VERSIONS = new ISchemaVersion[16];
+
+    static {
+        REGISTERED_SCHEMA_VERSIONS[1] = VERSION_ONE;
+    }
+
+    /**
+     * @param  schemaVersionNumber the version number of the {@link ISchemaVersion}
+     *         desired. This must be a registered schema version number.
+     * @return The {@link ISchemaVersion} for the given number. This will never
+     *         be <code>null</code>.
+     * @throws RuntimeException if the number is negative, is not a valid index
+     *         of {@link #REGISTERED_SCHEMA_VERSIONS}, or has no schema version
+     *         registered for it.
+     */
+    public static ISchemaVersion getSchemaVersion(final int schemaVersionNumber) {
+        if(schemaVersionNumber >= REGISTERED_SCHEMA_VERSIONS.length || schemaVersionNumber < 0) {
+            throw new RuntimeException("Invalid schema version number " + schemaVersionNumber);
+        }
+        final ISchemaVersion schemaVersion = REGISTERED_SCHEMA_VERSIONS[schemaVersionNumber];
+        if(schemaVersion == null) {
+            throw new RuntimeException("Unknown schema version number " + schemaVersionNumber);
+        }
+        return schemaVersion;
+    }
+
+    /**
+     * Get the appropriate {@link ISchemaVersion schema version} for the specified
+     * serialized HLL. Only the first byte (the version byte) is inspected.
+     *
+     * @param  bytes the serialized HLL whose schema version is desired. This
+     *         must hold at least one byte.
+     * @return the schema version for the specified HLL. This will never
+     *         be <code>null</code>.
+     * @throws RuntimeException if the encoded schema version is not registered.
+     */
+    public static ISchemaVersion getSchemaVersion(final byte[] bytes) {
+        final byte versionByte = bytes[0];
+        final int schemaVersionNumber = schemaVersion(versionByte);
+
+        return getSchemaVersion(schemaVersionNumber);
+    }
+
+    // ************************************************************************
+    // Package-specific shared helpers
+
+    /**
+     * Generates a byte that encodes the schema version and the type ordinal
+     * of the HLL.
+     *
+     * The top nibble is the schema version and the bottom nibble is the type
+     * ordinal. Each value is masked to four bits, so higher bits are dropped.
+     *
+     * @param schemaVersion the schema version to encode.
+     * @param typeOrdinal the type ordinal of the HLL to encode.
+     * @return the packed version byte
+     */
+    public static byte packVersionByte(final int schemaVersion, final int typeOrdinal) {
+        return (byte)(((NIBBLE_MASK & schemaVersion) << NIBBLE_BITS) | (NIBBLE_MASK & typeOrdinal));
+    }
+
+    /**
+     * Generates a byte that encodes the log-base-2 of the explicit cutoff
+     * or sentinel values for 'explicit-disabled' or 'auto', as well as the
+     * boolean indicating whether to use {@link HLLType#SPARSE}
+     * in the promotion hierarchy.
+     *
+     * The top bit is always padding, the second highest bit indicates the
+     * 'sparse-enabled' boolean, and the lowest six bits encode the explicit
+     * cutoff value.
+     *
+     * @param  explicitCutoff the explicit cutoff value to encode.
+     *         <ul>
+     *           <li>
+     *             If 'explicit-disabled' is chosen, this value should be <code>0</code>.
+     *           </li>
+     *           <li>
+     *             If 'auto' is chosen, this value should be <code>63</code>.
+     *           </li>
+     *           <li>
+     *             If a cutoff of 2<sup>n</sup> is desired, for {@code 0 <= n < 31},
+     *             this value should be <code>n + 1</code>.
+     *           </li>
+     *         </ul>
+     * @param  sparseEnabled whether {@link HLLType#SPARSE}
+     *         should be used in the promotion hierarchy to improve HLL
+     *         storage.
+     *
+     * @return the packed cutoff byte
+     */
+    public static byte packCutoffByte(final int explicitCutoff, final boolean sparseEnabled) {
+        final int sparseBit = (sparseEnabled ? (1 << EXPLICIT_CUTOFF_BITS) : 0);
+        return (byte)(sparseBit | (EXPLICIT_CUTOFF_MASK & explicitCutoff));
+    }
+
+    /**
+     * Generates a byte that encodes the parameters of a
+     * {@link HLLType#FULL} or {@link HLLType#SPARSE}
+     * HLL.
+     * <p>
+     * The top 3 bits are used to encode <code>registerWidth - 1</code>
+     * (range of <code>registerWidth</code> is thus 1-8) and the bottom 5
+     * bits are used to encode <code>registerCountLog2</code>
+     * (range of <code>registerCountLog2</code> is thus 0-31). Values outside
+     * these ranges are not rejected; they are silently truncated by the masks.
+     *
+     * @param  registerWidth the register width (must be at least 1 and at
+     *         most 8)
+     * @param  registerCountLog2 the log-base-2 of the register count (must
+     *         be at least 0 and at most 31)
+     * @return the packed parameters byte
+     */
+    public static byte packParametersByte(final int registerWidth, final int registerCountLog2) {
+        final int widthBits = ((registerWidth - 1) & REGISTER_WIDTH_MASK);
+        final int countBits = (registerCountLog2 & LOG2_REGISTER_COUNT_MASK);
+        return (byte)((widthBits << LOG2_REGISTER_COUNT_BITS) | countBits);
+    }
+
+    /**
+     * Extracts the 'sparse-enabled' boolean from the cutoff byte of a serialized
+     * HLL.
+     *
+     * @param  cutoffByte the cutoff byte of the serialized HLL
+     * @return the 'sparse-enabled' boolean
+     */
+    public static boolean sparseEnabled(final byte cutoffByte) {
+        // The byte is sign-extended to an int before shifting; the '& 1'
+        // isolates the single flag bit, so the sign extension is harmless.
+        return ((cutoffByte >>> EXPLICIT_CUTOFF_BITS) & 1) == 1;
+    }
+
+    /**
+     * Extracts the explicit cutoff value from the cutoff byte of a serialized
+     * HLL.
+     *
+     * @param  cutoffByte the cutoff byte of the serialized HLL
+     * @return the explicit cutoff value
+     */
+    public static int explicitCutoff(final byte cutoffByte) {
+        return (cutoffByte & EXPLICIT_CUTOFF_MASK);
+    }
+
+    /**
+     * Extracts the schema version from the version byte of a serialized
+     * HLL.
+     *
+     * @param  versionByte the version byte of the serialized HLL
+     * @return the schema version of the serialized HLL
+     */
+    public static int schemaVersion(final byte versionByte) {
+        // Masking after the shift discards the sign-extension bits.
+        return NIBBLE_MASK & (versionByte >>> NIBBLE_BITS);
+    }
+
+    /**
+     * Extracts the type ordinal from the version byte of a serialized HLL.
+     *
+     * @param  versionByte the version byte of the serialized HLL
+     * @return the type ordinal of the serialized HLL
+     */
+    public static int typeOrdinal(final byte versionByte) {
+        return (versionByte & NIBBLE_MASK);
+    }
+
+    /**
+     * Extracts the register width from the parameters byte of a serialized
+     * {@link HLLType#FULL} HLL.
+     *
+     * @param  parametersByte the parameters byte of the serialized HLL
+     * @return the register width of the serialized HLL (between 1 and 8)
+     *
+     * @see #packParametersByte(int, int)
+     */
+    public static int registerWidth(final byte parametersByte) {
+        // The stored value is (registerWidth - 1), hence the '+ 1'.
+        return ((parametersByte >>> LOG2_REGISTER_COUNT_BITS) & REGISTER_WIDTH_MASK) + 1;
+    }
+
+    /**
+     * Extracts the log2(registerCount) from the parameters byte of a
+     * serialized {@link HLLType#FULL} HLL.
+     *
+     * @param  parametersByte the parameters byte of the serialized HLL
+     * @return log2(registerCount) of the serialized HLL
+     *
+     * @see #packParametersByte(int, int)
+     */
+    public static int registerCountLog2(final byte parametersByte) {
+        return (parametersByte & LOG2_REGISTER_COUNT_MASK);
+    }
+}

Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/package-info.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/package-info.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/package-info.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/package-info.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+/**
+ * A fork of the <a href="https://github.com/aggregateknowledge/java-hll/">Java-HyperLogLog</a> package,
+ * tweaked so that it does not depend on fastutil, with cleanups to make it lean and clean.
+ */
+package org.apache.solr.util.hll;
+
+

Modified: lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java (original)
+++ lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java Thu Jul 16 10:32:07 2015
@@ -55,9 +55,9 @@ import org.apache.solr.util.AbstractSolr
 
 import org.apache.commons.math3.util.Combinations;
 import com.tdunning.math.stats.AVLTreeDigest;
-import net.agkn.hll.HLL;
 import com.google.common.hash.Hashing; 
-import com.google.common.hash.HashFunction; 
+import com.google.common.hash.HashFunction;
+import org.apache.solr.util.hll.HLL;
 
 import org.junit.BeforeClass;
 

Modified: lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java (original)
+++ lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java Thu Jul 16 10:32:07 2015
@@ -31,7 +31,7 @@ import org.apache.solr.client.solrj.resp
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 
-import net.agkn.hll.HLL;
+import org.apache.solr.util.hll.HLL;
 import com.google.common.hash.Hashing;
 import com.google.common.hash.HashFunction;
 

Modified: lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java (original)
+++ lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java Thu Jul 16 10:32:07 2015
@@ -28,7 +28,7 @@ import java.util.Map;
 import java.util.Random;
 
 import com.tdunning.math.stats.AVLTreeDigest;
-import net.agkn.hll.HLL;
+import org.apache.solr.util.hll.HLL;
 import org.apache.lucene.queryparser.flexible.standard.processors.NumericQueryNodeProcessor;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.packed.GrowableWriter;