You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2015/07/16 12:32:07 UTC
svn commit: r1691350 [1/3] - in /lucene/dev/branches/solr7787: ./ lucene/
solr/ solr/core/ solr/core/src/java/org/apache/solr/handler/component/
solr/core/src/java/org/apache/solr/search/facet/
solr/core/src/java/org/apache/solr/util/hll/ solr/core/src...
Author: dweiss
Date: Thu Jul 16 10:32:07 2015
New Revision: 1691350
URL: http://svn.apache.org/r1691350
Log:
SOLR-7787 (jhll integration).
Added:
lucene/dev/branches/solr7787/ (props changed)
- copied from r1691343, lucene/dev/trunk/
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitUtil.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitVector.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLL.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLType.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SchemaVersionOne.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/SerializationUtil.java (with props)
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/package-info.java (with props)
Modified:
lucene/dev/branches/solr7787/lucene/ivy-versions.properties
lucene/dev/branches/solr7787/solr/NOTICE.txt
lucene/dev/branches/solr7787/solr/core/ivy.xml
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsField.java
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/HLLAgg.java
lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/UniqueSlotAcc.java
lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java
lucene/dev/branches/solr7787/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
Modified: lucene/dev/branches/solr7787/lucene/ivy-versions.properties
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/lucene/ivy-versions.properties?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/lucene/ivy-versions.properties (original)
+++ lucene/dev/branches/solr7787/lucene/ivy-versions.properties Thu Jul 16 10:32:07 2015
@@ -73,7 +73,6 @@ com.sun.jersey.version = 1.9
/hsqldb/hsqldb = 1.8.0.10
/io.airlift/slice = 0.10
/io.netty/netty = 3.7.0.Final
-/it.unimi.dsi/fastutil = 6.5.11
/jakarta-regexp/jakarta-regexp = 1.4
/javax.activation/activation = 1.1.1
/javax.inject/javax.inject= 1
@@ -85,7 +84,6 @@ com.sun.jersey.version = 1.9
/log4j/log4j = 1.2.17
/mecab/mecab-ipadic = 2.7.0-20070801
/mecab/mecab-naist-jdic = 0.6.3b-20111013
-/net.agkn/hll = 1.6.0
/net.arnx/jsonic = 1.2.7
/net.sf.ehcache/ehcache-core = 2.4.4
/net.sf.saxon/Saxon-HE = 9.6.0-2
Modified: lucene/dev/branches/solr7787/solr/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/NOTICE.txt?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/solr/NOTICE.txt (original)
+++ lucene/dev/branches/solr7787/solr/NOTICE.txt Thu Jul 16 10:32:07 2015
@@ -13,6 +13,9 @@ including, but not limited to:
- Apache Blur
- Apache Hadoop
+This product includes code forked from the Java-HLL library.
+Copyright (c) 2013 Aggregate Knowledge, Inc., https://github.com/aggregateknowledge/java-hll/
+
This product includes the JQuery JavaScript library created by John Resig.
Copyright (c) 2010 John Resig, http://jquery.com/
Modified: lucene/dev/branches/solr7787/solr/core/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/ivy.xml?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/ivy.xml (original)
+++ lucene/dev/branches/solr7787/solr/core/ivy.xml Thu Jul 16 10:32:07 2015
@@ -134,10 +134,6 @@
<dependency org="org.antlr" name="antlr4-runtime" rev="${/org.antlr/antlr4-runtime}"/>
<dependency org="io.airlift" name="slice" rev="${/io.airlift/slice}"/>
- <!-- StatsComponents HLL Dependencies-->
- <dependency org="net.agkn" name="hll" rev="${/net.agkn/hll}" conf="compile->*"/>
- <dependency org="it.unimi.dsi" name="fastutil" rev="${/it.unimi.dsi/fastutil}" conf="compile->*"/>
-
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsField.java?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsField.java (original)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsField.java Thu Jul 16 10:32:07 2015
@@ -55,9 +55,9 @@ import org.apache.solr.search.QParserPlu
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
+import org.apache.solr.util.hll.HLL;
+import org.apache.solr.util.hll.HLLType;
-import net.agkn.hll.HLL;
-import net.agkn.hll.HLLType;
import com.google.common.hash.Hashing;
import com.google.common.hash.HashFunction;
@@ -625,8 +625,8 @@ public class StatsField {
* Creates an HllOptions based on the (local) params specified (if appropriate).
*
* @param localParams the LocalParams for this {@link StatsField}
- * @param field the field corrisponding to this {@link StatsField}, may be null if these stats are over a value source
- * @return the {@link HllOptions} to use basd on the params, or null if no {@link HLL} should be computed
+ * @param field the field corresponding to this {@link StatsField}, may be null if these stats are over a value source
+ * @return the {@link HllOptions} to use based on the params, or null if no {@link HLL} should be computed
* @throws SolrException if there are invalid options
*/
public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field)
Modified: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java (original)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java Thu Jul 16 10:32:07 2015
@@ -33,12 +33,12 @@ import org.apache.solr.handler.component
import org.apache.solr.schema.*;
import com.tdunning.math.stats.AVLTreeDigest;
-
-import net.agkn.hll.HLL;
-import net.agkn.hll.HLLType;
import com.google.common.hash.Hashing;
import com.google.common.hash.HashFunction;
+import org.apache.solr.util.hll.HLL;
+import org.apache.solr.util.hll.HLLType;
+
/**
* Factory class for creating instance of
* {@link org.apache.solr.handler.component.StatsValues}
Modified: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/HLLAgg.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/HLLAgg.java?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/HLLAgg.java (original)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/HLLAgg.java Thu Jul 16 10:32:07 2015
@@ -23,8 +23,8 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
-import net.agkn.hll.HLL;
-import net.agkn.hll.HLLType;
+import org.apache.solr.util.hll.HLL;
+import org.apache.solr.util.hll.HLLType;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
Modified: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/UniqueSlotAcc.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/UniqueSlotAcc.java?rev=1691350&r1=1691343&r2=1691350&view=diff
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/UniqueSlotAcc.java (original)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/search/facet/UniqueSlotAcc.java Thu Jul 16 10:32:07 2015
@@ -21,7 +21,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-import net.agkn.hll.HLL;
+import org.apache.solr.util.hll.HLL;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.SortedDocValues;
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,173 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A corresponding deserializer for {@link BigEndianAscendingWordSerializer}.
+ */
+class BigEndianAscendingWordDeserializer implements IWordDeserializer {
+ // The number of bits per byte.
+ private static final int BITS_PER_BYTE = 8;
+
+ // long mask for the maximum value stored in a byte
+ private static final long BYTE_MASK = (1L << BITS_PER_BYTE) - 1L;
+
+ // ************************************************************************
+ // The length in bits of the words to be read.
+ private final int wordLength;
+
+ // The byte array to which the words are serialized.
+ private final byte[] bytes;
+
+ // The number of leading padding bytes in 'bytes' to be ignored.
+ private final int bytePadding;
+
+ // The number of words that the byte array contains.
+ private final int wordCount;
+
+ // The current read state.
+ private int currentWordIndex;
+
+ // ========================================================================
+ /**
+ * @param wordLength the length in bits of the words to be deserialized. Must
+ * be less than or equal to 64 and greater than or equal to 1.
+ * @param bytePadding the number of leading bytes that pad the serialized words.
+ * Must be greater than or equal to zero.
+ * @param bytes the byte array containing the serialized words. Cannot be
+ * <code>null</code>.
+ */
+ public BigEndianAscendingWordDeserializer(final int wordLength, final int bytePadding, final byte[] bytes) {
+ if((wordLength < 1) || (wordLength > 64)) {
+ throw new IllegalArgumentException("Word length must be >= 1 and <= 64. (was: " + wordLength + ")");
+ }
+
+ if(bytePadding < 0) {
+ throw new IllegalArgumentException("Byte padding must be >= zero. (was: " + bytePadding + ")");
+ }
+
+ this.wordLength = wordLength;
+ this.bytes = bytes;
+ this.bytePadding = bytePadding;
+
+ final int dataBytes = (bytes.length - bytePadding);
+ final long dataBits = (dataBytes * BITS_PER_BYTE);
+
+ this.wordCount = (int)(dataBits/wordLength);
+
+ currentWordIndex = 0;
+ }
+
+ // ========================================================================
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IWordDeserializer#readWord()
+ */
+ @Override
+ public long readWord() {
+ final long word = readWord(currentWordIndex);
+ currentWordIndex++;
+
+ return word;
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * Reads the word at the specified sequence position (zero-indexed).
+ *
+ * @param position the zero-indexed position of the word to be read. This
+ * must be greater than or equal to zero.
+ * @return the value of the serialized word at the specified position.
+ */
+ private long readWord(final int position) {
+ if(position < 0) {
+ throw new ArrayIndexOutOfBoundsException(position);
+ }
+
+ // First bit of the word
+ final long firstBitIndex = (position * wordLength);
+ final int firstByteIndex = (bytePadding + (int)(firstBitIndex / BITS_PER_BYTE));
+ final int firstByteSkipBits = (int)(firstBitIndex % BITS_PER_BYTE);
+
+ // Last bit of the word
+ final long lastBitIndex = (firstBitIndex + wordLength - 1);
+ final int lastByteIndex = (bytePadding + (int)(lastBitIndex / BITS_PER_BYTE));
+ final int lastByteBitsToConsume;
+
+ final int bitsAfterByteBoundary = (int)((lastBitIndex + 1) % BITS_PER_BYTE);
+ // If the word terminates at the end of the last byte, consume the whole
+ // last byte.
+ if(bitsAfterByteBoundary == 0) {
+ lastByteBitsToConsume = BITS_PER_BYTE;
+ } else {
+ // Otherwise, only consume what is necessary.
+ lastByteBitsToConsume = bitsAfterByteBoundary;
+ }
+
+ if(lastByteIndex >= bytes.length) {
+ throw new ArrayIndexOutOfBoundsException("Word out of bounds of backing array.");
+ }
+
+ // Accumulator
+ long value = 0;
+
+ // --------------------------------------------------------------------
+ // First byte
+ final int bitsRemainingInFirstByte = (BITS_PER_BYTE - firstByteSkipBits);
+ final int bitsToConsumeInFirstByte = Math.min(bitsRemainingInFirstByte, wordLength);
+ long firstByte = (long)bytes[firstByteIndex];
+
+ // Mask off the bits to skip in the first byte.
+ final long firstByteMask = ((1L << bitsRemainingInFirstByte) - 1L);
+ firstByte &= firstByteMask;
+ // Right-align relevant bits of first byte.
+ firstByte >>>= (bitsRemainingInFirstByte - bitsToConsumeInFirstByte);
+
+ value |= firstByte;
+
+ // If the first byte contains the whole word, short-circuit.
+ if(firstByteIndex == lastByteIndex) {
+ return value;
+ }
+
+ // --------------------------------------------------------------------
+ // Middle bytes
+ final int middleByteCount = (lastByteIndex - firstByteIndex - 1);
+ for(int i=0; i<middleByteCount; i++) {
+ final long middleByte = (bytes[firstByteIndex + i + 1] & BYTE_MASK);
+ // Push middle byte onto accumulator.
+ value <<= BITS_PER_BYTE;
+ value |= middleByte;
+ }
+
+ // --------------------------------------------------------------------
+ // Last byte
+ long lastByte = (bytes[lastByteIndex] & BYTE_MASK);
+ lastByte >>= (BITS_PER_BYTE - lastByteBitsToConsume);
+ value <<= lastByteBitsToConsume;
+ value |= lastByte;
+ return value;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IWordDeserializer#totalWordCount()
+ */
+ @Override
+ public int totalWordCount() {
+ return wordCount;
+ }
+}
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,174 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A serializer that writes a sequence of fixed bit-width 'words' to a byte array.
+ * Bitwise OR is used to write words into bytes, so a low bit in a word is also
+ * a low bit in a byte. However, a high byte in a word is written at a lower index
+ * in the array than a low byte in a word. The first word is written at the lowest
+ * array index. Each serializer is one time use and returns its backing byte
+ * array.<p/>
+ *
+ * This encoding was chosen so that when reading bytes as octets in the typical
+ * first-octet-is-the-high-nibble fashion, an octet-to-binary conversion
+ * would yield a high-to-low, left-to-right view of the "short words".<p/>
+ *
+ * Example:<p/>
+ *
+ * Say short words are 5 bits wide. Our word sequence is the values
+ * <code>[31, 1, 5]</code>. In big-endian binary format, the values are
+ * <code>[0b11111, 0b00001, 0b00101]</code>. We use 15 of 16 bits in two bytes
+ * and pad the last (lowest) bit of the last byte with a zero:
+ *
+ * <code>
+ * [0b11111000, 0b01001010] = [0xF8, 0x4A]
+ * </code>.
+ */
+class BigEndianAscendingWordSerializer implements IWordSerializer {
+ // The number of bits per byte.
+ private static final int BITS_PER_BYTE = 8;
+
+ // ************************************************************************
+ // The length in bits of the words to be written.
+ private final int wordLength;
+ // The number of words to be written.
+ private final int wordCount;
+
+ // The byte array to which the words are serialized.
+ private final byte[] bytes;
+
+ // ------------------------------------------------------------------------
+ // Write state
+ // Number of bits that remain writable in the current byte.
+ private int bitsLeftInByte;
+ // Index of byte currently being written to.
+ private int byteIndex;
+ // Number of words written.
+ private int wordsWritten;
+
+ // ========================================================================
+ /**
+ * @param wordLength the length in bits of the words to be serialized. Must
+ * be greater than or equal to 1 and less than or equal to 64.
+ * @param wordCount the number of words to be serialized. Must be greater than
+ * or equal to zero.
+ * @param bytePadding the number of leading bytes that should pad the
+ * serialized words. Must be greater than or equal to zero.
+ */
+ public BigEndianAscendingWordSerializer(final int wordLength, final int wordCount, final int bytePadding) {
+ if((wordLength < 1) || (wordLength > 64)) {
+ throw new IllegalArgumentException("Word length must be >= 1 and <= 64. (was: " + wordLength + ")");
+ }
+ if(wordCount < 0) {
+ throw new IllegalArgumentException("Word count must be >= 0. (was: " + wordCount + ")");
+ }
+ if(bytePadding < 0) {
+ throw new IllegalArgumentException("Byte padding must be must be >= 0. (was: " + bytePadding + ")");
+ }
+
+ this.wordLength = wordLength;
+ this.wordCount = wordCount;
+
+ final long bitsRequired = (wordLength * wordCount);
+ final boolean leftoverBits = ((bitsRequired % BITS_PER_BYTE) != 0);
+ final int bytesRequired = (int)(bitsRequired / BITS_PER_BYTE) + (leftoverBits ? 1 : 0) + bytePadding;
+ bytes = new byte[bytesRequired];
+
+ bitsLeftInByte = BITS_PER_BYTE;
+ byteIndex = bytePadding;
+ wordsWritten = 0;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IWordSerializer#writeWord(long)
+ * @throws RuntimeException if the number of words written is greater than the
+ * <code>wordCount</code> parameter in the constructor.
+ */
+ @Override
+ public void writeWord(final long word) {
+ if(wordsWritten == wordCount) {
+ throw new RuntimeException("Cannot write more words, backing array full!");
+ }
+
+ int bitsLeftInWord = wordLength;
+
+ while(bitsLeftInWord > 0) {
+ // Move to the next byte if the current one is fully packed.
+ if(bitsLeftInByte == 0) {
+ byteIndex++;
+ bitsLeftInByte = BITS_PER_BYTE;
+ }
+
+ final long consumedMask;
+ if(bitsLeftInWord == 64) {
+ consumedMask = ~0L;
+ } else {
+ consumedMask = ((1L << bitsLeftInWord) - 1L);
+ }
+
+ // Fix how many bits will be written in this cycle. Choose the
+ // smaller of the remaining bits in the word or byte.
+ final int numberOfBitsToWrite = Math.min(bitsLeftInByte, bitsLeftInWord);
+ final int bitsInByteRemainingAfterWrite = (bitsLeftInByte - numberOfBitsToWrite);
+
+ // In general, we write the highest bits of the word first, so we
+ // strip the highest bits that were consumed in previous cycles.
+ final long remainingBitsOfWordToWrite = (word & consumedMask);
+
+ final long bitsThatTheByteCanAccept;
+ // If there is more left in the word than can be written to this
+ // byte, shift off the bits that can't be written off the bottom.
+ if(bitsLeftInWord > numberOfBitsToWrite) {
+ bitsThatTheByteCanAccept = (remainingBitsOfWordToWrite >>> (bitsLeftInWord - bitsLeftInByte));
+ } else {
+ // If the byte can accept all remaining bits, there is no need
+ // to shift off the bits that won't be written in this cycle.
+ bitsThatTheByteCanAccept = remainingBitsOfWordToWrite;
+ }
+
+ // Align the word bits to write up against the byte bits that have
+ // already been written. This shift may do nothing if the remainder
+ // of the byte is being consumed in this cycle.
+ final long alignedBits = (bitsThatTheByteCanAccept << bitsInByteRemainingAfterWrite);
+
+ // Update the byte with the alignedBits.
+ bytes[byteIndex] |= (byte)alignedBits;
+
+ // Update state with bit count written.
+ bitsLeftInWord -= numberOfBitsToWrite;
+ bitsLeftInByte = bitsInByteRemainingAfterWrite;
+ }
+
+ wordsWritten ++;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IWordSerializer#getBytes()
+ * @throws RuntimeException if the number of words written is fewer than the
+ * <code>wordCount</code> parameter in the constructor.
+ */
+ @Override
+ public byte[] getBytes() {
+ if(wordsWritten < wordCount) {
+ throw new RuntimeException("Not all words have been written! (" + wordsWritten + "/" + wordCount + ")");
+ }
+
+ return bytes;
+ }
+}
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitUtil.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitUtil.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitUtil.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,71 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A collection of bit utilities.
+ */
+class BitUtil {
+ /**
+ * The set of least-significant bits for a given <code>byte</code>. <code>-1</code>
+ * is used if no bits are set (so as to not be confused with "index of zero"
+ * meaning that the least significant bit is the 0th (1st) bit).
+ *
+ * @see #leastSignificantBit(long)
+ */
+ private static final int[] LEAST_SIGNIFICANT_BIT = {
+ -1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+ };
+
+ /**
+ * Computes the least-significant bit of the specified <code>long</code>
+ * that is set to <code>1</code>. Zero-indexed.
+ *
+ * @param value the <code>long</code> whose least-significant bit is desired.
+ * @return the least-significant bit of the specified <code>long</code>.
+ * <code>-1</code> is returned if there are no bits set.
+ */
+ // REF: http://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
+ // REF: http://www-graphics.stanford.edu/~seander/bithacks.html
+ public static int leastSignificantBit(final long value) {
+ if(value == 0L) return -1/*by contract*/;
+ if((value & 0xFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 0) & 0xFF)] + 0;
+ if((value & 0xFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 8) & 0xFF)] + 8;
+ if((value & 0xFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 16) & 0xFF)] + 16;
+ if((value & 0xFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 24) & 0xFF)] + 24;
+ if((value & 0xFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 32) & 0xFF)] + 32;
+ if((value & 0xFFFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 40) & 0xFF)] + 40;
+ if((value & 0xFFFFFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 48) & 0xFF)] + 48;
+ return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 56) & 0xFFL)] + 56;
+ }
+}
\ No newline at end of file
Added: lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitVector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitVector.java?rev=1691350&view=auto
==============================================================================
--- lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitVector.java (added)
+++ lucene/dev/branches/solr7787/solr/core/src/java/org/apache/solr/util/hll/BitVector.java Thu Jul 16 10:32:07 2015
@@ -0,0 +1,259 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A vector (array) of bits that is accessed in units ("registers") of <code>width</code>
+ * bits which are stored as 64bit "words" (<code>long</code>s). In this context
+ * a register is at most 64bits.
+ */
+class BitVector implements Cloneable {
+ // NOTE: in this context, a word is 64bits
+
+ // rather than doing division to determine how a bit index fits into 64bit
+ // words (i.e. longs), bit shifting is used
+ private static final int LOG2_BITS_PER_WORD = 6/*=>64bits*/;
+ private static final int BITS_PER_WORD = 1 << LOG2_BITS_PER_WORD;
+ private static final int BITS_PER_WORD_MASK = BITS_PER_WORD - 1;
+
+ // ditto from above but for bytes (for output)
+ private static final int LOG2_BITS_PER_BYTE = 3/*=>8bits*/;
+ public static final int BITS_PER_BYTE = 1 << LOG2_BITS_PER_BYTE;
+
+ // ========================================================================
+ public static final int BYTES_PER_WORD = 8/*8 bytes in a long*/;
+
+ // ************************************************************************
+ // 64bit words
+ private final long[] words;
+ public final long[] words() { return words; }
+ public final int wordCount() { return words.length; }
+ public final int byteCount() { return wordCount() * BYTES_PER_WORD; }
+
+ // the width of a register in bits (this cannot be more than 64 (the word size))
+ private final int registerWidth;
+ public final int registerWidth() { return registerWidth; }
+
+ private final long count;
+
+ // ------------------------------------------------------------------------
+ private final long registerMask;
+
+ // ========================================================================
+ /**
+ * @param width the width of each register. This cannot be negative or
+ * zero or greater than 63 (the signed word size).
+ * @param count the number of registers. This cannot be negative or zero
+ */
+ public BitVector(final int width, final long count) {
+ // ceil((width * count)/BITS_PER_WORD)
+ this.words = new long[(int)(((width * count) + BITS_PER_WORD_MASK) >>> LOG2_BITS_PER_WORD)];
+ this.registerWidth = width;
+ this.count = count;
+
+ this.registerMask = (1L << width) - 1;
+ }
+
+ // ========================================================================
+ /**
+ * @param registerIndex the index of the register whose value is to be
+ * retrieved. This cannot be negative.
+ * @return the value at the specified register index
+ * @see #setRegister(long, long)
+ * @see #setMaxRegister(long, long)
+ */
+ // NOTE: if this changes then setMaxRegister() must change
+ public long getRegister(final long registerIndex) {
+ final long bitIndex = registerIndex * registerWidth;
+ final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
+ final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
+ final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
+
+ if(firstWordIndex == secondWordIndex)
+ return ((words[firstWordIndex] >>> bitRemainder) & registerMask);
+ /* else -- register spans words */
+ return (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/
+ | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask;
+ }
+
+ /**
+ * @param registerIndex the index of the register whose value is to be set.
+ * This cannot be negative
+ * @param value the value to set in the register
+ * @see #getRegister(long)
+ * @see #setMaxRegister(long, long)
+ */
+ // NOTE: if this changes then setMaxRegister() must change
+ public void setRegister(final long registerIndex, final long value) {
+ final long bitIndex = registerIndex * registerWidth;
+ final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
+ final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
+ final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
+
+ final long words[] = this.words/*for convenience/performance*/;
+ if(firstWordIndex == secondWordIndex) {
+ // clear then set
+ words[firstWordIndex] &= ~(registerMask << bitRemainder);
+ words[firstWordIndex] |= (value << bitRemainder);
+ } else {/*register spans words*/
+ // clear then set each partial word
+ words[firstWordIndex] &= (1L << bitRemainder) - 1;
+ words[firstWordIndex] |= (value << bitRemainder);
+
+ words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder));
+ words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder));
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * @return a <code>LongIterator</code> for iterating starting at the register
+ * with index zero. This will never be <code>null</code>.
+ */
+ public LongIterator registerIterator() {
+ return new LongIterator() {
+ final int registerWidth = BitVector.this.registerWidth;
+ final long[] words = BitVector.this.words;
+ final long registerMask = BitVector.this.registerMask;
+
+ // register setup
+ long registerIndex = 0;
+ int wordIndex = 0;
+ int remainingWordBits = BITS_PER_WORD;
+ long word = words[wordIndex];
+
+ @Override public long next() {
+ long register;
+ if(remainingWordBits >= registerWidth) {
+ register = word & registerMask;
+
+ // shift to the next register
+ word >>>= registerWidth;
+ remainingWordBits -= registerWidth;
+ } else { /*insufficient bits remaining in current word*/
+ wordIndex++/*move to the next word*/;
+
+ register = (word | (words[wordIndex] << remainingWordBits)) & registerMask;
+
+ // shift to the next partial register (word)
+ word = words[wordIndex] >>> (registerWidth - remainingWordBits);
+ remainingWordBits += BITS_PER_WORD - registerWidth;
+ }
+ registerIndex++;
+ return register;
+ }
+
+ @Override public boolean hasNext() {
+ return registerIndex < count;
+ }
+ };
+ }
+
+ // ------------------------------------------------------------------------
+ // composite accessors
+ /**
+ * Sets the value of the specified index register if and only if the specified
+ * value is greater than the current value in the register. This is equivalent
+ * to but much more performant than:<p/>
+ *
+ * <pre>vector.setRegister(index, Math.max(vector.getRegister(index), value));</pre>
+ *
+ * @param registerIndex the index of the register whose value is to be set.
+ * This cannot be negative
+ * @param value the value to set in the register if and only if this value
+ * is greater than the current value in the register
+ * @return <code>true</code> if and only if the specified value is greater
+ * than or equal to the current register value. <code>false</code>
+ * otherwise.
+ * @see #getRegister(long)
+ * @see #setRegister(long, long)
+ * @see java.lang.Math#max(long, long)
+ */
+ // NOTE: if this changes then setRegister() must change
+ public boolean setMaxRegister(final long registerIndex, final long value) {
+ final long bitIndex = registerIndex * registerWidth;
+ final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
+ final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
+ final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
+
+ // NOTE: matches getRegister()
+ final long registerValue;
+ final long words[] = this.words/*for convenience/performance*/;
+ if(firstWordIndex == secondWordIndex)
+ registerValue = ((words[firstWordIndex] >>> bitRemainder) & registerMask);
+ else /*register spans words*/
+ registerValue = (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/
+ | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask;
+
+ // determine which is the larger and update as necessary
+ if(value > registerValue) {
+ // NOTE: matches setRegister()
+ if(firstWordIndex == secondWordIndex) {
+ // clear then set
+ words[firstWordIndex] &= ~(registerMask << bitRemainder);
+ words[firstWordIndex] |= (value << bitRemainder);
+ } else {/*register spans words*/
+ // clear then set each partial word
+ words[firstWordIndex] &= (1L << bitRemainder) - 1;
+ words[firstWordIndex] |= (value << bitRemainder);
+
+ words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder));
+ words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder));
+ }
+ } /* else -- the register value is greater (or equal) so nothing needs to be done */
+
+ return (value >= registerValue);
+ }
+
+ // ========================================================================
+ /**
+ * Fills this bit vector with the specified bit value. This can be used to
+ * clear the vector by specifying <code>0</code>.
+ *
+ * @param value the value to set all bits to (only the lowest bit is used)
+ */
+ public void fill(final long value) {
+ for(long i=0; i<count; i++) {
+ setRegister(i, value);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * Serializes the registers of the vector using the specified serializer.
+ *
+ * @param serializer the serializer to use. This cannot be <code>null</code>.
+ */
+ public void getRegisterContents(final IWordSerializer serializer) {
+ for(final LongIterator iter = registerIterator(); iter.hasNext();) {
+ serializer.writeWord(iter.next());
+ }
+ }
+
+ /**
+ * Creates a deep copy of this vector.
+ *
+ * @see java.lang.Object#clone()
+ */
+ @Override
+ public BitVector clone() {
+ final BitVector copy = new BitVector(registerWidth, count);
+ System.arraycopy(words, 0, copy.words, 0, words.length);
+ return copy;
+ }
+}
\ No newline at end of file