You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by om...@apache.org on 2015/12/14 22:36:18 UTC
[5/5] hive git commit: HIVE-12055. Move WriterImpl over to orc module.
HIVE-12055. Move WriterImpl over to orc module.
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/06e39ebe
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/06e39ebe
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/06e39ebe
Branch: refs/heads/master
Commit: 06e39ebe07d7854df669529e73f1c461f3c7d9d4
Parents: 49dc645
Author: Owen O'Malley <om...@apache.org>
Authored: Mon Dec 14 13:35:39 2015 -0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Mon Dec 14 13:35:39 2015 -0800
----------------------------------------------------------------------
.../apache/hive/common/util/BloomFilter.java | 309 --
.../org/apache/hive/common/util/Murmur3.java | 335 --
.../apache/hive/common/util/TestMurmur3.java | 224 --
orc/src/java/org/apache/orc/BloomFilterIO.java | 43 +
orc/src/java/org/apache/orc/OrcFile.java | 22 +
.../java/org/apache/orc/TypeDescription.java | 26 +-
.../java/org/apache/orc/impl/WriterImpl.java | 2912 +++++++++++++++
.../hive/ql/io/filters/BloomFilterIO.java | 44 -
.../apache/hadoop/hive/ql/io/orc/FileDump.java | 2 +-
.../hadoop/hive/ql/io/orc/JsonFileDump.java | 2 +-
.../apache/hadoop/hive/ql/io/orc/OrcFile.java | 30 +-
.../hadoop/hive/ql/io/orc/ReaderImpl.java | 15 +
.../hadoop/hive/ql/io/orc/RecordReaderImpl.java | 2 +-
.../apache/hadoop/hive/ql/io/orc/Writer.java | 2 +-
.../hadoop/hive/ql/io/orc/WriterImpl.java | 3394 ++----------------
.../hadoop/hive/ql/util/JavaDataModel.java | 335 --
.../hadoop/hive/ql/io/orc/TestFileDump.java | 25 +-
.../hive/ql/io/orc/TestNewIntegerEncoding.java | 2 +-
.../hadoop/hive/ql/io/orc/TestOrcFile.java | 9 +-
.../hive/ql/io/orc/TestOrcRawRecordMerger.java | 12 +-
.../hive/ql/io/orc/TestRecordReaderImpl.java | 2 +-
.../resources/orc-file-dump-bloomfilter.out | 2 +-
.../resources/orc-file-dump-bloomfilter2.out | 2 +-
.../orc-file-dump-dictionary-threshold.out | 2 +-
ql/src/test/resources/orc-file-dump.json | 2 +-
ql/src/test/resources/orc-file-dump.out | 2 +-
ql/src/test/resources/orc-file-has-null.out | 2 +-
.../results/clientpositive/orc_file_dump.q.out | 6 +-
.../results/clientpositive/orc_merge10.q.out | 4 +-
.../results/clientpositive/orc_merge11.q.out | 6 +-
.../clientpositive/tez/orc_merge10.q.out | 4 +-
.../clientpositive/tez/orc_merge11.q.out | 6 +-
.../hadoop/hive/ql/util/JavaDataModel.java | 335 ++
.../apache/hive/common/util/BloomFilter.java | 309 ++
.../org/apache/hive/common/util/Murmur3.java | 335 ++
.../apache/hive/common/util/TestMurmur3.java | 224 ++
36 files changed, 4489 insertions(+), 4499 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/common/src/java/org/apache/hive/common/util/BloomFilter.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hive/common/util/BloomFilter.java b/common/src/java/org/apache/hive/common/util/BloomFilter.java
deleted file mode 100644
index bb0b8f2..0000000
--- a/common/src/java/org/apache/hive/common/util/BloomFilter.java
+++ /dev/null
@@ -1,309 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-/**
- * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are
- * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of
- * bloom filter false positive (element not present in bloom filter but test() says true) are
- * possible but false negatives are not possible (if element is present then test() will never
- * say false). The false positive probability is configurable (default: 5%) depending on which
- * storage requirement may increase or decrease. Lower the false positive probability greater
- * is the space requirement.
- * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter.
- * During the creation of bloom filter expected number of entries must be specified. If the number
- * of insertions exceed the specified initial number of entries then false positive probability will
- * increase accordingly.
- *
- * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash
- * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash
- * collisions for specific sequence of repeating bytes. Check the following link for more info
- * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
- */
-public class BloomFilter {
- public static final double DEFAULT_FPP = 0.05;
- protected BitSet bitSet;
- protected int numBits;
- protected int numHashFunctions;
-
- public BloomFilter() {
- }
-
- public BloomFilter(long expectedEntries) {
- this(expectedEntries, DEFAULT_FPP);
- }
-
- public BloomFilter(long expectedEntries, double fpp) {
- checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
- checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
- int nb = optimalNumOfBits(expectedEntries, fpp);
- // make 'm' multiple of 64
- this.numBits = nb + (Long.SIZE - (nb % Long.SIZE));
- this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits);
- this.bitSet = new BitSet(numBits);
- }
-
- /**
- * A constructor to support rebuilding the BloomFilter from a serialized representation.
- * @param bits
- * @param numBits
- * @param numFuncs
- */
- public BloomFilter(List<Long> bits, int numBits, int numFuncs) {
- super();
- long[] copied = new long[bits.size()];
- for (int i = 0; i < bits.size(); i++) copied[i] = bits.get(i);
- bitSet = new BitSet(copied);
- this.numBits = numBits;
- numHashFunctions = numFuncs;
- }
-
- static int optimalNumOfHashFunctions(long n, long m) {
- return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
- }
-
- static int optimalNumOfBits(long n, double p) {
- return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
- }
-
- public void add(byte[] val) {
- if (val == null) {
- addBytes(val, -1, -1);
- } else {
- addBytes(val, 0, val.length);
- }
- }
-
- public void addBytes(byte[] val, int offset, int length) {
- // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter"
- // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively
- // implement a Bloom filter without any loss in the asymptotic false positive probability'
-
- // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned
- // in the above paper
- long hash64 = val == null ? Murmur3.NULL_HASHCODE :
- Murmur3.hash64(val, offset, length);
- addHash(hash64);
- }
-
- private void addHash(long hash64) {
- int hash1 = (int) hash64;
- int hash2 = (int) (hash64 >>> 32);
-
- for (int i = 1; i <= numHashFunctions; i++) {
- int combinedHash = hash1 + (i * hash2);
- // hashcode should be positive, flip all the bits if it's negative
- if (combinedHash < 0) {
- combinedHash = ~combinedHash;
- }
- int pos = combinedHash % numBits;
- bitSet.set(pos);
- }
- }
-
- public void addString(String val) {
- if (val == null) {
- add(null);
- } else {
- add(val.getBytes());
- }
- }
-
- public void addLong(long val) {
- addHash(getLongHash(val));
- }
-
- public void addDouble(double val) {
- addLong(Double.doubleToLongBits(val));
- }
-
- public boolean test(byte[] val) {
- if (val == null) {
- return testBytes(val, -1, -1);
- }
- return testBytes(val, 0, val.length);
- }
-
- public boolean testBytes(byte[] val, int offset, int length) {
- long hash64 = val == null ? Murmur3.NULL_HASHCODE :
- Murmur3.hash64(val, offset, length);
- return testHash(hash64);
- }
-
- private boolean testHash(long hash64) {
- int hash1 = (int) hash64;
- int hash2 = (int) (hash64 >>> 32);
-
- for (int i = 1; i <= numHashFunctions; i++) {
- int combinedHash = hash1 + (i * hash2);
- // hashcode should be positive, flip all the bits if it's negative
- if (combinedHash < 0) {
- combinedHash = ~combinedHash;
- }
- int pos = combinedHash % numBits;
- if (!bitSet.get(pos)) {
- return false;
- }
- }
- return true;
- }
-
- public boolean testString(String val) {
- if (val == null) {
- return test(null);
- } else {
- return test(val.getBytes());
- }
- }
-
- public boolean testLong(long val) {
- return testHash(getLongHash(val));
- }
-
- // Thomas Wang's integer hash function
- // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
- private long getLongHash(long key) {
- key = (~key) + (key << 21); // key = (key << 21) - key - 1;
- key = key ^ (key >> 24);
- key = (key + (key << 3)) + (key << 8); // key * 265
- key = key ^ (key >> 14);
- key = (key + (key << 2)) + (key << 4); // key * 21
- key = key ^ (key >> 28);
- key = key + (key << 31);
- return key;
- }
-
- public boolean testDouble(double val) {
- return testLong(Double.doubleToLongBits(val));
- }
-
- public long sizeInBytes() {
- return getBitSize() / 8;
- }
-
- public int getBitSize() {
- return bitSet.getData().length * Long.SIZE;
- }
-
- public int getNumHashFunctions() {
- return numHashFunctions;
- }
-
- public long[] getBitSet() {
- return bitSet.getData();
- }
-
- @Override
- public String toString() {
- return "m: " + numBits + " k: " + numHashFunctions;
- }
-
- /**
- * Merge the specified bloom filter with current bloom filter.
- *
- * @param that - bloom filter to merge
- */
- public void merge(BloomFilter that) {
- if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) {
- this.bitSet.putAll(that.bitSet);
- } else {
- throw new IllegalArgumentException("BloomFilters are not compatible for merging." +
- " this - " + this.toString() + " that - " + that.toString());
- }
- }
-
- public void reset() {
- this.bitSet.clear();
- }
-
- /**
- * Bare metal bit set implementation. For performance reasons, this implementation does not check
- * for index bounds nor expand the bit set size if the specified index is greater than the size.
- */
- public class BitSet {
- private final long[] data;
-
- public BitSet(long bits) {
- this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
- }
-
- /**
- * Deserialize long array as bit set.
- *
- * @param data - bit array
- */
- public BitSet(long[] data) {
- assert data.length > 0 : "data length is zero!";
- this.data = data;
- }
-
- /**
- * Sets the bit at specified index.
- *
- * @param index - position
- */
- public void set(int index) {
- data[index >>> 6] |= (1L << index);
- }
-
- /**
- * Returns true if the bit is set in the specified index.
- *
- * @param index - position
- * @return - value at the bit position
- */
- public boolean get(int index) {
- return (data[index >>> 6] & (1L << index)) != 0;
- }
-
- /**
- * Number of bits
- */
- public long bitSize() {
- return (long) data.length * Long.SIZE;
- }
-
- public long[] getData() {
- return data;
- }
-
- /**
- * Combines the two BitArrays using bitwise OR.
- */
- public void putAll(BitSet array) {
- assert data.length == array.data.length :
- "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")";
- for (int i = 0; i < data.length; i++) {
- data[i] |= array.data[i];
- }
- }
-
- /**
- * Clear the bit set.
- */
- public void clear() {
- Arrays.fill(data, 0);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/common/src/java/org/apache/hive/common/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hive/common/util/Murmur3.java b/common/src/java/org/apache/hive/common/util/Murmur3.java
deleted file mode 100644
index 88c3514..0000000
--- a/common/src/java/org/apache/hive/common/util/Murmur3.java
+++ /dev/null
@@ -1,335 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-/**
- * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms.
- *
- * Murmur3 32 and 128 bit variants.
- * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
- * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
- *
- * This is a public domain code with no copyrights.
- * From homepage of MurmurHash (https://code.google.com/p/smhasher/),
- * "All MurmurHash versions are public domain software, and the author disclaims all copyright
- * to their code."
- */
-public class Murmur3 {
- // from 64-bit linear congruential generator
- public static final long NULL_HASHCODE = 2862933555777941757L;
-
- // Constants for 32 bit variant
- private static final int C1_32 = 0xcc9e2d51;
- private static final int C2_32 = 0x1b873593;
- private static final int R1_32 = 15;
- private static final int R2_32 = 13;
- private static final int M_32 = 5;
- private static final int N_32 = 0xe6546b64;
-
- // Constants for 128 bit variant
- private static final long C1 = 0x87c37b91114253d5L;
- private static final long C2 = 0x4cf5ad432745937fL;
- private static final int R1 = 31;
- private static final int R2 = 27;
- private static final int R3 = 33;
- private static final int M = 5;
- private static final int N1 = 0x52dce729;
- private static final int N2 = 0x38495ab5;
-
- private static final int DEFAULT_SEED = 104729;
-
- /**
- * Murmur3 32-bit variant.
- *
- * @param data - input byte array
- * @return - hashcode
- */
- public static int hash32(byte[] data) {
- return hash32(data, data.length, DEFAULT_SEED);
- }
-
- /**
- * Murmur3 32-bit variant.
- *
- * @param data - input byte array
- * @param length - length of array
- * @param seed - seed. (default 0)
- * @return - hashcode
- */
- public static int hash32(byte[] data, int length, int seed) {
- int hash = seed;
- final int nblocks = length >> 2;
-
- // body
- for (int i = 0; i < nblocks; i++) {
- int i_4 = i << 2;
- int k = (data[i_4] & 0xff)
- | ((data[i_4 + 1] & 0xff) << 8)
- | ((data[i_4 + 2] & 0xff) << 16)
- | ((data[i_4 + 3] & 0xff) << 24);
-
- // mix functions
- k *= C1_32;
- k = Integer.rotateLeft(k, R1_32);
- k *= C2_32;
- hash ^= k;
- hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
- }
-
- // tail
- int idx = nblocks << 2;
- int k1 = 0;
- switch (length - idx) {
- case 3:
- k1 ^= data[idx + 2] << 16;
- case 2:
- k1 ^= data[idx + 1] << 8;
- case 1:
- k1 ^= data[idx];
-
- // mix functions
- k1 *= C1_32;
- k1 = Integer.rotateLeft(k1, R1_32);
- k1 *= C2_32;
- hash ^= k1;
- }
-
- // finalization
- hash ^= length;
- hash ^= (hash >>> 16);
- hash *= 0x85ebca6b;
- hash ^= (hash >>> 13);
- hash *= 0xc2b2ae35;
- hash ^= (hash >>> 16);
-
- return hash;
- }
-
- /**
- * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
- *
- * @param data - input byte array
- * @return - hashcode
- */
- public static long hash64(byte[] data) {
- return hash64(data, 0, data.length, DEFAULT_SEED);
- }
-
- public static long hash64(byte[] data, int offset, int length) {
- return hash64(data, offset, length, DEFAULT_SEED);
- }
-
- /**
- * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
- *
- * @param data - input byte array
- * @param length - length of array
- * @param seed - seed. (default is 0)
- * @return - hashcode
- */
- public static long hash64(byte[] data, int offset, int length, int seed) {
- long hash = seed;
- final int nblocks = length >> 3;
-
- // body
- for (int i = 0; i < nblocks; i++) {
- final int i8 = i << 3;
- long k = ((long) data[offset + i8] & 0xff)
- | (((long) data[offset + i8 + 1] & 0xff) << 8)
- | (((long) data[offset + i8 + 2] & 0xff) << 16)
- | (((long) data[offset + i8 + 3] & 0xff) << 24)
- | (((long) data[offset + i8 + 4] & 0xff) << 32)
- | (((long) data[offset + i8 + 5] & 0xff) << 40)
- | (((long) data[offset + i8 + 6] & 0xff) << 48)
- | (((long) data[offset + i8 + 7] & 0xff) << 56);
-
- // mix functions
- k *= C1;
- k = Long.rotateLeft(k, R1);
- k *= C2;
- hash ^= k;
- hash = Long.rotateLeft(hash, R2) * M + N1;
- }
-
- // tail
- long k1 = 0;
- int tailStart = nblocks << 3;
- switch (length - tailStart) {
- case 7:
- k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
- case 6:
- k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
- case 5:
- k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
- case 4:
- k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
- case 3:
- k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
- case 2:
- k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
- case 1:
- k1 ^= ((long) data[offset + tailStart] & 0xff);
- k1 *= C1;
- k1 = Long.rotateLeft(k1, R1);
- k1 *= C2;
- hash ^= k1;
- }
-
- // finalization
- hash ^= length;
- hash = fmix64(hash);
-
- return hash;
- }
-
- /**
- * Murmur3 128-bit variant.
- *
- * @param data - input byte array
- * @return - hashcode (2 longs)
- */
- public static long[] hash128(byte[] data) {
- return hash128(data, 0, data.length, DEFAULT_SEED);
- }
-
- /**
- * Murmur3 128-bit variant.
- *
- * @param data - input byte array
- * @param offset - the first element of array
- * @param length - length of array
- * @param seed - seed. (default is 0)
- * @return - hashcode (2 longs)
- */
- public static long[] hash128(byte[] data, int offset, int length, int seed) {
- long h1 = seed;
- long h2 = seed;
- final int nblocks = length >> 4;
-
- // body
- for (int i = 0; i < nblocks; i++) {
- final int i16 = i << 4;
- long k1 = ((long) data[offset + i16] & 0xff)
- | (((long) data[offset + i16 + 1] & 0xff) << 8)
- | (((long) data[offset + i16 + 2] & 0xff) << 16)
- | (((long) data[offset + i16 + 3] & 0xff) << 24)
- | (((long) data[offset + i16 + 4] & 0xff) << 32)
- | (((long) data[offset + i16 + 5] & 0xff) << 40)
- | (((long) data[offset + i16 + 6] & 0xff) << 48)
- | (((long) data[offset + i16 + 7] & 0xff) << 56);
-
- long k2 = ((long) data[offset + i16 + 8] & 0xff)
- | (((long) data[offset + i16 + 9] & 0xff) << 8)
- | (((long) data[offset + i16 + 10] & 0xff) << 16)
- | (((long) data[offset + i16 + 11] & 0xff) << 24)
- | (((long) data[offset + i16 + 12] & 0xff) << 32)
- | (((long) data[offset + i16 + 13] & 0xff) << 40)
- | (((long) data[offset + i16 + 14] & 0xff) << 48)
- | (((long) data[offset + i16 + 15] & 0xff) << 56);
-
- // mix functions for k1
- k1 *= C1;
- k1 = Long.rotateLeft(k1, R1);
- k1 *= C2;
- h1 ^= k1;
- h1 = Long.rotateLeft(h1, R2);
- h1 += h2;
- h1 = h1 * M + N1;
-
- // mix functions for k2
- k2 *= C2;
- k2 = Long.rotateLeft(k2, R3);
- k2 *= C1;
- h2 ^= k2;
- h2 = Long.rotateLeft(h2, R1);
- h2 += h1;
- h2 = h2 * M + N2;
- }
-
- // tail
- long k1 = 0;
- long k2 = 0;
- int tailStart = nblocks << 4;
- switch (length - tailStart) {
- case 15:
- k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
- case 14:
- k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
- case 13:
- k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
- case 12:
- k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
- case 11:
- k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
- case 10:
- k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
- case 9:
- k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
- k2 *= C2;
- k2 = Long.rotateLeft(k2, R3);
- k2 *= C1;
- h2 ^= k2;
-
- case 8:
- k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
- case 7:
- k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
- case 6:
- k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
- case 5:
- k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
- case 4:
- k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
- case 3:
- k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
- case 2:
- k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
- case 1:
- k1 ^= (long) (data[offset + tailStart] & 0xff);
- k1 *= C1;
- k1 = Long.rotateLeft(k1, R1);
- k1 *= C2;
- h1 ^= k1;
- }
-
- // finalization
- h1 ^= length;
- h2 ^= length;
-
- h1 += h2;
- h2 += h1;
-
- h1 = fmix64(h1);
- h2 = fmix64(h2);
-
- h1 += h2;
- h2 += h1;
-
- return new long[]{h1, h2};
- }
-
- private static long fmix64(long h) {
- h ^= (h >>> 33);
- h *= 0xff51afd7ed558ccdL;
- h ^= (h >>> 33);
- h *= 0xc4ceb9fe1a85ec53L;
- h ^= (h >>> 33);
- return h;
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/common/src/test/org/apache/hive/common/util/TestMurmur3.java
----------------------------------------------------------------------
diff --git a/common/src/test/org/apache/hive/common/util/TestMurmur3.java b/common/src/test/org/apache/hive/common/util/TestMurmur3.java
deleted file mode 100644
index 5facc7c..0000000
--- a/common/src/test/org/apache/hive/common/util/TestMurmur3.java
+++ /dev/null
@@ -1,224 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import static org.junit.Assert.assertEquals;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-import org.junit.Test;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.util.Arrays;
-import java.util.Random;
-
-/**
- * Tests for Murmur3 variants.
- */
-public class TestMurmur3 {
-
- @Test
- public void testHashCodesM3_32_string() {
- String key = "test";
- int seed = 123;
- HashFunction hf = Hashing.murmur3_32(seed);
- int hc1 = hf.hashBytes(key.getBytes()).asInt();
- int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
- assertEquals(hc1, hc2);
-
- key = "testkey";
- hc1 = hf.hashBytes(key.getBytes()).asInt();
- hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
- assertEquals(hc1, hc2);
- }
-
- @Test
- public void testHashCodesM3_32_ints() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_32(seed);
- for (int i = 0; i < 1000; i++) {
- int val = rand.nextInt();
- byte[] data = ByteBuffer.allocate(4).putInt(val).array();
- int hc1 = hf.hashBytes(data).asInt();
- int hc2 = Murmur3.hash32(data, data.length, seed);
- assertEquals(hc1, hc2);
- }
- }
-
- @Test
- public void testHashCodesM3_32_longs() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_32(seed);
- for (int i = 0; i < 1000; i++) {
- long val = rand.nextLong();
- byte[] data = ByteBuffer.allocate(8).putLong(val).array();
- int hc1 = hf.hashBytes(data).asInt();
- int hc2 = Murmur3.hash32(data, data.length, seed);
- assertEquals(hc1, hc2);
- }
- }
-
- @Test
- public void testHashCodesM3_32_double() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_32(seed);
- for (int i = 0; i < 1000; i++) {
- double val = rand.nextDouble();
- byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
- int hc1 = hf.hashBytes(data).asInt();
- int hc2 = Murmur3.hash32(data, data.length, seed);
- assertEquals(hc1, hc2);
- }
- }
-
- @Test
- public void testHashCodesM3_128_string() {
- String key = "test";
- int seed = 123;
- HashFunction hf = Hashing.murmur3_128(seed);
- // guava stores the hashcodes in little endian order
- ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(key.getBytes()).asBytes());
- buf.flip();
- long gl1 = buf.getLong();
- long gl2 = buf.getLong(8);
- long[] hc = Murmur3.hash128(key.getBytes(), 0, key.getBytes().length, seed);
- long m1 = hc[0];
- long m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
-
- key = "testkey128_testkey128";
- buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(key.getBytes()).asBytes());
- buf.flip();
- gl1 = buf.getLong();
- gl2 = buf.getLong(8);
- byte[] keyBytes = key.getBytes();
- hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed);
- m1 = hc[0];
- m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
-
- byte[] offsetKeyBytes = new byte[keyBytes.length + 35];
- Arrays.fill(offsetKeyBytes, (byte) -1);
- System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length);
- hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed);
- assertEquals(gl1, hc[0]);
- assertEquals(gl2, hc[1]);
- }
-
- @Test
- public void testHashCodeM3_64() {
- byte[] origin = ("It was the best of times, it was the worst of times," +
- " it was the age of wisdom, it was the age of foolishness," +
- " it was the epoch of belief, it was the epoch of incredulity," +
- " it was the season of Light, it was the season of Darkness," +
- " it was the spring of hope, it was the winter of despair," +
- " we had everything before us, we had nothing before us," +
- " we were all going direct to Heaven," +
- " we were all going direct the other way.").getBytes();
- long hash = Murmur3.hash64(origin, 0, origin.length);
- assertEquals(305830725663368540L, hash);
-
- byte[] originOffset = new byte[origin.length + 150];
- Arrays.fill(originOffset, (byte) 123);
- System.arraycopy(origin, 0, originOffset, 150, origin.length);
- hash = Murmur3.hash64(originOffset, 150, origin.length);
- assertEquals(305830725663368540L, hash);
- }
-
- @Test
- public void testHashCodesM3_128_ints() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_128(seed);
- for (int i = 0; i < 1000; i++) {
- int val = rand.nextInt();
- byte[] data = ByteBuffer.allocate(4).putInt(val).array();
- // guava stores the hashcodes in little endian order
- ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(data).asBytes());
- buf.flip();
- long gl1 = buf.getLong();
- long gl2 = buf.getLong(8);
- long[] hc = Murmur3.hash128(data, 0, data.length, seed);
- long m1 = hc[0];
- long m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
-
- byte[] offsetData = new byte[data.length + 50];
- System.arraycopy(data, 0, offsetData, 50, data.length);
- hc = Murmur3.hash128(offsetData, 50, data.length, seed);
- assertEquals(gl1, hc[0]);
- assertEquals(gl2, hc[1]);
- }
- }
-
- @Test
- public void testHashCodesM3_128_longs() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_128(seed);
- for (int i = 0; i < 1000; i++) {
- long val = rand.nextLong();
- byte[] data = ByteBuffer.allocate(8).putLong(val).array();
- // guava stores the hashcodes in little endian order
- ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(data).asBytes());
- buf.flip();
- long gl1 = buf.getLong();
- long gl2 = buf.getLong(8);
- long[] hc = Murmur3.hash128(data, 0, data.length, seed);
- long m1 = hc[0];
- long m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
- }
- }
-
- @Test
- public void testHashCodesM3_128_double() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_128(seed);
- for (int i = 0; i < 1000; i++) {
- double val = rand.nextDouble();
- byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
- // guava stores the hashcodes in little endian order
- ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(data).asBytes());
- buf.flip();
- long gl1 = buf.getLong();
- long gl2 = buf.getLong(8);
- long[] hc = Murmur3.hash128(data, 0, data.length, seed);
- long m1 = hc[0];
- long m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/orc/src/java/org/apache/orc/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/BloomFilterIO.java b/orc/src/java/org/apache/orc/BloomFilterIO.java
new file mode 100644
index 0000000..1406266
--- /dev/null
+++ b/orc/src/java/org/apache/orc/BloomFilterIO.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.hive.common.util.BloomFilter;
+
+import com.google.common.primitives.Longs;
+
+public class BloomFilterIO extends BloomFilter {
+
+ public BloomFilterIO(long expectedEntries) {
+ super(expectedEntries, DEFAULT_FPP);
+ }
+
+ public BloomFilterIO(long expectedEntries, double fpp) {
+ super(expectedEntries, fpp);
+ }
+
+/**
+ * Initializes the BloomFilter from the given Orc BloomFilter
+ */
+ public BloomFilterIO(OrcProto.BloomFilter bloomFilter) {
+ this.bitSet = new BitSet(Longs.toArray(bloomFilter.getBitsetList()));
+ this.numHashFunctions = bloomFilter.getNumHashFunctions();
+ this.numBits = (int) this.bitSet.bitSize();
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/orc/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/OrcFile.java b/orc/src/java/org/apache/orc/OrcFile.java
index 9ea0b52..98226f9 100644
--- a/orc/src/java/org/apache/orc/OrcFile.java
+++ b/orc/src/java/org/apache/orc/OrcFile.java
@@ -23,7 +23,9 @@ import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
import org.apache.orc.impl.MemoryManager;
+import org.apache.orc.impl.WriterImpl;
/**
* Contains factory methods to read or write ORC files.
@@ -102,6 +104,8 @@ public class OrcFile {
ORIGINAL(0),
HIVE_8732(1), // corrupted stripe/file maximum column statistics
HIVE_4243(2), // use real column names from Hive tables
+ HIVE_12055(3), // vectorized writer
+
// Don't use any magic numbers here except for the below:
FUTURE(Integer.MAX_VALUE); // a version from a future writer
@@ -138,6 +142,7 @@ public class OrcFile {
return values[val];
}
}
+ public static final WriterVersion CURRENT_WRITER = WriterVersion.HIVE_12055;
public enum EncodingStrategy {
SPEED, COMPRESSION
@@ -511,4 +516,21 @@ public class OrcFile {
return memoryManager.get();
}
+ /**
+ * Create an ORC file writer. This is the public interface for creating
+ * writers going forward and new options will only be added to this method.
+ * @param path filename to write to
+ * @param opts the options; their file system is used when set, otherwise it is resolved from path
+ * @return a new ORC file writer
+ * @throws IOException propagated from the file system lookup or the writer construction
+ */
+ public static Writer createWriter(Path path,
+ WriterOptions opts
+ ) throws IOException {
+ FileSystem fs = opts.getFileSystem() == null ?
+ path.getFileSystem(opts.getConfiguration()) : opts.getFileSystem();
+
+ return new WriterImpl(fs, path, opts);
+ }
+
}
http://git-wip-us.apache.org/repos/asf/hive/blob/06e39ebe/orc/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/TypeDescription.java b/orc/src/java/org/apache/orc/TypeDescription.java
index fc945e4..f97a113 100644
--- a/orc/src/java/org/apache/orc/TypeDescription.java
+++ b/orc/src/java/org/apache/orc/TypeDescription.java
@@ -275,7 +275,7 @@ public class TypeDescription {
return maxId;
}
- private ColumnVector createColumn() {
+ private ColumnVector createColumn(int maxSize) {
switch (category) {
case BOOLEAN:
case BYTE:
@@ -298,7 +298,7 @@ public class TypeDescription {
case STRUCT: {
ColumnVector[] fieldVector = new ColumnVector[children.size()];
for(int i=0; i < fieldVector.length; ++i) {
- fieldVector[i] = children.get(i).createColumn();
+ fieldVector[i] = children.get(i).createColumn(maxSize);
}
return new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
fieldVector);
@@ -306,38 +306,42 @@ public class TypeDescription {
case UNION: {
ColumnVector[] fieldVector = new ColumnVector[children.size()];
for(int i=0; i < fieldVector.length; ++i) {
- fieldVector[i] = children.get(i).createColumn();
+ fieldVector[i] = children.get(i).createColumn(maxSize);
}
return new UnionColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
fieldVector);
}
case LIST:
return new ListColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
- children.get(0).createColumn());
+ children.get(0).createColumn(maxSize));
case MAP:
return new MapColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
- children.get(0).createColumn(), children.get(1).createColumn());
+ children.get(0).createColumn(maxSize),
+ children.get(1).createColumn(maxSize));
default:
throw new IllegalArgumentException("Unknown type " + category);
}
}
- public VectorizedRowBatch createRowBatch() {
+ public VectorizedRowBatch createRowBatch(int maxSize) { // maxSize is forwarded to the batch constructor and to createColumn
VectorizedRowBatch result;
if (category == Category.STRUCT) {
- result = new VectorizedRowBatch(children.size(),
- VectorizedRowBatch.DEFAULT_SIZE);
+ result = new VectorizedRowBatch(children.size(), maxSize); // struct type: one batch column per child
for(int i=0; i < result.cols.length; ++i) {
- result.cols[i] = children.get(i).createColumn();
+ result.cols[i] = children.get(i).createColumn(maxSize); // NOTE(review): createColumn's STRUCT/UNION/LIST/MAP cases still pass DEFAULT_SIZE, not maxSize - confirm
}
} else {
- result = new VectorizedRowBatch(1, VectorizedRowBatch.DEFAULT_SIZE);
- result.cols[0] = createColumn();
+ result = new VectorizedRowBatch(1, maxSize); // non-struct type: the batch has a single column
+ result.cols[0] = createColumn(maxSize);
}
result.reset();
return result;
}
+ public VectorizedRowBatch createRowBatch() { // no-argument overload kept for existing callers
+ return createRowBatch(VectorizedRowBatch.DEFAULT_SIZE); // same as createRowBatch(int) with the default size
+ }
+
/**
* Get the kind of this type.
* @return get the category for this type.