You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/09/22 19:21:53 UTC
[1/4] orc git commit: ORC-101 Correct bloom filters for strings and
decimals to use utf8 encoding.
Repository: orc
Updated Branches:
refs/heads/master 7118e968b -> 604dcc801
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java b/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
deleted file mode 100644
index 88c3514..0000000
--- a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
+++ /dev/null
@@ -1,335 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-/**
- * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms.
- *
- * Murmur3 32 and 128 bit variants.
- * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
- * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
- *
- * This is a public domain code with no copyrights.
- * From homepage of MurmurHash (https://code.google.com/p/smhasher/),
- * "All MurmurHash versions are public domain software, and the author disclaims all copyright
- * to their code."
- */
-public class Murmur3 {
- // from 64-bit linear congruential generator
- public static final long NULL_HASHCODE = 2862933555777941757L;
-
- // Constants for 32 bit variant
- private static final int C1_32 = 0xcc9e2d51;
- private static final int C2_32 = 0x1b873593;
- private static final int R1_32 = 15;
- private static final int R2_32 = 13;
- private static final int M_32 = 5;
- private static final int N_32 = 0xe6546b64;
-
- // Constants for 128 bit variant
- private static final long C1 = 0x87c37b91114253d5L;
- private static final long C2 = 0x4cf5ad432745937fL;
- private static final int R1 = 31;
- private static final int R2 = 27;
- private static final int R3 = 33;
- private static final int M = 5;
- private static final int N1 = 0x52dce729;
- private static final int N2 = 0x38495ab5;
-
- private static final int DEFAULT_SEED = 104729;
-
- /**
- * Murmur3 32-bit variant.
- *
- * @param data - input byte array
- * @return - hashcode
- */
- public static int hash32(byte[] data) {
- return hash32(data, data.length, DEFAULT_SEED);
- }
-
- /**
- * Murmur3 32-bit variant.
- *
- * @param data - input byte array
- * @param length - length of array
- * @param seed - seed. (default 0)
- * @return - hashcode
- */
- public static int hash32(byte[] data, int length, int seed) {
- int hash = seed;
- final int nblocks = length >> 2;
-
- // body
- for (int i = 0; i < nblocks; i++) {
- int i_4 = i << 2;
- int k = (data[i_4] & 0xff)
- | ((data[i_4 + 1] & 0xff) << 8)
- | ((data[i_4 + 2] & 0xff) << 16)
- | ((data[i_4 + 3] & 0xff) << 24);
-
- // mix functions
- k *= C1_32;
- k = Integer.rotateLeft(k, R1_32);
- k *= C2_32;
- hash ^= k;
- hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
- }
-
- // tail
- int idx = nblocks << 2;
- int k1 = 0;
- switch (length - idx) {
- case 3:
- k1 ^= data[idx + 2] << 16;
- case 2:
- k1 ^= data[idx + 1] << 8;
- case 1:
- k1 ^= data[idx];
-
- // mix functions
- k1 *= C1_32;
- k1 = Integer.rotateLeft(k1, R1_32);
- k1 *= C2_32;
- hash ^= k1;
- }
-
- // finalization
- hash ^= length;
- hash ^= (hash >>> 16);
- hash *= 0x85ebca6b;
- hash ^= (hash >>> 13);
- hash *= 0xc2b2ae35;
- hash ^= (hash >>> 16);
-
- return hash;
- }
-
- /**
- * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
- *
- * @param data - input byte array
- * @return - hashcode
- */
- public static long hash64(byte[] data) {
- return hash64(data, 0, data.length, DEFAULT_SEED);
- }
-
- public static long hash64(byte[] data, int offset, int length) {
- return hash64(data, offset, length, DEFAULT_SEED);
- }
-
- /**
- * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
- *
- * @param data - input byte array
- * @param length - length of array
- * @param seed - seed. (default is 0)
- * @return - hashcode
- */
- public static long hash64(byte[] data, int offset, int length, int seed) {
- long hash = seed;
- final int nblocks = length >> 3;
-
- // body
- for (int i = 0; i < nblocks; i++) {
- final int i8 = i << 3;
- long k = ((long) data[offset + i8] & 0xff)
- | (((long) data[offset + i8 + 1] & 0xff) << 8)
- | (((long) data[offset + i8 + 2] & 0xff) << 16)
- | (((long) data[offset + i8 + 3] & 0xff) << 24)
- | (((long) data[offset + i8 + 4] & 0xff) << 32)
- | (((long) data[offset + i8 + 5] & 0xff) << 40)
- | (((long) data[offset + i8 + 6] & 0xff) << 48)
- | (((long) data[offset + i8 + 7] & 0xff) << 56);
-
- // mix functions
- k *= C1;
- k = Long.rotateLeft(k, R1);
- k *= C2;
- hash ^= k;
- hash = Long.rotateLeft(hash, R2) * M + N1;
- }
-
- // tail
- long k1 = 0;
- int tailStart = nblocks << 3;
- switch (length - tailStart) {
- case 7:
- k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
- case 6:
- k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
- case 5:
- k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
- case 4:
- k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
- case 3:
- k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
- case 2:
- k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
- case 1:
- k1 ^= ((long) data[offset + tailStart] & 0xff);
- k1 *= C1;
- k1 = Long.rotateLeft(k1, R1);
- k1 *= C2;
- hash ^= k1;
- }
-
- // finalization
- hash ^= length;
- hash = fmix64(hash);
-
- return hash;
- }
-
- /**
- * Murmur3 128-bit variant.
- *
- * @param data - input byte array
- * @return - hashcode (2 longs)
- */
- public static long[] hash128(byte[] data) {
- return hash128(data, 0, data.length, DEFAULT_SEED);
- }
-
- /**
- * Murmur3 128-bit variant.
- *
- * @param data - input byte array
- * @param offset - the first element of array
- * @param length - length of array
- * @param seed - seed. (default is 0)
- * @return - hashcode (2 longs)
- */
- public static long[] hash128(byte[] data, int offset, int length, int seed) {
- long h1 = seed;
- long h2 = seed;
- final int nblocks = length >> 4;
-
- // body
- for (int i = 0; i < nblocks; i++) {
- final int i16 = i << 4;
- long k1 = ((long) data[offset + i16] & 0xff)
- | (((long) data[offset + i16 + 1] & 0xff) << 8)
- | (((long) data[offset + i16 + 2] & 0xff) << 16)
- | (((long) data[offset + i16 + 3] & 0xff) << 24)
- | (((long) data[offset + i16 + 4] & 0xff) << 32)
- | (((long) data[offset + i16 + 5] & 0xff) << 40)
- | (((long) data[offset + i16 + 6] & 0xff) << 48)
- | (((long) data[offset + i16 + 7] & 0xff) << 56);
-
- long k2 = ((long) data[offset + i16 + 8] & 0xff)
- | (((long) data[offset + i16 + 9] & 0xff) << 8)
- | (((long) data[offset + i16 + 10] & 0xff) << 16)
- | (((long) data[offset + i16 + 11] & 0xff) << 24)
- | (((long) data[offset + i16 + 12] & 0xff) << 32)
- | (((long) data[offset + i16 + 13] & 0xff) << 40)
- | (((long) data[offset + i16 + 14] & 0xff) << 48)
- | (((long) data[offset + i16 + 15] & 0xff) << 56);
-
- // mix functions for k1
- k1 *= C1;
- k1 = Long.rotateLeft(k1, R1);
- k1 *= C2;
- h1 ^= k1;
- h1 = Long.rotateLeft(h1, R2);
- h1 += h2;
- h1 = h1 * M + N1;
-
- // mix functions for k2
- k2 *= C2;
- k2 = Long.rotateLeft(k2, R3);
- k2 *= C1;
- h2 ^= k2;
- h2 = Long.rotateLeft(h2, R1);
- h2 += h1;
- h2 = h2 * M + N2;
- }
-
- // tail
- long k1 = 0;
- long k2 = 0;
- int tailStart = nblocks << 4;
- switch (length - tailStart) {
- case 15:
- k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
- case 14:
- k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
- case 13:
- k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
- case 12:
- k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
- case 11:
- k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
- case 10:
- k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
- case 9:
- k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
- k2 *= C2;
- k2 = Long.rotateLeft(k2, R3);
- k2 *= C1;
- h2 ^= k2;
-
- case 8:
- k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
- case 7:
- k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
- case 6:
- k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
- case 5:
- k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
- case 4:
- k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
- case 3:
- k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
- case 2:
- k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
- case 1:
- k1 ^= (long) (data[offset + tailStart] & 0xff);
- k1 *= C1;
- k1 = Long.rotateLeft(k1, R1);
- k1 *= C2;
- h1 ^= k1;
- }
-
- // finalization
- h1 ^= length;
- h2 ^= length;
-
- h1 += h2;
- h2 += h1;
-
- h1 = fmix64(h1);
- h2 = fmix64(h2);
-
- h1 += h2;
- h2 += h1;
-
- return new long[]{h1, h2};
- }
-
- private static long fmix64(long h) {
- h ^= (h >>> 33);
- h *= 0xff51afd7ed558ccdL;
- h ^= (h >>> 33);
- h *= 0xc4ceb9fe1a85ec53L;
- h ^= (h >>> 33);
- return h;
- }
-}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/orc/util/Murmur3.java b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
new file mode 100644
index 0000000..838681c
--- /dev/null
+++ b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
@@ -0,0 +1,335 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+/**
+ * Murmur3 is the successor to Murmur2, a family of fast non-cryptographic hash algorithms.
+ *
+ * Murmur3 32 and 128 bit variants.
+ * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
+ * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
+ *
+ * This is public domain code with no copyright.
+ * From homepage of MurmurHash (https://code.google.com/p/smhasher/),
+ * "All MurmurHash versions are public domain software, and the author disclaims all copyright
+ * to their code."
+ */
+public class Murmur3 {
+ // from 64-bit linear congruential generator
+ public static final long NULL_HASHCODE = 2862933555777941757L;
+
+ // Constants for 32 bit variant
+ private static final int C1_32 = 0xcc9e2d51;
+ private static final int C2_32 = 0x1b873593;
+ private static final int R1_32 = 15;
+ private static final int R2_32 = 13;
+ private static final int M_32 = 5;
+ private static final int N_32 = 0xe6546b64;
+
+ // Constants for 128 bit variant
+ private static final long C1 = 0x87c37b91114253d5L;
+ private static final long C2 = 0x4cf5ad432745937fL;
+ private static final int R1 = 31;
+ private static final int R2 = 27;
+ private static final int R3 = 33;
+ private static final int M = 5;
+ private static final int N1 = 0x52dce729;
+ private static final int N2 = 0x38495ab5;
+
+ private static final int DEFAULT_SEED = 104729;
+
+ /**
+ * Murmur3 32-bit variant.
+ *
+ * @param data - input byte array
+ * @return - hashcode
+ */
+ public static int hash32(byte[] data) {
+ return hash32(data, data.length, DEFAULT_SEED);
+ }
+
+ /**
+ * Murmur3 32-bit variant.
+ *
+ * @param data - input byte array
+ * @param length - number of bytes to hash, starting at index 0
+ * @param seed - seed. (the one-argument overload passes DEFAULT_SEED)
+ * @return - hashcode
+ */
+ public static int hash32(byte[] data, int length, int seed) {
+ int hash = seed;
+ final int nblocks = length >> 2;
+
+ // body
+ for (int i = 0; i < nblocks; i++) {
+ int i_4 = i << 2;
+ int k = (data[i_4] & 0xff)
+ | ((data[i_4 + 1] & 0xff) << 8)
+ | ((data[i_4 + 2] & 0xff) << 16)
+ | ((data[i_4 + 3] & 0xff) << 24);
+
+ // mix functions
+ k *= C1_32;
+ k = Integer.rotateLeft(k, R1_32);
+ k *= C2_32;
+ hash ^= k;
+ hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+ }
+
+ // tail
+ int idx = nblocks << 2;
+ int k1 = 0;
+ switch (length - idx) {
+ case 3:
+ k1 ^= data[idx + 2] << 16;
+ case 2:
+ k1 ^= data[idx + 1] << 8;
+ case 1:
+ k1 ^= data[idx];
+
+ // mix functions
+ k1 *= C1_32;
+ k1 = Integer.rotateLeft(k1, R1_32);
+ k1 *= C2_32;
+ hash ^= k1;
+ }
+
+ // finalization
+ hash ^= length;
+ hash ^= (hash >>> 16);
+ hash *= 0x85ebca6b;
+ hash ^= (hash >>> 13);
+ hash *= 0xc2b2ae35;
+ hash ^= (hash >>> 16);
+
+ return hash;
+ }
+
+ /**
+ * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
+ *
+ * @param data - input byte array
+ * @return - hashcode
+ */
+ public static long hash64(byte[] data) {
+ return hash64(data, 0, data.length, DEFAULT_SEED);
+ }
+
+ public static long hash64(byte[] data, int offset, int length) {
+ return hash64(data, offset, length, DEFAULT_SEED);
+ }
+
+ /**
+ * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
+ *
+ * @param data - input byte array
+ * @param length - number of bytes to hash, starting at offset
+ * @param seed - seed. (the shorter overloads pass DEFAULT_SEED)
+ * @return - hashcode
+ */
+ public static long hash64(byte[] data, int offset, int length, int seed) {
+ long hash = seed;
+ final int nblocks = length >> 3;
+
+ // body
+ for (int i = 0; i < nblocks; i++) {
+ final int i8 = i << 3;
+ long k = ((long) data[offset + i8] & 0xff)
+ | (((long) data[offset + i8 + 1] & 0xff) << 8)
+ | (((long) data[offset + i8 + 2] & 0xff) << 16)
+ | (((long) data[offset + i8 + 3] & 0xff) << 24)
+ | (((long) data[offset + i8 + 4] & 0xff) << 32)
+ | (((long) data[offset + i8 + 5] & 0xff) << 40)
+ | (((long) data[offset + i8 + 6] & 0xff) << 48)
+ | (((long) data[offset + i8 + 7] & 0xff) << 56);
+
+ // mix functions
+ k *= C1;
+ k = Long.rotateLeft(k, R1);
+ k *= C2;
+ hash ^= k;
+ hash = Long.rotateLeft(hash, R2) * M + N1;
+ }
+
+ // tail
+ long k1 = 0;
+ int tailStart = nblocks << 3;
+ switch (length - tailStart) {
+ case 7:
+ k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
+ case 6:
+ k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
+ case 5:
+ k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
+ case 4:
+ k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
+ case 3:
+ k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
+ case 2:
+ k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
+ case 1:
+ k1 ^= ((long) data[offset + tailStart] & 0xff);
+ k1 *= C1;
+ k1 = Long.rotateLeft(k1, R1);
+ k1 *= C2;
+ hash ^= k1;
+ }
+
+ // finalization
+ hash ^= length;
+ hash = fmix64(hash);
+
+ return hash;
+ }
+
+ /**
+ * Murmur3 128-bit variant.
+ *
+ * @param data - input byte array
+ * @return - hashcode (2 longs)
+ */
+ public static long[] hash128(byte[] data) {
+ return hash128(data, 0, data.length, DEFAULT_SEED);
+ }
+
+ /**
+ * Murmur3 128-bit variant.
+ *
+ * @param data - input byte array
+ * @param offset - index of the first byte to hash
+ * @param length - number of bytes to hash, starting at offset
+ * @param seed - seed. (the one-argument overload passes DEFAULT_SEED)
+ * @return - hashcode (2 longs)
+ */
+ public static long[] hash128(byte[] data, int offset, int length, int seed) {
+ long h1 = seed;
+ long h2 = seed;
+ final int nblocks = length >> 4;
+
+ // body
+ for (int i = 0; i < nblocks; i++) {
+ final int i16 = i << 4;
+ long k1 = ((long) data[offset + i16] & 0xff)
+ | (((long) data[offset + i16 + 1] & 0xff) << 8)
+ | (((long) data[offset + i16 + 2] & 0xff) << 16)
+ | (((long) data[offset + i16 + 3] & 0xff) << 24)
+ | (((long) data[offset + i16 + 4] & 0xff) << 32)
+ | (((long) data[offset + i16 + 5] & 0xff) << 40)
+ | (((long) data[offset + i16 + 6] & 0xff) << 48)
+ | (((long) data[offset + i16 + 7] & 0xff) << 56);
+
+ long k2 = ((long) data[offset + i16 + 8] & 0xff)
+ | (((long) data[offset + i16 + 9] & 0xff) << 8)
+ | (((long) data[offset + i16 + 10] & 0xff) << 16)
+ | (((long) data[offset + i16 + 11] & 0xff) << 24)
+ | (((long) data[offset + i16 + 12] & 0xff) << 32)
+ | (((long) data[offset + i16 + 13] & 0xff) << 40)
+ | (((long) data[offset + i16 + 14] & 0xff) << 48)
+ | (((long) data[offset + i16 + 15] & 0xff) << 56);
+
+ // mix functions for k1
+ k1 *= C1;
+ k1 = Long.rotateLeft(k1, R1);
+ k1 *= C2;
+ h1 ^= k1;
+ h1 = Long.rotateLeft(h1, R2);
+ h1 += h2;
+ h1 = h1 * M + N1;
+
+ // mix functions for k2
+ k2 *= C2;
+ k2 = Long.rotateLeft(k2, R3);
+ k2 *= C1;
+ h2 ^= k2;
+ h2 = Long.rotateLeft(h2, R1);
+ h2 += h1;
+ h2 = h2 * M + N2;
+ }
+
+ // tail
+ long k1 = 0;
+ long k2 = 0;
+ int tailStart = nblocks << 4;
+ switch (length - tailStart) {
+ case 15:
+ k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
+ case 14:
+ k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
+ case 13:
+ k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
+ case 12:
+ k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
+ case 11:
+ k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
+ case 10:
+ k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
+ case 9:
+ k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
+ k2 *= C2;
+ k2 = Long.rotateLeft(k2, R3);
+ k2 *= C1;
+ h2 ^= k2;
+
+ case 8:
+ k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
+ case 7:
+ k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
+ case 6:
+ k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
+ case 5:
+ k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
+ case 4:
+ k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
+ case 3:
+ k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
+ case 2:
+ k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
+ case 1:
+ k1 ^= (long) (data[offset + tailStart] & 0xff);
+ k1 *= C1;
+ k1 = Long.rotateLeft(k1, R1);
+ k1 *= C2;
+ h1 ^= k1;
+ }
+
+ // finalization
+ h1 ^= length;
+ h2 ^= length;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ return new long[]{h1, h2};
+ }
+
+ private static long fmix64(long h) {
+ h ^= (h >>> 33);
+ h *= 0xff51afd7ed558ccdL;
+ h ^= (h >>> 33);
+ h *= 0xc4ceb9fe1a85ec53L;
+ h ^= (h >>> 33);
+ return h;
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java b/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
deleted file mode 100644
index 5facc7c..0000000
--- a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
+++ /dev/null
@@ -1,224 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import static org.junit.Assert.assertEquals;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-import org.junit.Test;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.util.Arrays;
-import java.util.Random;
-
-/**
- * Tests for Murmur3 variants.
- */
-public class TestMurmur3 {
-
- @Test
- public void testHashCodesM3_32_string() {
- String key = "test";
- int seed = 123;
- HashFunction hf = Hashing.murmur3_32(seed);
- int hc1 = hf.hashBytes(key.getBytes()).asInt();
- int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
- assertEquals(hc1, hc2);
-
- key = "testkey";
- hc1 = hf.hashBytes(key.getBytes()).asInt();
- hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
- assertEquals(hc1, hc2);
- }
-
- @Test
- public void testHashCodesM3_32_ints() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_32(seed);
- for (int i = 0; i < 1000; i++) {
- int val = rand.nextInt();
- byte[] data = ByteBuffer.allocate(4).putInt(val).array();
- int hc1 = hf.hashBytes(data).asInt();
- int hc2 = Murmur3.hash32(data, data.length, seed);
- assertEquals(hc1, hc2);
- }
- }
-
- @Test
- public void testHashCodesM3_32_longs() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_32(seed);
- for (int i = 0; i < 1000; i++) {
- long val = rand.nextLong();
- byte[] data = ByteBuffer.allocate(8).putLong(val).array();
- int hc1 = hf.hashBytes(data).asInt();
- int hc2 = Murmur3.hash32(data, data.length, seed);
- assertEquals(hc1, hc2);
- }
- }
-
- @Test
- public void testHashCodesM3_32_double() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_32(seed);
- for (int i = 0; i < 1000; i++) {
- double val = rand.nextDouble();
- byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
- int hc1 = hf.hashBytes(data).asInt();
- int hc2 = Murmur3.hash32(data, data.length, seed);
- assertEquals(hc1, hc2);
- }
- }
-
- @Test
- public void testHashCodesM3_128_string() {
- String key = "test";
- int seed = 123;
- HashFunction hf = Hashing.murmur3_128(seed);
- // guava stores the hashcodes in little endian order
- ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(key.getBytes()).asBytes());
- buf.flip();
- long gl1 = buf.getLong();
- long gl2 = buf.getLong(8);
- long[] hc = Murmur3.hash128(key.getBytes(), 0, key.getBytes().length, seed);
- long m1 = hc[0];
- long m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
-
- key = "testkey128_testkey128";
- buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(key.getBytes()).asBytes());
- buf.flip();
- gl1 = buf.getLong();
- gl2 = buf.getLong(8);
- byte[] keyBytes = key.getBytes();
- hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed);
- m1 = hc[0];
- m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
-
- byte[] offsetKeyBytes = new byte[keyBytes.length + 35];
- Arrays.fill(offsetKeyBytes, (byte) -1);
- System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length);
- hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed);
- assertEquals(gl1, hc[0]);
- assertEquals(gl2, hc[1]);
- }
-
- @Test
- public void testHashCodeM3_64() {
- byte[] origin = ("It was the best of times, it was the worst of times," +
- " it was the age of wisdom, it was the age of foolishness," +
- " it was the epoch of belief, it was the epoch of incredulity," +
- " it was the season of Light, it was the season of Darkness," +
- " it was the spring of hope, it was the winter of despair," +
- " we had everything before us, we had nothing before us," +
- " we were all going direct to Heaven," +
- " we were all going direct the other way.").getBytes();
- long hash = Murmur3.hash64(origin, 0, origin.length);
- assertEquals(305830725663368540L, hash);
-
- byte[] originOffset = new byte[origin.length + 150];
- Arrays.fill(originOffset, (byte) 123);
- System.arraycopy(origin, 0, originOffset, 150, origin.length);
- hash = Murmur3.hash64(originOffset, 150, origin.length);
- assertEquals(305830725663368540L, hash);
- }
-
- @Test
- public void testHashCodesM3_128_ints() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_128(seed);
- for (int i = 0; i < 1000; i++) {
- int val = rand.nextInt();
- byte[] data = ByteBuffer.allocate(4).putInt(val).array();
- // guava stores the hashcodes in little endian order
- ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(data).asBytes());
- buf.flip();
- long gl1 = buf.getLong();
- long gl2 = buf.getLong(8);
- long[] hc = Murmur3.hash128(data, 0, data.length, seed);
- long m1 = hc[0];
- long m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
-
- byte[] offsetData = new byte[data.length + 50];
- System.arraycopy(data, 0, offsetData, 50, data.length);
- hc = Murmur3.hash128(offsetData, 50, data.length, seed);
- assertEquals(gl1, hc[0]);
- assertEquals(gl2, hc[1]);
- }
- }
-
- @Test
- public void testHashCodesM3_128_longs() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_128(seed);
- for (int i = 0; i < 1000; i++) {
- long val = rand.nextLong();
- byte[] data = ByteBuffer.allocate(8).putLong(val).array();
- // guava stores the hashcodes in little endian order
- ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(data).asBytes());
- buf.flip();
- long gl1 = buf.getLong();
- long gl2 = buf.getLong(8);
- long[] hc = Murmur3.hash128(data, 0, data.length, seed);
- long m1 = hc[0];
- long m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
- }
- }
-
- @Test
- public void testHashCodesM3_128_double() {
- int seed = 123;
- Random rand = new Random(seed);
- HashFunction hf = Hashing.murmur3_128(seed);
- for (int i = 0; i < 1000; i++) {
- double val = rand.nextDouble();
- byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
- // guava stores the hashcodes in little endian order
- ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
- buf.put(hf.hashBytes(data).asBytes());
- buf.flip();
- long gl1 = buf.getLong();
- long gl2 = buf.getLong(8);
- long[] hc = Murmur3.hash128(data, 0, data.length, seed);
- long m1 = hc[0];
- long m2 = hc[1];
- assertEquals(gl1, m1);
- assertEquals(gl2, m2);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/java/org/apache/orc/tools/FileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 876070b..7206503 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -37,7 +37,8 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
@@ -383,7 +384,9 @@ public final class FileDump {
StringBuilder buf = new StringBuilder();
String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
buf.append(rowIdxString);
- String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
+ String bloomFilString = getFormattedBloomFilters(col, indices,
+ reader.getWriterVersion(),
+ reader.getSchema().findSubtype(col).getCategory());
buf.append(bloomFilString);
System.out.println(buf);
}
@@ -604,15 +607,18 @@ public final class FileDump {
return -1;
}
- private static String getFormattedBloomFilters(int col,
- OrcProto.BloomFilterIndex[] bloomFilterIndex) {
+ private static String getFormattedBloomFilters(int col, OrcIndex index,
+ OrcFile.WriterVersion version,
+ TypeDescription.Category type) {
+ OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
StringBuilder buf = new StringBuilder();
- BloomFilterIO stripeLevelBF = null;
+ BloomFilter stripeLevelBF = null;
if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
int idx = 0;
buf.append("\n Bloom filters for column ").append(col).append(":");
for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
- BloomFilterIO toMerge = new BloomFilterIO(bf);
+ BloomFilter toMerge = BloomFilterIO.deserialize(
+ index.getBloomFilterKinds()[col], version, type, bf);
buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
if (stripeLevelBF == null) {
stripeLevelBF = toMerge;
@@ -626,7 +632,7 @@ public final class FileDump {
return buf.toString();
}
- private static String getBloomFilterStats(BloomFilterIO bf) {
+ private static String getBloomFilterStats(BloomFilter bf) {
StringBuilder sb = new StringBuilder();
int bitCount = bf.getBitSize();
int popCount = 0;
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index e2048ea..aa3072c 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -20,18 +20,20 @@ package org.apache.orc.tools;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.OrcAcidUtils;
import org.apache.orc.impl.RecordReaderImpl;
+import org.apache.orc.util.BloomFilter;
import org.codehaus.jettison.json.JSONArray;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilterIO;
import org.apache.orc.BinaryColumnStatistics;
import org.apache.orc.BooleanColumnStatistics;
import org.apache.orc.ColumnStatistics;
@@ -50,12 +52,16 @@ import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import org.codehaus.jettison.json.JSONStringer;
import org.codehaus.jettison.json.JSONWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* File dump tool with json formatted output.
*/
public class JsonFileDump {
+ private static final Logger LOG = LoggerFactory.getLogger(JsonFileDump.class);
+
public static void printJsonMetaData(List<String> files,
Configuration conf,
List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
@@ -185,7 +191,9 @@ public class JsonFileDump {
writer.object();
writer.key("columnId").value(col);
writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
- writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
+ writeBloomFilterIndexes(writer, col, indices,
+ reader.getWriterVersion(),
+ reader.getSchema().findSubtype(col).getCategory());
writer.endObject();
}
writer.endArray();
@@ -334,16 +342,21 @@ public class JsonFileDump {
}
private static void writeBloomFilterIndexes(JSONWriter writer, int col,
- OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException {
+ OrcIndex index,
+ OrcFile.WriterVersion version,
+ TypeDescription.Category type
+ ) throws JSONException {
- BloomFilterIO stripeLevelBF = null;
+ BloomFilter stripeLevelBF = null;
+ OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
int entryIx = 0;
writer.key("bloomFilterIndexes").array();
for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
writer.object();
writer.key("entryId").value(entryIx++);
- BloomFilterIO toMerge = new BloomFilterIO(bf);
+ BloomFilter toMerge = BloomFilterIO.deserialize(
+ index.getBloomFilterKinds()[col], version, type, bf);
writeBloomFilterStats(writer, toMerge);
if (stripeLevelBF == null) {
stripeLevelBF = toMerge;
@@ -362,7 +375,7 @@ public class JsonFileDump {
}
}
- private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf)
+ private static void writeBloomFilterStats(JSONWriter writer, BloomFilter bf)
throws JSONException {
int bitCount = bf.getBitSize();
int popCount = 0;
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index 10cc87d..65ff404 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -445,8 +445,9 @@ public class TestFileDump {
.compress(CompressionKind.ZLIB)
.bufferSize(10000)
.rowIndexStride(1000)
- .bloomFilterColumns("l")
- .bloomFilterFpp(0.01);
+ .bloomFilterColumns("l,s")
+ .bloomFilterFpp(0.01)
+ .bloomFilterVersion(OrcFile.BloomFilterVersion.ORIGINAL);
VectorizedRowBatch batch = schema.createRowBatch(1000);
Writer writer = OrcFile.createWriter(testFilePath, options);
Random r1 = new Random(1);
@@ -483,7 +484,6 @@ public class TestFileDump {
System.out.flush();
System.setOut(origOut);
-
checkOutput(outputFilename, workDir + File.separator + outputFilename);
}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/log4j.properties b/java/tools/src/test/resources/log4j.properties
new file mode 100644
index 0000000..8224baf
--- /dev/null
+++ b/java/tools/src/test/resources/log4j.properties
@@ -0,0 +1,21 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=WARN,stdout
+
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target = System.err
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n
+
+# Suppress the warnings about native io not being available
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index 18fd2fb..b879bed 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -1,5 +1,5 @@
Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
Rows: 21000
Compression: ZLIB
Compression size: 4096
@@ -39,17 +39,17 @@ File Statistics:
Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
Stripes:
- Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 951
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 743
Stream: column 0 section ROW_INDEX start: 3 length 17
Stream: column 1 section ROW_INDEX start: 20 length 166
Stream: column 2 section ROW_INDEX start: 186 length 169
Stream: column 3 section ROW_INDEX start: 355 length 87
- Stream: column 3 section BLOOM_FILTER start: 442 length 512
- Stream: column 1 section DATA start: 954 length 20035
- Stream: column 2 section DATA start: 20989 length 40050
- Stream: column 3 section DATA start: 61039 length 3543
- Stream: column 3 section LENGTH start: 64582 length 25
- Stream: column 3 section DICTIONARY_DATA start: 64607 length 133
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 304
+ Stream: column 1 section DATA start: 746 length 20035
+ Stream: column 2 section DATA start: 20781 length 40050
+ Stream: column 3 section DATA start: 60831 length 3543
+ Stream: column 3 section LENGTH start: 64374 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 64399 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -67,17 +67,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 64826 data: 63775 rows: 5000 tail: 86 index: 944
- Stream: column 0 section ROW_INDEX start: 64826 length 17
- Stream: column 1 section ROW_INDEX start: 64843 length 164
- Stream: column 2 section ROW_INDEX start: 65007 length 168
- Stream: column 3 section ROW_INDEX start: 65175 length 83
- Stream: column 3 section BLOOM_FILTER start: 65258 length 512
- Stream: column 1 section DATA start: 65770 length 20035
- Stream: column 2 section DATA start: 85805 length 40050
- Stream: column 3 section DATA start: 125855 length 3532
- Stream: column 3 section LENGTH start: 129387 length 25
- Stream: column 3 section DICTIONARY_DATA start: 129412 length 133
+ Stripe: offset: 64618 data: 63775 rows: 5000 tail: 86 index: 736
+ Stream: column 0 section ROW_INDEX start: 64618 length 17
+ Stream: column 1 section ROW_INDEX start: 64635 length 164
+ Stream: column 2 section ROW_INDEX start: 64799 length 168
+ Stream: column 3 section ROW_INDEX start: 64967 length 83
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 65050 length 304
+ Stream: column 1 section DATA start: 65354 length 20035
+ Stream: column 2 section DATA start: 85389 length 40050
+ Stream: column 3 section DATA start: 125439 length 3532
+ Stream: column 3 section LENGTH start: 128971 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 128996 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -95,17 +95,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 129631 data: 63787 rows: 5000 tail: 86 index: 950
- Stream: column 0 section ROW_INDEX start: 129631 length 17
- Stream: column 1 section ROW_INDEX start: 129648 length 163
- Stream: column 2 section ROW_INDEX start: 129811 length 168
- Stream: column 3 section ROW_INDEX start: 129979 length 90
- Stream: column 3 section BLOOM_FILTER start: 130069 length 512
- Stream: column 1 section DATA start: 130581 length 20035
- Stream: column 2 section DATA start: 150616 length 40050
- Stream: column 3 section DATA start: 190666 length 3544
- Stream: column 3 section LENGTH start: 194210 length 25
- Stream: column 3 section DICTIONARY_DATA start: 194235 length 133
+ Stripe: offset: 129215 data: 63787 rows: 5000 tail: 86 index: 742
+ Stream: column 0 section ROW_INDEX start: 129215 length 17
+ Stream: column 1 section ROW_INDEX start: 129232 length 163
+ Stream: column 2 section ROW_INDEX start: 129395 length 168
+ Stream: column 3 section ROW_INDEX start: 129563 length 90
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 129653 length 304
+ Stream: column 1 section DATA start: 129957 length 20035
+ Stream: column 2 section DATA start: 149992 length 40050
+ Stream: column 3 section DATA start: 190042 length 3544
+ Stream: column 3 section LENGTH start: 193586 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 193611 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -123,17 +123,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 194454 data: 63817 rows: 5000 tail: 86 index: 952
- Stream: column 0 section ROW_INDEX start: 194454 length 17
- Stream: column 1 section ROW_INDEX start: 194471 length 165
- Stream: column 2 section ROW_INDEX start: 194636 length 167
- Stream: column 3 section ROW_INDEX start: 194803 length 91
- Stream: column 3 section BLOOM_FILTER start: 194894 length 512
- Stream: column 1 section DATA start: 195406 length 20035
- Stream: column 2 section DATA start: 215441 length 40050
- Stream: column 3 section DATA start: 255491 length 3574
- Stream: column 3 section LENGTH start: 259065 length 25
- Stream: column 3 section DICTIONARY_DATA start: 259090 length 133
+ Stripe: offset: 193830 data: 63817 rows: 5000 tail: 85 index: 744
+ Stream: column 0 section ROW_INDEX start: 193830 length 17
+ Stream: column 1 section ROW_INDEX start: 193847 length 165
+ Stream: column 2 section ROW_INDEX start: 194012 length 167
+ Stream: column 3 section ROW_INDEX start: 194179 length 91
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 194270 length 304
+ Stream: column 1 section DATA start: 194574 length 20035
+ Stream: column 2 section DATA start: 214609 length 40050
+ Stream: column 3 section DATA start: 254659 length 3574
+ Stream: column 3 section LENGTH start: 258233 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 258258 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -151,17 +151,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 259309 data: 12943 rows: 1000 tail: 78 index: 432
- Stream: column 0 section ROW_INDEX start: 259309 length 12
- Stream: column 1 section ROW_INDEX start: 259321 length 38
- Stream: column 2 section ROW_INDEX start: 259359 length 41
- Stream: column 3 section ROW_INDEX start: 259400 length 40
- Stream: column 3 section BLOOM_FILTER start: 259440 length 301
- Stream: column 1 section DATA start: 259741 length 4007
- Stream: column 2 section DATA start: 263748 length 8010
- Stream: column 3 section DATA start: 271758 length 768
- Stream: column 3 section LENGTH start: 272526 length 25
- Stream: column 3 section DICTIONARY_DATA start: 272551 length 133
+ Stripe: offset: 258476 data: 12943 rows: 1000 tail: 78 index: 382
+ Stream: column 0 section ROW_INDEX start: 258476 length 12
+ Stream: column 1 section ROW_INDEX start: 258488 length 38
+ Stream: column 2 section ROW_INDEX start: 258526 length 41
+ Stream: column 3 section ROW_INDEX start: 258567 length 40
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 258607 length 251
+ Stream: column 1 section DATA start: 258858 length 4007
+ Stream: column 2 section DATA start: 262865 length 8010
+ Stream: column 3 section DATA start: 270875 length 768
+ Stream: column 3 section LENGTH start: 271643 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 271668 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -172,7 +172,7 @@ Stripes:
Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-File length: 273307 bytes
+File length: 272427 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index fa5cc2d..75cd5f4 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -1,5 +1,5 @@
Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
Rows: 21000
Compression: ZLIB
Compression size: 4096
@@ -39,17 +39,20 @@ File Statistics:
Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
Stripes:
- Stripe: offset: 3 data: 63786 rows: 5000 tail: 85 index: 6974
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14950
Stream: column 0 section ROW_INDEX start: 3 length 17
Stream: column 1 section ROW_INDEX start: 20 length 166
Stream: column 2 section ROW_INDEX start: 186 length 169
Stream: column 2 section BLOOM_FILTER start: 355 length 6535
- Stream: column 3 section ROW_INDEX start: 6890 length 87
- Stream: column 1 section DATA start: 6977 length 20035
- Stream: column 2 section DATA start: 27012 length 40050
- Stream: column 3 section DATA start: 67062 length 3543
- Stream: column 3 section LENGTH start: 70605 length 25
- Stream: column 3 section DICTIONARY_DATA start: 70630 length 133
+ Stream: column 2 section BLOOM_FILTER_UTF8 start: 6890 length 6046
+ Stream: column 3 section ROW_INDEX start: 12936 length 87
+ Stream: column 3 section BLOOM_FILTER start: 13023 length 1038
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 892
+ Stream: column 1 section DATA start: 14953 length 20035
+ Stream: column 2 section DATA start: 34988 length 40050
+ Stream: column 3 section DATA start: 75038 length 3543
+ Stream: column 3 section LENGTH start: 78581 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 78606 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -67,17 +70,20 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482
- Stripe: offset: 70848 data: 63775 rows: 5000 tail: 85 index: 6965
- Stream: column 0 section ROW_INDEX start: 70848 length 17
- Stream: column 1 section ROW_INDEX start: 70865 length 164
- Stream: column 2 section ROW_INDEX start: 71029 length 168
- Stream: column 2 section BLOOM_FILTER start: 71197 length 6533
- Stream: column 3 section ROW_INDEX start: 77730 length 83
- Stream: column 1 section DATA start: 77813 length 20035
- Stream: column 2 section DATA start: 97848 length 40050
- Stream: column 3 section DATA start: 137898 length 3532
- Stream: column 3 section LENGTH start: 141430 length 25
- Stream: column 3 section DICTIONARY_DATA start: 141455 length 133
+ Stripe: offset: 78843 data: 63775 rows: 5000 tail: 103 index: 14941
+ Stream: column 0 section ROW_INDEX start: 78843 length 17
+ Stream: column 1 section ROW_INDEX start: 78860 length 164
+ Stream: column 2 section ROW_INDEX start: 79024 length 168
+ Stream: column 2 section BLOOM_FILTER start: 79192 length 6533
+ Stream: column 2 section BLOOM_FILTER_UTF8 start: 85725 length 6046
+ Stream: column 3 section ROW_INDEX start: 91771 length 83
+ Stream: column 3 section BLOOM_FILTER start: 91854 length 1038
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 92892 length 892
+ Stream: column 1 section DATA start: 93784 length 20035
+ Stream: column 2 section DATA start: 113819 length 40050
+ Stream: column 3 section DATA start: 153869 length 3532
+ Stream: column 3 section LENGTH start: 157401 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 157426 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -95,17 +101,20 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205
- Stripe: offset: 141673 data: 63787 rows: 5000 tail: 85 index: 6971
- Stream: column 0 section ROW_INDEX start: 141673 length 17
- Stream: column 1 section ROW_INDEX start: 141690 length 163
- Stream: column 2 section ROW_INDEX start: 141853 length 168
- Stream: column 2 section BLOOM_FILTER start: 142021 length 6533
- Stream: column 3 section ROW_INDEX start: 148554 length 90
- Stream: column 1 section DATA start: 148644 length 20035
- Stream: column 2 section DATA start: 168679 length 40050
- Stream: column 3 section DATA start: 208729 length 3544
- Stream: column 3 section LENGTH start: 212273 length 25
- Stream: column 3 section DICTIONARY_DATA start: 212298 length 133
+ Stripe: offset: 157662 data: 63787 rows: 5000 tail: 104 index: 14947
+ Stream: column 0 section ROW_INDEX start: 157662 length 17
+ Stream: column 1 section ROW_INDEX start: 157679 length 163
+ Stream: column 2 section ROW_INDEX start: 157842 length 168
+ Stream: column 2 section BLOOM_FILTER start: 158010 length 6533
+ Stream: column 2 section BLOOM_FILTER_UTF8 start: 164543 length 6046
+ Stream: column 3 section ROW_INDEX start: 170589 length 90
+ Stream: column 3 section BLOOM_FILTER start: 170679 length 1038
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 171717 length 892
+ Stream: column 1 section DATA start: 172609 length 20035
+ Stream: column 2 section DATA start: 192644 length 40050
+ Stream: column 3 section DATA start: 232694 length 3544
+ Stream: column 3 section LENGTH start: 236238 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 236263 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -123,17 +132,20 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444
- Stripe: offset: 212516 data: 63817 rows: 5000 tail: 85 index: 6964
- Stream: column 0 section ROW_INDEX start: 212516 length 17
- Stream: column 1 section ROW_INDEX start: 212533 length 165
- Stream: column 2 section ROW_INDEX start: 212698 length 167
- Stream: column 2 section BLOOM_FILTER start: 212865 length 6524
- Stream: column 3 section ROW_INDEX start: 219389 length 91
- Stream: column 1 section DATA start: 219480 length 20035
- Stream: column 2 section DATA start: 239515 length 40050
- Stream: column 3 section DATA start: 279565 length 3574
- Stream: column 3 section LENGTH start: 283139 length 25
- Stream: column 3 section DICTIONARY_DATA start: 283164 length 133
+ Stripe: offset: 236500 data: 63817 rows: 5000 tail: 103 index: 14940
+ Stream: column 0 section ROW_INDEX start: 236500 length 17
+ Stream: column 1 section ROW_INDEX start: 236517 length 165
+ Stream: column 2 section ROW_INDEX start: 236682 length 167
+ Stream: column 2 section BLOOM_FILTER start: 236849 length 6524
+ Stream: column 2 section BLOOM_FILTER_UTF8 start: 243373 length 6046
+ Stream: column 3 section ROW_INDEX start: 249419 length 91
+ Stream: column 3 section BLOOM_FILTER start: 249510 length 1038
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 250548 length 892
+ Stream: column 1 section DATA start: 251440 length 20035
+ Stream: column 2 section DATA start: 271475 length 40050
+ Stream: column 3 section DATA start: 311525 length 3574
+ Stream: column 3 section LENGTH start: 315099 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 315124 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -151,17 +163,20 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165
- Stripe: offset: 283382 data: 12943 rows: 1000 tail: 78 index: 1468
- Stream: column 0 section ROW_INDEX start: 283382 length 12
- Stream: column 1 section ROW_INDEX start: 283394 length 38
- Stream: column 2 section ROW_INDEX start: 283432 length 41
- Stream: column 2 section BLOOM_FILTER start: 283473 length 1337
- Stream: column 3 section ROW_INDEX start: 284810 length 40
- Stream: column 1 section DATA start: 284850 length 4007
- Stream: column 2 section DATA start: 288857 length 8010
- Stream: column 3 section DATA start: 296867 length 768
- Stream: column 3 section LENGTH start: 297635 length 25
- Stream: column 3 section DICTIONARY_DATA start: 297660 length 133
+ Stripe: offset: 315360 data: 12943 rows: 1000 tail: 96 index: 3542
+ Stream: column 0 section ROW_INDEX start: 315360 length 12
+ Stream: column 1 section ROW_INDEX start: 315372 length 38
+ Stream: column 2 section ROW_INDEX start: 315410 length 41
+ Stream: column 2 section BLOOM_FILTER start: 315451 length 1337
+ Stream: column 2 section BLOOM_FILTER_UTF8 start: 316788 length 1211
+ Stream: column 3 section ROW_INDEX start: 317999 length 40
+ Stream: column 3 section BLOOM_FILTER start: 318039 length 472
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 318511 length 391
+ Stream: column 1 section DATA start: 318902 length 4007
+ Stream: column 2 section DATA start: 322909 length 8010
+ Stream: column 3 section DATA start: 330919 length 768
+ Stream: column 3 section LENGTH start: 331687 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 331712 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -172,7 +187,7 @@ Stripes:
Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
-File length: 298416 bytes
+File length: 332489 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
index 17a964b..4b0822f 100644
--- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -1,5 +1,5 @@
Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
Rows: 21000
Compression: ZLIB
Compression size: 4096
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json
index bf654a1..3dd0dc0 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -1,7 +1,7 @@
{
"fileName": "TestFileDump.testDump.orc",
"fileVersion": "0.12",
- "writerVersion": "HIVE_13083",
+ "writerVersion": "ORC_101",
"numberOfRows": 21000,
"compression": "ZLIB",
"compressionBufferSize": 4096,
@@ -254,9 +254,9 @@
"stripeNumber": 1,
"stripeInformation": {
"offset": 3,
- "indexLength": 970,
+ "indexLength": 762,
"dataLength": 63770,
- "footerLength": 90,
+ "footerLength": 89,
"rowCount": 5000
},
"streams": [
@@ -286,44 +286,44 @@
},
{
"columnId": 3,
- "section": "BLOOM_FILTER",
+ "section": "BLOOM_FILTER_UTF8",
"startOffset": 461,
- "length": 512
+ "length": 304
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 973,
+ "startOffset": 765,
"length": 20035
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 21008,
+ "startOffset": 20800,
"length": 40050
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 61058,
+ "startOffset": 60850,
"length": 17
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 61075,
+ "startOffset": 60867,
"length": 3510
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 64585,
+ "startOffset": 64377,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 64610,
+ "startOffset": 64402,
"length": 133
}
],
@@ -494,77 +494,77 @@
{
"stripeNumber": 2,
"stripeInformation": {
- "offset": 64833,
- "indexLength": 961,
+ "offset": 64624,
+ "indexLength": 753,
"dataLength": 63763,
- "footerLength": 88,
+ "footerLength": 87,
"rowCount": 5000
},
"streams": [
{
"columnId": 0,
"section": "ROW_INDEX",
- "startOffset": 64833,
+ "startOffset": 64624,
"length": 17
},
{
"columnId": 1,
"section": "ROW_INDEX",
- "startOffset": 64850,
+ "startOffset": 64641,
"length": 166
},
{
"columnId": 2,
"section": "ROW_INDEX",
- "startOffset": 65016,
+ "startOffset": 64807,
"length": 166
},
{
"columnId": 3,
"section": "ROW_INDEX",
- "startOffset": 65182,
+ "startOffset": 64973,
"length": 100
},
{
"columnId": 3,
- "section": "BLOOM_FILTER",
- "startOffset": 65282,
- "length": 512
+ "section": "BLOOM_FILTER_UTF8",
+ "startOffset": 65073,
+ "length": 304
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 65794,
+ "startOffset": 65377,
"length": 20035
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 85829,
+ "startOffset": 85412,
"length": 40050
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 125879,
+ "startOffset": 125462,
"length": 17
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 125896,
+ "startOffset": 125479,
"length": 3503
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 129399,
+ "startOffset": 128982,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 129424,
+ "startOffset": 129007,
"length": 133
}
],
@@ -735,77 +735,77 @@
{
"stripeNumber": 3,
"stripeInformation": {
- "offset": 129645,
- "indexLength": 962,
+ "offset": 129227,
+ "indexLength": 754,
"dataLength": 63770,
- "footerLength": 91,
+ "footerLength": 89,
"rowCount": 5000
},
"streams": [
{
"columnId": 0,
"section": "ROW_INDEX",
- "startOffset": 129645,
+ "startOffset": 129227,
"length": 17
},
{
"columnId": 1,
"section": "ROW_INDEX",
- "startOffset": 129662,
+ "startOffset": 129244,
"length": 164
},
{
"columnId": 2,
"section": "ROW_INDEX",
- "startOffset": 129826,
+ "startOffset": 129408,
"length": 167
},
{
"columnId": 3,
"section": "ROW_INDEX",
- "startOffset": 129993,
+ "startOffset": 129575,
"length": 102
},
{
"columnId": 3,
- "section": "BLOOM_FILTER",
- "startOffset": 130095,
- "length": 512
+ "section": "BLOOM_FILTER_UTF8",
+ "startOffset": 129677,
+ "length": 304
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 130607,
+ "startOffset": 129981,
"length": 20035
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 150642,
+ "startOffset": 150016,
"length": 40050
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 190692,
+ "startOffset": 190066,
"length": 17
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 190709,
+ "startOffset": 190083,
"length": 3510
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 194219,
+ "startOffset": 193593,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 194244,
+ "startOffset": 193618,
"length": 133
}
],
@@ -976,77 +976,77 @@
{
"stripeNumber": 4,
"stripeInformation": {
- "offset": 194468,
- "indexLength": 973,
+ "offset": 193840,
+ "indexLength": 765,
"dataLength": 63756,
- "footerLength": 91,
+ "footerLength": 89,
"rowCount": 5000
},
"streams": [
{
"columnId": 0,
"section": "ROW_INDEX",
- "startOffset": 194468,
+ "startOffset": 193840,
"length": 17
},
{
"columnId": 1,
"section": "ROW_INDEX",
- "startOffset": 194485,
+ "startOffset": 193857,
"length": 166
},
{
"columnId": 2,
"section": "ROW_INDEX",
- "startOffset": 194651,
+ "startOffset": 194023,
"length": 171
},
{
"columnId": 3,
"section": "ROW_INDEX",
- "startOffset": 194822,
+ "startOffset": 194194,
"length": 107
},
{
"columnId": 3,
- "section": "BLOOM_FILTER",
- "startOffset": 194929,
- "length": 512
+ "section": "BLOOM_FILTER_UTF8",
+ "startOffset": 194301,
+ "length": 304
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 195441,
+ "startOffset": 194605,
"length": 20035
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 215476,
+ "startOffset": 214640,
"length": 40050
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 255526,
+ "startOffset": 254690,
"length": 17
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 255543,
+ "startOffset": 254707,
"length": 3496
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 259039,
+ "startOffset": 258203,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 259064,
+ "startOffset": 258228,
"length": 133
}
],
@@ -1217,8 +1217,8 @@
{
"stripeNumber": 5,
"stripeInformation": {
- "offset": 259288,
- "indexLength": 433,
+ "offset": 258450,
+ "indexLength": 383,
"dataLength": 12943,
"footerLength": 83,
"rowCount": 1000
@@ -1227,67 +1227,67 @@
{
"columnId": 0,
"section": "ROW_INDEX",
- "startOffset": 259288,
+ "startOffset": 258450,
"length": 12
},
{
"columnId": 1,
"section": "ROW_INDEX",
- "startOffset": 259300,
+ "startOffset": 258462,
"length": 38
},
{
"columnId": 2,
"section": "ROW_INDEX",
- "startOffset": 259338,
+ "startOffset": 258500,
"length": 41
},
{
"columnId": 3,
"section": "ROW_INDEX",
- "startOffset": 259379,
+ "startOffset": 258541,
"length": 41
},
{
"columnId": 3,
- "section": "BLOOM_FILTER",
- "startOffset": 259420,
- "length": 301
+ "section": "BLOOM_FILTER_UTF8",
+ "startOffset": 258582,
+ "length": 251
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 259721,
+ "startOffset": 258833,
"length": 4007
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 263728,
+ "startOffset": 262840,
"length": 8010
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 271738,
+ "startOffset": 270850,
"length": 16
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 271754,
+ "startOffset": 270866,
"length": 752
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 272506,
+ "startOffset": 271618,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 272531,
+ "startOffset": 271643,
"length": 133
}
],
@@ -1348,7 +1348,7 @@
}]
}
],
- "fileLength": 273300,
+ "fileLength": 272409,
"paddingLength": 0,
"paddingRatio": 0,
"status": "OK"
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.out b/java/tools/src/test/resources/orc-file-dump.out
index 70f7fbd..ae8195e 100644
--- a/java/tools/src/test/resources/orc-file-dump.out
+++ b/java/tools/src/test/resources/orc-file-dump.out
@@ -1,5 +1,5 @@
Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
Rows: 21000
Compression: ZLIB
Compression size: 4096
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-has-null.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-has-null.out b/java/tools/src/test/resources/orc-file-has-null.out
index df075d5..c02f803 100644
--- a/java/tools/src/test/resources/orc-file-has-null.out
+++ b/java/tools/src/test/resources/orc-file-has-null.out
@@ -1,5 +1,5 @@
Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
Rows: 20000
Compression: ZLIB
Compression size: 4096
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/proto/orc_proto.proto
----------------------------------------------------------------------
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index dbc34ab..de6974e 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -91,6 +91,7 @@ message RowIndex {
message BloomFilter {
optional uint32 numHashFunctions = 1;
repeated fixed64 bitset = 2;
+ optional bytes utf8bitset = 3;
}
message BloomFilterIndex {
@@ -109,6 +110,7 @@ message Stream {
SECONDARY = 5;
ROW_INDEX = 6;
BLOOM_FILTER = 7;
+ BLOOM_FILTER_UTF8 = 8;
}
optional Kind kind = 1;
optional uint32 column = 2;
[2/4] orc git commit: ORC-101 Correct bloom filters for strings and
decimals to use utf8 encoding.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index 6d1955d..f159eef 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -19,9 +19,11 @@
package org.apache.orc.impl;
import static junit.framework.Assert.assertEquals;
+import static junit.framework.TestCase.fail;
import static org.hamcrest.core.Is.is;
-import static org.junit.Assert.*;
-import static org.mockito.Mockito.any;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.atLeastOnce;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
@@ -33,9 +35,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
-import junit.framework.Assert;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
@@ -46,7 +48,7 @@ import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.hive.common.io.DiskRangeList;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilter;
import org.apache.orc.DataReader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
@@ -62,6 +64,7 @@ import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.OrcProto;
+import org.junit.Assert;
import org.junit.Test;
import org.mockito.MockSettings;
import org.mockito.Mockito;
@@ -375,23 +378,23 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", false, null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
}
@Test
@@ -399,34 +402,34 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
// Stats gets converted to column type. "15" is outside of "10" and "100"
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15", null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
// Integer stats will not be converted date because of days/seconds/millis ambiguity
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -434,39 +437,39 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
// Stats gets converted to column type. "15.0" is outside of "10.0" and "100.0"
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15", null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
// Double is not converted to date type because of days/seconds/millis ambiguity
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15*1000L), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150*1000L), null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
}
@Test
@@ -474,33 +477,33 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 100L, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 100.0, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "100", null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
// IllegalArgumentException is thrown when converting String to Date, hence YES_NO
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(100).get(), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 1000), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 1000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("100"), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(100), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -509,69 +512,69 @@ public class TestRecordReaderImpl {
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
// Date to Integer conversion is not possible.
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
// Date to Float conversion is also not possible.
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15", null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "1970-01-11", null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15.1", null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "__a15__1", null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "2000-01-16", null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "1970-01-16", null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(150).get(), null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
// Date to Decimal conversion is also not possible.
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15L * 24L * 60L * 60L * 1000L), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -579,39 +582,39 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
// "15" out of range of "10.0" and "100.0"
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15", null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
// Decimal to Date not possible.
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15 * 1000L), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150 * 1000L), null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -619,46 +622,46 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15", null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", new Timestamp(15).toString(), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO,
RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10 * 24L * 60L * 60L * 1000L,
- 100 * 24L * 60L * 60L * 1000L), pred, null));
+ 100 * 24L * 60L * 60L * 1000L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -667,17 +670,17 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG,
"x", 15L, null);
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -686,17 +689,17 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG,
"x", 15L, null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -705,15 +708,15 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.LONG,
"x", 15L, null);
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), lessThan, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), lessThan, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), lessThan, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), lessThan, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), lessThan, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -722,15 +725,15 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.LONG,
"x", 15L, null);
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -742,13 +745,13 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
"x", null, args);
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 20L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 20L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -760,19 +763,19 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG,
"x", null, args);
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 5L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 5L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 40L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 40L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 15L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 25L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 25L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 25L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 25L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 20L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 20L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -781,7 +784,7 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.LONG,
"x", null, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@@ -791,17 +794,17 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING,
"x", "c", null);
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
}
@Test
@@ -810,17 +813,17 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING,
"x", "c", null);
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
}
@Test
@@ -829,17 +832,17 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.STRING,
"x", "c", null);
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
assertEquals(TruthValue.NO_NULL, // min, same stats
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -848,17 +851,17 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.STRING,
"x", "c", null);
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
}
@Test
@@ -870,17 +873,17 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
"x", null, args);
assertEquals(TruthValue.NO_NULL, // before & after
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null)); // max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
}
@Test
@@ -892,31 +895,31 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.STRING,
"x", null, args);
assertEquals(TruthValue.YES_NULL, // before & after
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NULL, // before & max
- RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO_NULL, // before & before
- RecordReaderImpl.evaluatePredicateProto(createStringStats("h", "g", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("h", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL, // before & min
- RecordReaderImpl.evaluatePredicateProto(createStringStats("f", "g", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("f", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL, // before & middle
- RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "g", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NULL, // min & after
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "e", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NULL, // min & max
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "f", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "f", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.YES_NO_NULL, // min & middle
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "g", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "c", true), pred, null)); // max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
assertEquals(TruthValue.YES_NULL, // min & after, same stats
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -925,9 +928,9 @@ public class TestRecordReaderImpl {
(PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.STRING,
"x", null, null);
assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", false), pred, null));
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", false), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
}
@Test
@@ -1304,7 +1307,7 @@ public class TestRecordReaderImpl {
public void testIntNullSafeEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong(i);
}
@@ -1319,7 +1322,7 @@ public class TestRecordReaderImpl {
public void testIntEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong(i);
}
@@ -1338,7 +1341,7 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
"x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong(i);
}
@@ -1356,7 +1359,7 @@ public class TestRecordReaderImpl {
public void testDoubleNullSafeEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addDouble(i);
}
@@ -1371,7 +1374,7 @@ public class TestRecordReaderImpl {
public void testDoubleEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addDouble(i);
}
@@ -1390,7 +1393,7 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.FLOAT,
"x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addDouble(i);
}
@@ -1408,7 +1411,7 @@ public class TestRecordReaderImpl {
public void testStringNullSafeEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString("str_" + i);
}
@@ -1423,7 +1426,7 @@ public class TestRecordReaderImpl {
public void testStringEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString("str_" + i);
}
@@ -1442,7 +1445,7 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
"x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString("str_" + i);
}
@@ -1461,7 +1464,7 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x",
new DateWritable(15).get(), null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong((new DateWritable(i)).getDays());
}
@@ -1477,7 +1480,7 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DATE, "x",
new DateWritable(15).get(), null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong((new DateWritable(i)).getDays());
}
@@ -1496,7 +1499,7 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.DATE,
"x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong((new DateWritable(i)).getDays());
}
@@ -1516,7 +1519,7 @@ public class TestRecordReaderImpl {
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
new Timestamp(15),
null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong((new Timestamp(i)).getTime());
}
@@ -1531,7 +1534,7 @@ public class TestRecordReaderImpl {
public void testTimestampEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong((new Timestamp(i)).getTime());
}
@@ -1550,7 +1553,7 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.TIMESTAMP,
"x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong((new Timestamp(i)).getTime());
}
@@ -1570,7 +1573,7 @@ public class TestRecordReaderImpl {
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x",
new HiveDecimalWritable("15"),
null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
@@ -1587,7 +1590,7 @@ public class TestRecordReaderImpl {
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DECIMAL, "x",
new HiveDecimalWritable("15"),
null);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
@@ -1606,7 +1609,7 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
"x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
@@ -1629,7 +1632,7 @@ public class TestRecordReaderImpl {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
"x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
+ BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
@@ -1692,4 +1695,171 @@ public class TestRecordReaderImpl {
recordReader.close();
}
+
+ @Test
+ public void TestOldBloomFilters() throws Exception {
+ OrcProto.StripeFooter footer =
+ OrcProto.StripeFooter.newBuilder()
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(1).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(1).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(2).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(2).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(3).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(3).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+ .build();
+ TypeDescription schema = TypeDescription.fromString("struct<x:int,y:decimal(10,2),z:string>");
+ OrcProto.Stream.Kind[] bloomFilterKinds = new OrcProto.Stream.Kind[4];
+
+ // normal read
+ DiskRangeList ranges = RecordReaderUtils.planIndexReading(schema, footer,
+ false, new boolean[]{true, true, false, true},
+ new boolean[]{false, true, false, true},
+ OrcFile.WriterVersion.HIVE_4243,
+ bloomFilterKinds);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[3]);
+ assertEquals("range start: 0 end: 2000", ranges.toString());
+ assertEquals("range start: 4000 end: 6000", ranges.next.toString());
+ assertEquals(null, ranges.next.next);
+
+ // ignore non-utf8 bloom filter
+ Arrays.fill(bloomFilterKinds, null);
+ ranges = RecordReaderUtils.planIndexReading(schema, footer,
+ true, new boolean[]{true, true, false, true},
+ new boolean[]{false, true, false, true},
+ OrcFile.WriterVersion.HIVE_4243,
+ bloomFilterKinds);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+ assertEquals(null, bloomFilterKinds[3]);
+ assertEquals("range start: 0 end: 2000", ranges.toString());
+ assertEquals("range start: 4000 end: 5000", ranges.next.toString());
+ assertEquals(null, ranges.next.next);
+
+ // check that we are handling the post hive-12055 strings correctly
+ Arrays.fill(bloomFilterKinds, null);
+ ranges = RecordReaderUtils.planIndexReading(schema, footer,
+ true, null, new boolean[]{false, true, true, true},
+ OrcFile.WriterVersion.HIVE_12055, bloomFilterKinds);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+ assertEquals(null, bloomFilterKinds[2]);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[3]);
+ assertEquals("range start: 0 end: 3000", ranges.toString());
+ assertEquals("range start: 4000 end: 6000", ranges.next.toString());
+ assertEquals(null, ranges.next.next);
+
+ // ignore non-utf8 bloom filter on decimal
+ Arrays.fill(bloomFilterKinds, null);
+ ranges = RecordReaderUtils.planIndexReading(schema, footer,
+ true, null,
+ new boolean[]{false, false, true, false},
+ OrcFile.WriterVersion.HIVE_4243,
+ bloomFilterKinds);
+ assertEquals(null, bloomFilterKinds[2]);
+ assertEquals("range start: 0 end: 1000", ranges.toString());
+ assertEquals("range start: 2000 end: 3000", ranges.next.toString());
+ assertEquals("range start: 4000 end: 5000", ranges.next.next.toString());
+ assertEquals(null, ranges.next.next.next);
+ }
+
+ @Test
+ public void TestCompatibleBloomFilters() throws Exception {
+ OrcProto.StripeFooter footer =
+ OrcProto.StripeFooter.newBuilder()
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(1).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(1).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(2).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(2).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(2).setKind(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(3).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(3).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(3).setKind(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).setLength(1000).build())
+ .build();
+ TypeDescription schema = TypeDescription.fromString("struct<x:int,y:decimal(10,2),z:string>");
+ OrcProto.Stream.Kind[] bloomFilterKinds = new OrcProto.Stream.Kind[4];
+
+ // normal read
+ DiskRangeList ranges = RecordReaderUtils.planIndexReading(schema, footer,
+ false, new boolean[]{true, true, false, true},
+ new boolean[]{false, true, false, true},
+ OrcFile.WriterVersion.HIVE_4243,
+ bloomFilterKinds);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[3]);
+ assertEquals("range start: 0 end: 2000", ranges.toString());
+ assertEquals("range start: 5000 end: 6000", ranges.next.toString());
+ assertEquals("range start: 7000 end: 8000", ranges.next.next.toString());
+ assertEquals(null, ranges.next.next.next);
+
+ // ignore non-utf8 bloom filters: the decimal column has both kinds and BLOOM_FILTER_UTF8 is chosen
+ Arrays.fill(bloomFilterKinds, null);
+ ranges = RecordReaderUtils.planIndexReading(schema, footer,
+ true, null,
+ new boolean[]{false, true, true, false},
+ OrcFile.WriterVersion.HIVE_4243,
+ bloomFilterKinds);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[2]);
+ assertEquals("range start: 0 end: 3000", ranges.toString());
+ assertEquals("range start: 4000 end: 6000", ranges.next.toString());
+ assertEquals(null, ranges.next.next);
+ }
+
+ @Test
+ public void TestNewBloomFilters() throws Exception {
+ OrcProto.StripeFooter footer =
+ OrcProto.StripeFooter.newBuilder()
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(1).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(1).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(2).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(2).setKind(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(3).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+ .addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(3).setKind(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).setLength(1000).build())
+ .build();
+ TypeDescription schema = TypeDescription.fromString("struct<x:int,y:decimal(10,2),z:string>");
+ OrcProto.Stream.Kind[] bloomFilterKinds = new OrcProto.Stream.Kind[4];
+
+ // normal read
+ DiskRangeList ranges = RecordReaderUtils.planIndexReading(schema, footer,
+ false, new boolean[]{true, true, false, true},
+ new boolean[]{false, true, false, true},
+ OrcFile.WriterVersion.HIVE_4243,
+ bloomFilterKinds);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[3]);
+ assertEquals("range start: 0 end: 2000", ranges.toString());
+ assertEquals("range start: 4000 end: 6000", ranges.next.toString());
+ assertEquals(null, ranges.next.next);
+
+ // ignore non-utf8 bloom filters: the decimal column only has BLOOM_FILTER_UTF8 and it is used
+ Arrays.fill(bloomFilterKinds, null);
+ ranges = RecordReaderUtils.planIndexReading(schema, footer,
+ true, null,
+ new boolean[]{false, true, true, false},
+ OrcFile.WriterVersion.HIVE_4243,
+ bloomFilterKinds);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+ assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[2]);
+ assertEquals("range start: 0 end: 5000", ranges.toString());
+ assertEquals(null, ranges.next);
+ }
}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/test/org/apache/orc/util/TestMurmur3.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/util/TestMurmur3.java b/java/core/src/test/org/apache/orc/util/TestMurmur3.java
new file mode 100644
index 0000000..575e250
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/util/TestMurmur3.java
@@ -0,0 +1,225 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+import static org.junit.Assert.assertEquals;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+
+import org.apache.orc.util.Murmur3;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * Tests for Murmur3 variants.
+ */
+public class TestMurmur3 {
+
+ /**
+  * Checks that {@code Murmur3.hash32} agrees with Guava's 32-bit murmur3 for
+  * two short keys hashed with the same seed.
+  */
+ @Test
+ public void testHashCodesM3_32_string() {
+   int seed = 123;
+   HashFunction hf = Hashing.murmur3_32(seed);
+   for (String key : new String[]{"test", "testkey"}) {
+     // Encode once, explicitly as UTF-8, so the hashed bytes do not depend on
+     // the platform default charset.
+     byte[] keyBytes = key.getBytes(StandardCharsets.UTF_8);
+     int hc1 = hf.hashBytes(keyBytes).asInt();
+     int hc2 = Murmur3.hash32(keyBytes, keyBytes.length, seed);
+     assertEquals(hc1, hc2);
+   }
+ }
+
+ /**
+  * Compares {@code Murmur3.hash32} with Guava's 32-bit murmur3 over the
+  * 4-byte encodings of 1000 pseudo-random ints drawn from a fixed-seed
+  * generator.
+  */
+ @Test
+ public void testHashCodesM3_32_ints() {
+   final int seed = 123;
+   final Random generator = new Random(seed);
+   final HashFunction guava = Hashing.murmur3_32(seed);
+   int remaining = 1000;
+   while (remaining-- > 0) {
+     ByteBuffer encoded = ByteBuffer.allocate(4);
+     encoded.putInt(generator.nextInt());
+     byte[] bytes = encoded.array();
+     int expected = guava.hashBytes(bytes).asInt();
+     int actual = Murmur3.hash32(bytes, bytes.length, seed);
+     assertEquals(expected, actual);
+   }
+ }
+
+ /**
+  * Compares {@code Murmur3.hash32} with Guava's 32-bit murmur3 over the
+  * 8-byte encodings of 1000 pseudo-random longs drawn from a fixed-seed
+  * generator.
+  */
+ @Test
+ public void testHashCodesM3_32_longs() {
+   final int seed = 123;
+   final Random generator = new Random(seed);
+   final HashFunction guava = Hashing.murmur3_32(seed);
+   int remaining = 1000;
+   while (remaining-- > 0) {
+     ByteBuffer encoded = ByteBuffer.allocate(8);
+     encoded.putLong(generator.nextLong());
+     byte[] bytes = encoded.array();
+     int expected = guava.hashBytes(bytes).asInt();
+     int actual = Murmur3.hash32(bytes, bytes.length, seed);
+     assertEquals(expected, actual);
+   }
+ }
+
+ /**
+  * Compares {@code Murmur3.hash32} with Guava's 32-bit murmur3 over the
+  * 8-byte encodings of 1000 pseudo-random doubles drawn from a fixed-seed
+  * generator.
+  */
+ @Test
+ public void testHashCodesM3_32_double() {
+   final int seed = 123;
+   final Random generator = new Random(seed);
+   final HashFunction guava = Hashing.murmur3_32(seed);
+   int remaining = 1000;
+   while (remaining-- > 0) {
+     ByteBuffer encoded = ByteBuffer.allocate(8);
+     encoded.putDouble(generator.nextDouble());
+     byte[] bytes = encoded.array();
+     int expected = guava.hashBytes(bytes).asInt();
+     int actual = Murmur3.hash32(bytes, bytes.length, seed);
+     assertEquals(expected, actual);
+   }
+ }
+
+ /**
+  * Checks that {@code Murmur3.hash128} agrees with Guava's 128-bit murmur3 for
+  * a 4-character key, a 21-character key, and the 21-character key read from
+  * a non-zero offset inside a larger array.
+  */
+ @Test
+ public void testHashCodesM3_128_string() {
+   int seed = 123;
+   HashFunction hf = Hashing.murmur3_128(seed);
+
+   // Encode once, explicitly as UTF-8, so the hashed bytes do not depend on
+   // the platform default charset.
+   byte[] shortKeyBytes = "test".getBytes(StandardCharsets.UTF_8);
+   // guava stores the hashcodes in little endian order
+   ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
+   buf.put(hf.hashBytes(shortKeyBytes).asBytes());
+   buf.flip();
+   long gl1 = buf.getLong();
+   long gl2 = buf.getLong(8);
+   long[] hc = Murmur3.hash128(shortKeyBytes, 0, shortKeyBytes.length, seed);
+   assertEquals(gl1, hc[0]);
+   assertEquals(gl2, hc[1]);
+
+   byte[] keyBytes = "testkey128_testkey128".getBytes(StandardCharsets.UTF_8);
+   buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
+   buf.put(hf.hashBytes(keyBytes).asBytes());
+   buf.flip();
+   gl1 = buf.getLong();
+   gl2 = buf.getLong(8);
+   hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed);
+   assertEquals(gl1, hc[0]);
+   assertEquals(gl2, hc[1]);
+
+   // Same key placed at offset 35 of an array otherwise filled with -1 bytes:
+   // the expected hash is unchanged, so only the addressed window may count.
+   byte[] offsetKeyBytes = new byte[keyBytes.length + 35];
+   Arrays.fill(offsetKeyBytes, (byte) -1);
+   System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length);
+   hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed);
+   assertEquals(gl1, hc[0]);
+   assertEquals(gl2, hc[1]);
+ }
+
+ /**
+  * Pins {@code Murmur3.hash64} to a fixed expected value for a long text
+  * passage, both at offset 0 and when the same bytes sit at offset 150 of a
+  * larger array.
+  */
+ @Test
+ public void testHashCodeM3_64() {
+   // The expected hash below is hard-coded, so the input bytes must not vary
+   // with the platform default charset: encode explicitly as UTF-8.
+   byte[] origin = ("It was the best of times, it was the worst of times," +
+       " it was the age of wisdom, it was the age of foolishness," +
+       " it was the epoch of belief, it was the epoch of incredulity," +
+       " it was the season of Light, it was the season of Darkness," +
+       " it was the spring of hope, it was the winter of despair," +
+       " we had everything before us, we had nothing before us," +
+       " we were all going direct to Heaven," +
+       " we were all going direct the other way.").getBytes(StandardCharsets.UTF_8);
+   long hash = Murmur3.hash64(origin, 0, origin.length);
+   assertEquals(305830725663368540L, hash);
+
+   // Re-hash the same bytes from offset 150 of an array padded with 123s; the
+   // result must match the offset-0 hash.
+   byte[] originOffset = new byte[origin.length + 150];
+   Arrays.fill(originOffset, (byte) 123);
+   System.arraycopy(origin, 0, originOffset, 150, origin.length);
+   hash = Murmur3.hash64(originOffset, 150, origin.length);
+   assertEquals(305830725663368540L, hash);
+ }
+
+ /**
+  * Compares {@code Murmur3.hash128} with Guava's 128-bit murmur3 over the
+  * 4-byte encodings of 1000 pseudo-random ints, hashing each value both from
+  * offset 0 and from offset 50 of a larger zero-filled array.
+  */
+ @Test
+ public void testHashCodesM3_128_ints() {
+   final int seed = 123;
+   final Random generator = new Random(seed);
+   final HashFunction guava = Hashing.murmur3_128(seed);
+   int remaining = 1000;
+   while (remaining-- > 0) {
+     ByteBuffer encoded = ByteBuffer.allocate(4);
+     encoded.putInt(generator.nextInt());
+     byte[] bytes = encoded.array();
+
+     // guava stores the hashcodes in little endian order
+     ByteBuffer guavaHash = ByteBuffer.allocate(16);
+     guavaHash.order(ByteOrder.LITTLE_ENDIAN);
+     guavaHash.put(guava.hashBytes(bytes).asBytes());
+     guavaHash.flip();
+     long expectedFirst = guavaHash.getLong();
+     long expectedSecond = guavaHash.getLong(8);
+
+     long[] direct = Murmur3.hash128(bytes, 0, bytes.length, seed);
+     long actualFirst = direct[0];
+     long actualSecond = direct[1];
+     assertEquals(expectedFirst, actualFirst);
+     assertEquals(expectedSecond, actualSecond);
+
+     // Same value copied to offset 50 of a longer array.
+     byte[] shifted = new byte[bytes.length + 50];
+     System.arraycopy(bytes, 0, shifted, 50, bytes.length);
+     long[] fromOffset = Murmur3.hash128(shifted, 50, bytes.length, seed);
+     assertEquals(expectedFirst, fromOffset[0]);
+     assertEquals(expectedSecond, fromOffset[1]);
+   }
+ }
+
+ /**
+  * Compares {@code Murmur3.hash128} with Guava's 128-bit murmur3 over the
+  * 8-byte encodings of 1000 pseudo-random longs drawn from a fixed-seed
+  * generator.
+  */
+ @Test
+ public void testHashCodesM3_128_longs() {
+   final int seed = 123;
+   final Random generator = new Random(seed);
+   final HashFunction guava = Hashing.murmur3_128(seed);
+   int remaining = 1000;
+   while (remaining-- > 0) {
+     ByteBuffer encoded = ByteBuffer.allocate(8);
+     encoded.putLong(generator.nextLong());
+     byte[] bytes = encoded.array();
+
+     // guava stores the hashcodes in little endian order
+     ByteBuffer guavaHash = ByteBuffer.allocate(16);
+     guavaHash.order(ByteOrder.LITTLE_ENDIAN);
+     guavaHash.put(guava.hashBytes(bytes).asBytes());
+     guavaHash.flip();
+     long expectedFirst = guavaHash.getLong();
+     long expectedSecond = guavaHash.getLong(8);
+
+     long[] actual = Murmur3.hash128(bytes, 0, bytes.length, seed);
+     long actualFirst = actual[0];
+     long actualSecond = actual[1];
+     assertEquals(expectedFirst, actualFirst);
+     assertEquals(expectedSecond, actualSecond);
+   }
+ }
+
+ /**
+  * Compares {@code Murmur3.hash128} with Guava's 128-bit murmur3 over the
+  * 8-byte encodings of 1000 pseudo-random doubles drawn from a fixed-seed
+  * generator.
+  */
+ @Test
+ public void testHashCodesM3_128_double() {
+   final int seed = 123;
+   final Random generator = new Random(seed);
+   final HashFunction guava = Hashing.murmur3_128(seed);
+   int remaining = 1000;
+   while (remaining-- > 0) {
+     ByteBuffer encoded = ByteBuffer.allocate(8);
+     encoded.putDouble(generator.nextDouble());
+     byte[] bytes = encoded.array();
+
+     // guava stores the hashcodes in little endian order
+     ByteBuffer guavaHash = ByteBuffer.allocate(16);
+     guavaHash.order(ByteOrder.LITTLE_ENDIAN);
+     guavaHash.put(guava.hashBytes(bytes).asBytes());
+     guavaHash.flip();
+     long expectedFirst = guavaHash.getLong();
+     long expectedSecond = guavaHash.getLong(8);
+
+     long[] actual = Murmur3.hash128(bytes, 0, bytes.length, seed);
+     long actualFirst = actual[0];
+     long actualSecond = actual[1];
+     assertEquals(expectedFirst, actualFirst);
+     assertEquals(expectedSecond, actualSecond);
+   }
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/core/src/test/resources/log4j.properties b/java/core/src/test/resources/log4j.properties
index d2c063d..fae44b6 100644
--- a/java/core/src/test/resources/log4j.properties
+++ b/java/core/src/test/resources/log4j.properties
@@ -15,3 +15,6 @@ log4j.rootLogger=WARN,stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n
+
+# Suppress the warnings about native io not being available
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/mapreduce/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/mapreduce/src/test/resources/log4j.properties b/java/mapreduce/src/test/resources/log4j.properties
index d2c063d..fae44b6 100644
--- a/java/mapreduce/src/test/resources/log4j.properties
+++ b/java/mapreduce/src/test/resources/log4j.properties
@@ -15,3 +15,6 @@ log4j.rootLogger=WARN,stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n
+
+# Suppress the warnings about native io not being available
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java b/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
deleted file mode 100644
index e60690d..0000000
--- a/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are
- * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of
- * bloom filter false positive (element not present in bloom filter but test() says true) are
- * possible but false negatives are not possible (if element is present then test() will never
- * say false). The false positive probability is configurable (default: 5%) depending on which
- * storage requirement may increase or decrease. Lower the false positive probability greater
- * is the space requirement.
- * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter.
- * During the creation of bloom filter expected number of entries must be specified. If the number
- * of insertions exceed the specified initial number of entries then false positive probability will
- * increase accordingly.
- *
- * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash
- * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash
- * collisions for specific sequence of repeating bytes. Check the following link for more info
- * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
- */
-public class BloomFilter {
- public static final double DEFAULT_FPP = 0.05;
- protected BitSet bitSet;
- protected int numBits;
- protected int numHashFunctions;
-
- public BloomFilter() {
- }
-
- public BloomFilter(long expectedEntries) {
- this(expectedEntries, DEFAULT_FPP);
- }
-
- static void checkArgument(boolean expression, String message) {
- if (!expression) {
- throw new IllegalArgumentException(message);
- }
- }
-
- public BloomFilter(long expectedEntries, double fpp) {
- checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
- checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
- int nb = optimalNumOfBits(expectedEntries, fpp);
- // make 'm' multiple of 64
- this.numBits = nb + (Long.SIZE - (nb % Long.SIZE));
- this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits);
- this.bitSet = new BitSet(numBits);
- }
-
- /**
- * A constructor to support rebuilding the BloomFilter from a serialized representation.
- * @param bits
- * @param numBits
- * @param numFuncs
- */
- public BloomFilter(List<Long> bits, int numBits, int numFuncs) {
- super();
- long[] copied = new long[bits.size()];
- for (int i = 0; i < bits.size(); i++) copied[i] = bits.get(i);
- bitSet = new BitSet(copied);
- this.numBits = numBits;
- numHashFunctions = numFuncs;
- }
-
- static int optimalNumOfHashFunctions(long n, long m) {
- return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
- }
-
- static int optimalNumOfBits(long n, double p) {
- return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
- }
-
- public void add(byte[] val) {
- if (val == null) {
- addBytes(val, -1, -1);
- } else {
- addBytes(val, 0, val.length);
- }
- }
-
- public void addBytes(byte[] val, int offset, int length) {
- // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter"
- // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively
- // implement a Bloom filter without any loss in the asymptotic false positive probability'
-
- // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned
- // in the above paper
- long hash64 = val == null ? Murmur3.NULL_HASHCODE :
- Murmur3.hash64(val, offset, length);
- addHash(hash64);
- }
-
- private void addHash(long hash64) {
- int hash1 = (int) hash64;
- int hash2 = (int) (hash64 >>> 32);
-
- for (int i = 1; i <= numHashFunctions; i++) {
- int combinedHash = hash1 + (i * hash2);
- // hashcode should be positive, flip all the bits if it's negative
- if (combinedHash < 0) {
- combinedHash = ~combinedHash;
- }
- int pos = combinedHash % numBits;
- bitSet.set(pos);
- }
- }
-
- public void addString(String val) {
- if (val == null) {
- add(null);
- } else {
- add(val.getBytes());
- }
- }
-
- public void addLong(long val) {
- addHash(getLongHash(val));
- }
-
- public void addDouble(double val) {
- addLong(Double.doubleToLongBits(val));
- }
-
- public boolean test(byte[] val) {
- if (val == null) {
- return testBytes(val, -1, -1);
- }
- return testBytes(val, 0, val.length);
- }
-
- public boolean testBytes(byte[] val, int offset, int length) {
- long hash64 = val == null ? Murmur3.NULL_HASHCODE :
- Murmur3.hash64(val, offset, length);
- return testHash(hash64);
- }
-
- private boolean testHash(long hash64) {
- int hash1 = (int) hash64;
- int hash2 = (int) (hash64 >>> 32);
-
- for (int i = 1; i <= numHashFunctions; i++) {
- int combinedHash = hash1 + (i * hash2);
- // hashcode should be positive, flip all the bits if it's negative
- if (combinedHash < 0) {
- combinedHash = ~combinedHash;
- }
- int pos = combinedHash % numBits;
- if (!bitSet.get(pos)) {
- return false;
- }
- }
- return true;
- }
-
- public boolean testString(String val) {
- if (val == null) {
- return test(null);
- } else {
- return test(val.getBytes());
- }
- }
-
- public boolean testLong(long val) {
- return testHash(getLongHash(val));
- }
-
- // Thomas Wang's integer hash function
- // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
- private long getLongHash(long key) {
- key = (~key) + (key << 21); // key = (key << 21) - key - 1;
- key = key ^ (key >> 24);
- key = (key + (key << 3)) + (key << 8); // key * 265
- key = key ^ (key >> 14);
- key = (key + (key << 2)) + (key << 4); // key * 21
- key = key ^ (key >> 28);
- key = key + (key << 31);
- return key;
- }
-
- public boolean testDouble(double val) {
- return testLong(Double.doubleToLongBits(val));
- }
-
- public long sizeInBytes() {
- return getBitSize() / 8;
- }
-
- public int getBitSize() {
- return bitSet.getData().length * Long.SIZE;
- }
-
- public int getNumHashFunctions() {
- return numHashFunctions;
- }
-
- public long[] getBitSet() {
- return bitSet.getData();
- }
-
- @Override
- public String toString() {
- return "m: " + numBits + " k: " + numHashFunctions;
- }
-
- /**
- * Merge the specified bloom filter with current bloom filter.
- *
- * @param that - bloom filter to merge
- */
- public void merge(BloomFilter that) {
- if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) {
- this.bitSet.putAll(that.bitSet);
- } else {
- throw new IllegalArgumentException("BloomFilters are not compatible for merging." +
- " this - " + this.toString() + " that - " + that.toString());
- }
- }
-
- public void reset() {
- this.bitSet.clear();
- }
-
- /**
- * Bare metal bit set implementation. For performance reasons, this implementation does not check
- * for index bounds nor expand the bit set size if the specified index is greater than the size.
- */
- public class BitSet {
- private final long[] data;
-
- public BitSet(long bits) {
- this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
- }
-
- /**
- * Deserialize long array as bit set.
- *
- * @param data - bit array
- */
- public BitSet(long[] data) {
- assert data.length > 0 : "data length is zero!";
- this.data = data;
- }
-
- /**
- * Sets the bit at specified index.
- *
- * @param index - position
- */
- public void set(int index) {
- data[index >>> 6] |= (1L << index);
- }
-
- /**
- * Returns true if the bit is set in the specified index.
- *
- * @param index - position
- * @return - value at the bit position
- */
- public boolean get(int index) {
- return (data[index >>> 6] & (1L << index)) != 0;
- }
-
- /**
- * Number of bits
- */
- public long bitSize() {
- return (long) data.length * Long.SIZE;
- }
-
- public long[] getData() {
- return data;
- }
-
- /**
- * Combines the two BitArrays using bitwise OR.
- */
- public void putAll(BitSet array) {
- assert data.length == array.data.length :
- "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")";
- for (int i = 0; i < data.length; i++) {
- data[i] |= array.data[i];
- }
- }
-
- /**
- * Clear the bit set.
- */
- public void clear() {
- Arrays.fill(data, 0);
- }
- }
-}
[4/4] orc git commit: ORC-101 using little endian encoding of bloom
filter bitsets and update spec.
Posted by om...@apache.org.
ORC-101 using little endian encoding of bloom filter bitsets and update spec.
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/604dcc80
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/604dcc80
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/604dcc80
Branch: refs/heads/master
Commit: 604dcc801fb2cdb68fe8284c0facf66a32bfe119
Parents: 9d39cb8
Author: Owen O'Malley <om...@apache.org>
Authored: Tue Sep 20 15:51:36 2016 -0500
Committer: Owen O'Malley <om...@apache.org>
Committed: Wed Sep 21 11:38:57 2016 -0500
----------------------------------------------------------------------
.../java/org/apache/orc/util/BloomFilterIO.java | 6 +-
.../resources/orc-file-dump-bloomfilter.out | 104 +++++++-------
.../resources/orc-file-dump-bloomfilter2.out | 116 ++++++++--------
.../tools/src/test/resources/orc-file-dump.json | 134 +++++++++----------
site/_data/releases.yml | 4 +
site/_docs/spec-index.md | 11 +-
site/_docs/stripes.md | 4 +
7 files changed, 196 insertions(+), 183 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
index ebd8c49..a6c3940 100644
--- a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
+++ b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
@@ -24,7 +24,7 @@ import org.apache.orc.OrcProto;
import org.apache.orc.TypeDescription;
import java.nio.ByteBuffer;
-import java.util.Arrays;
+import java.nio.ByteOrder;
public class BloomFilterIO {
@@ -62,7 +62,8 @@ public class BloomFilterIO {
case BLOOM_FILTER_UTF8: {
ByteString bits = bloomFilter.getUtf8Bitset();
long[] values = new long[bits.size() / 8];
- bits.asReadOnlyByteBuffer().asLongBuffer().get(values);
+ bits.asReadOnlyByteBuffer().order(ByteOrder.LITTLE_ENDIAN)
+ .asLongBuffer().get(values);
return new BloomFilterUtf8(values, numFuncs);
}
default:
@@ -82,6 +83,7 @@ public class BloomFilterIO {
long[] bitset = bloomFilter.getBitSet();
if (bloomFilter instanceof BloomFilterUtf8) {
ByteBuffer buffer = ByteBuffer.allocate(bitset.length * 8);
+ buffer.order(ByteOrder.LITTLE_ENDIAN);
buffer.asLongBuffer().put(bitset);
builder.setUtf8Bitset(ByteString.copyFrom(buffer));
} else {
http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index b879bed..e23327a 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -39,17 +39,17 @@ File Statistics:
Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
Stripes:
- Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 743
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 749
Stream: column 0 section ROW_INDEX start: 3 length 17
Stream: column 1 section ROW_INDEX start: 20 length 166
Stream: column 2 section ROW_INDEX start: 186 length 169
Stream: column 3 section ROW_INDEX start: 355 length 87
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 304
- Stream: column 1 section DATA start: 746 length 20035
- Stream: column 2 section DATA start: 20781 length 40050
- Stream: column 3 section DATA start: 60831 length 3543
- Stream: column 3 section LENGTH start: 64374 length 25
- Stream: column 3 section DICTIONARY_DATA start: 64399 length 133
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 310
+ Stream: column 1 section DATA start: 752 length 20035
+ Stream: column 2 section DATA start: 20787 length 40050
+ Stream: column 3 section DATA start: 60837 length 3543
+ Stream: column 3 section LENGTH start: 64380 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 64405 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -67,17 +67,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 64618 data: 63775 rows: 5000 tail: 86 index: 736
- Stream: column 0 section ROW_INDEX start: 64618 length 17
- Stream: column 1 section ROW_INDEX start: 64635 length 164
- Stream: column 2 section ROW_INDEX start: 64799 length 168
- Stream: column 3 section ROW_INDEX start: 64967 length 83
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 65050 length 304
- Stream: column 1 section DATA start: 65354 length 20035
- Stream: column 2 section DATA start: 85389 length 40050
- Stream: column 3 section DATA start: 125439 length 3532
- Stream: column 3 section LENGTH start: 128971 length 25
- Stream: column 3 section DICTIONARY_DATA start: 128996 length 133
+ Stripe: offset: 64624 data: 63775 rows: 5000 tail: 86 index: 742
+ Stream: column 0 section ROW_INDEX start: 64624 length 17
+ Stream: column 1 section ROW_INDEX start: 64641 length 164
+ Stream: column 2 section ROW_INDEX start: 64805 length 168
+ Stream: column 3 section ROW_INDEX start: 64973 length 83
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 65056 length 310
+ Stream: column 1 section DATA start: 65366 length 20035
+ Stream: column 2 section DATA start: 85401 length 40050
+ Stream: column 3 section DATA start: 125451 length 3532
+ Stream: column 3 section LENGTH start: 128983 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 129008 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -95,17 +95,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 129215 data: 63787 rows: 5000 tail: 86 index: 742
- Stream: column 0 section ROW_INDEX start: 129215 length 17
- Stream: column 1 section ROW_INDEX start: 129232 length 163
- Stream: column 2 section ROW_INDEX start: 129395 length 168
- Stream: column 3 section ROW_INDEX start: 129563 length 90
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 129653 length 304
- Stream: column 1 section DATA start: 129957 length 20035
- Stream: column 2 section DATA start: 149992 length 40050
- Stream: column 3 section DATA start: 190042 length 3544
- Stream: column 3 section LENGTH start: 193586 length 25
- Stream: column 3 section DICTIONARY_DATA start: 193611 length 133
+ Stripe: offset: 129227 data: 63787 rows: 5000 tail: 86 index: 748
+ Stream: column 0 section ROW_INDEX start: 129227 length 17
+ Stream: column 1 section ROW_INDEX start: 129244 length 163
+ Stream: column 2 section ROW_INDEX start: 129407 length 168
+ Stream: column 3 section ROW_INDEX start: 129575 length 90
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 129665 length 310
+ Stream: column 1 section DATA start: 129975 length 20035
+ Stream: column 2 section DATA start: 150010 length 40050
+ Stream: column 3 section DATA start: 190060 length 3544
+ Stream: column 3 section LENGTH start: 193604 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 193629 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -123,17 +123,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 193830 data: 63817 rows: 5000 tail: 85 index: 744
- Stream: column 0 section ROW_INDEX start: 193830 length 17
- Stream: column 1 section ROW_INDEX start: 193847 length 165
- Stream: column 2 section ROW_INDEX start: 194012 length 167
- Stream: column 3 section ROW_INDEX start: 194179 length 91
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 194270 length 304
- Stream: column 1 section DATA start: 194574 length 20035
- Stream: column 2 section DATA start: 214609 length 40050
- Stream: column 3 section DATA start: 254659 length 3574
- Stream: column 3 section LENGTH start: 258233 length 25
- Stream: column 3 section DICTIONARY_DATA start: 258258 length 133
+ Stripe: offset: 193848 data: 63817 rows: 5000 tail: 85 index: 750
+ Stream: column 0 section ROW_INDEX start: 193848 length 17
+ Stream: column 1 section ROW_INDEX start: 193865 length 165
+ Stream: column 2 section ROW_INDEX start: 194030 length 167
+ Stream: column 3 section ROW_INDEX start: 194197 length 91
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 194288 length 310
+ Stream: column 1 section DATA start: 194598 length 20035
+ Stream: column 2 section DATA start: 214633 length 40050
+ Stream: column 3 section DATA start: 254683 length 3574
+ Stream: column 3 section LENGTH start: 258257 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 258282 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -151,17 +151,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 258476 data: 12943 rows: 1000 tail: 78 index: 382
- Stream: column 0 section ROW_INDEX start: 258476 length 12
- Stream: column 1 section ROW_INDEX start: 258488 length 38
- Stream: column 2 section ROW_INDEX start: 258526 length 41
- Stream: column 3 section ROW_INDEX start: 258567 length 40
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 258607 length 251
- Stream: column 1 section DATA start: 258858 length 4007
- Stream: column 2 section DATA start: 262865 length 8010
- Stream: column 3 section DATA start: 270875 length 768
- Stream: column 3 section LENGTH start: 271643 length 25
- Stream: column 3 section DICTIONARY_DATA start: 271668 length 133
+ Stripe: offset: 258500 data: 12943 rows: 1000 tail: 78 index: 375
+ Stream: column 0 section ROW_INDEX start: 258500 length 12
+ Stream: column 1 section ROW_INDEX start: 258512 length 38
+ Stream: column 2 section ROW_INDEX start: 258550 length 41
+ Stream: column 3 section ROW_INDEX start: 258591 length 40
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 258631 length 244
+ Stream: column 1 section DATA start: 258875 length 4007
+ Stream: column 2 section DATA start: 262882 length 8010
+ Stream: column 3 section DATA start: 270892 length 768
+ Stream: column 3 section LENGTH start: 271660 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 271685 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -172,7 +172,7 @@ Stripes:
Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-File length: 272427 bytes
+File length: 272444 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index 75cd5f4..8296382 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -39,7 +39,7 @@ File Statistics:
Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
Stripes:
- Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14950
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14949
Stream: column 0 section ROW_INDEX start: 3 length 17
Stream: column 1 section ROW_INDEX start: 20 length 166
Stream: column 2 section ROW_INDEX start: 186 length 169
@@ -47,12 +47,12 @@ Stripes:
Stream: column 2 section BLOOM_FILTER_UTF8 start: 6890 length 6046
Stream: column 3 section ROW_INDEX start: 12936 length 87
Stream: column 3 section BLOOM_FILTER start: 13023 length 1038
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 892
- Stream: column 1 section DATA start: 14953 length 20035
- Stream: column 2 section DATA start: 34988 length 40050
- Stream: column 3 section DATA start: 75038 length 3543
- Stream: column 3 section LENGTH start: 78581 length 25
- Stream: column 3 section DICTIONARY_DATA start: 78606 length 133
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 891
+ Stream: column 1 section DATA start: 14952 length 20035
+ Stream: column 2 section DATA start: 34987 length 40050
+ Stream: column 3 section DATA start: 75037 length 3543
+ Stream: column 3 section LENGTH start: 78580 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 78605 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -70,20 +70,20 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482
- Stripe: offset: 78843 data: 63775 rows: 5000 tail: 103 index: 14941
- Stream: column 0 section ROW_INDEX start: 78843 length 17
- Stream: column 1 section ROW_INDEX start: 78860 length 164
- Stream: column 2 section ROW_INDEX start: 79024 length 168
- Stream: column 2 section BLOOM_FILTER start: 79192 length 6533
- Stream: column 2 section BLOOM_FILTER_UTF8 start: 85725 length 6046
- Stream: column 3 section ROW_INDEX start: 91771 length 83
- Stream: column 3 section BLOOM_FILTER start: 91854 length 1038
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 92892 length 892
- Stream: column 1 section DATA start: 93784 length 20035
- Stream: column 2 section DATA start: 113819 length 40050
- Stream: column 3 section DATA start: 153869 length 3532
- Stream: column 3 section LENGTH start: 157401 length 25
- Stream: column 3 section DICTIONARY_DATA start: 157426 length 133
+ Stripe: offset: 78842 data: 63775 rows: 5000 tail: 103 index: 14940
+ Stream: column 0 section ROW_INDEX start: 78842 length 17
+ Stream: column 1 section ROW_INDEX start: 78859 length 164
+ Stream: column 2 section ROW_INDEX start: 79023 length 168
+ Stream: column 2 section BLOOM_FILTER start: 79191 length 6533
+ Stream: column 2 section BLOOM_FILTER_UTF8 start: 85724 length 6046
+ Stream: column 3 section ROW_INDEX start: 91770 length 83
+ Stream: column 3 section BLOOM_FILTER start: 91853 length 1038
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 92891 length 891
+ Stream: column 1 section DATA start: 93782 length 20035
+ Stream: column 2 section DATA start: 113817 length 40050
+ Stream: column 3 section DATA start: 153867 length 3532
+ Stream: column 3 section LENGTH start: 157399 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 157424 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -101,20 +101,20 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205
- Stripe: offset: 157662 data: 63787 rows: 5000 tail: 104 index: 14947
- Stream: column 0 section ROW_INDEX start: 157662 length 17
- Stream: column 1 section ROW_INDEX start: 157679 length 163
- Stream: column 2 section ROW_INDEX start: 157842 length 168
- Stream: column 2 section BLOOM_FILTER start: 158010 length 6533
- Stream: column 2 section BLOOM_FILTER_UTF8 start: 164543 length 6046
- Stream: column 3 section ROW_INDEX start: 170589 length 90
- Stream: column 3 section BLOOM_FILTER start: 170679 length 1038
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 171717 length 892
- Stream: column 1 section DATA start: 172609 length 20035
- Stream: column 2 section DATA start: 192644 length 40050
- Stream: column 3 section DATA start: 232694 length 3544
- Stream: column 3 section LENGTH start: 236238 length 25
- Stream: column 3 section DICTIONARY_DATA start: 236263 length 133
+ Stripe: offset: 157660 data: 63787 rows: 5000 tail: 104 index: 14946
+ Stream: column 0 section ROW_INDEX start: 157660 length 17
+ Stream: column 1 section ROW_INDEX start: 157677 length 163
+ Stream: column 2 section ROW_INDEX start: 157840 length 168
+ Stream: column 2 section BLOOM_FILTER start: 158008 length 6533
+ Stream: column 2 section BLOOM_FILTER_UTF8 start: 164541 length 6046
+ Stream: column 3 section ROW_INDEX start: 170587 length 90
+ Stream: column 3 section BLOOM_FILTER start: 170677 length 1038
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 171715 length 891
+ Stream: column 1 section DATA start: 172606 length 20035
+ Stream: column 2 section DATA start: 192641 length 40050
+ Stream: column 3 section DATA start: 232691 length 3544
+ Stream: column 3 section LENGTH start: 236235 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 236260 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -132,20 +132,20 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444
- Stripe: offset: 236500 data: 63817 rows: 5000 tail: 103 index: 14940
- Stream: column 0 section ROW_INDEX start: 236500 length 17
- Stream: column 1 section ROW_INDEX start: 236517 length 165
- Stream: column 2 section ROW_INDEX start: 236682 length 167
- Stream: column 2 section BLOOM_FILTER start: 236849 length 6524
- Stream: column 2 section BLOOM_FILTER_UTF8 start: 243373 length 6046
- Stream: column 3 section ROW_INDEX start: 249419 length 91
- Stream: column 3 section BLOOM_FILTER start: 249510 length 1038
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 250548 length 892
- Stream: column 1 section DATA start: 251440 length 20035
- Stream: column 2 section DATA start: 271475 length 40050
- Stream: column 3 section DATA start: 311525 length 3574
- Stream: column 3 section LENGTH start: 315099 length 25
- Stream: column 3 section DICTIONARY_DATA start: 315124 length 133
+ Stripe: offset: 236497 data: 63817 rows: 5000 tail: 103 index: 14939
+ Stream: column 0 section ROW_INDEX start: 236497 length 17
+ Stream: column 1 section ROW_INDEX start: 236514 length 165
+ Stream: column 2 section ROW_INDEX start: 236679 length 167
+ Stream: column 2 section BLOOM_FILTER start: 236846 length 6524
+ Stream: column 2 section BLOOM_FILTER_UTF8 start: 243370 length 6046
+ Stream: column 3 section ROW_INDEX start: 249416 length 91
+ Stream: column 3 section BLOOM_FILTER start: 249507 length 1038
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 250545 length 891
+ Stream: column 1 section DATA start: 251436 length 20035
+ Stream: column 2 section DATA start: 271471 length 40050
+ Stream: column 3 section DATA start: 311521 length 3574
+ Stream: column 3 section LENGTH start: 315095 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 315120 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -163,15 +163,15 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165
- Stripe: offset: 315360 data: 12943 rows: 1000 tail: 96 index: 3542
- Stream: column 0 section ROW_INDEX start: 315360 length 12
- Stream: column 1 section ROW_INDEX start: 315372 length 38
- Stream: column 2 section ROW_INDEX start: 315410 length 41
- Stream: column 2 section BLOOM_FILTER start: 315451 length 1337
- Stream: column 2 section BLOOM_FILTER_UTF8 start: 316788 length 1211
- Stream: column 3 section ROW_INDEX start: 317999 length 40
- Stream: column 3 section BLOOM_FILTER start: 318039 length 472
- Stream: column 3 section BLOOM_FILTER_UTF8 start: 318511 length 391
+ Stripe: offset: 315356 data: 12943 rows: 1000 tail: 96 index: 3546
+ Stream: column 0 section ROW_INDEX start: 315356 length 12
+ Stream: column 1 section ROW_INDEX start: 315368 length 38
+ Stream: column 2 section ROW_INDEX start: 315406 length 41
+ Stream: column 2 section BLOOM_FILTER start: 315447 length 1337
+ Stream: column 2 section BLOOM_FILTER_UTF8 start: 316784 length 1211
+ Stream: column 3 section ROW_INDEX start: 317995 length 40
+ Stream: column 3 section BLOOM_FILTER start: 318035 length 472
+ Stream: column 3 section BLOOM_FILTER_UTF8 start: 318507 length 395
Stream: column 1 section DATA start: 318902 length 4007
Stream: column 2 section DATA start: 322909 length 8010
Stream: column 3 section DATA start: 330919 length 768
http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json
index 3dd0dc0..b3e9d12 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -254,9 +254,9 @@
"stripeNumber": 1,
"stripeInformation": {
"offset": 3,
- "indexLength": 762,
+ "indexLength": 768,
"dataLength": 63770,
- "footerLength": 89,
+ "footerLength": 88,
"rowCount": 5000
},
"streams": [
@@ -288,42 +288,42 @@
"columnId": 3,
"section": "BLOOM_FILTER_UTF8",
"startOffset": 461,
- "length": 304
+ "length": 310
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 765,
+ "startOffset": 771,
"length": 20035
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 20800,
+ "startOffset": 20806,
"length": 40050
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 60850,
+ "startOffset": 60856,
"length": 17
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 60867,
+ "startOffset": 60873,
"length": 3510
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 64377,
+ "startOffset": 64383,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 64402,
+ "startOffset": 64408,
"length": 133
}
],
@@ -494,8 +494,8 @@
{
"stripeNumber": 2,
"stripeInformation": {
- "offset": 64624,
- "indexLength": 753,
+ "offset": 64629,
+ "indexLength": 759,
"dataLength": 63763,
"footerLength": 87,
"rowCount": 5000
@@ -504,67 +504,67 @@
{
"columnId": 0,
"section": "ROW_INDEX",
- "startOffset": 64624,
+ "startOffset": 64629,
"length": 17
},
{
"columnId": 1,
"section": "ROW_INDEX",
- "startOffset": 64641,
+ "startOffset": 64646,
"length": 166
},
{
"columnId": 2,
"section": "ROW_INDEX",
- "startOffset": 64807,
+ "startOffset": 64812,
"length": 166
},
{
"columnId": 3,
"section": "ROW_INDEX",
- "startOffset": 64973,
+ "startOffset": 64978,
"length": 100
},
{
"columnId": 3,
"section": "BLOOM_FILTER_UTF8",
- "startOffset": 65073,
- "length": 304
+ "startOffset": 65078,
+ "length": 310
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 65377,
+ "startOffset": 65388,
"length": 20035
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 85412,
+ "startOffset": 85423,
"length": 40050
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 125462,
+ "startOffset": 125473,
"length": 17
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 125479,
+ "startOffset": 125490,
"length": 3503
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 128982,
+ "startOffset": 128993,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 129007,
+ "startOffset": 129018,
"length": 133
}
],
@@ -735,77 +735,77 @@
{
"stripeNumber": 3,
"stripeInformation": {
- "offset": 129227,
- "indexLength": 754,
+ "offset": 129238,
+ "indexLength": 760,
"dataLength": 63770,
- "footerLength": 89,
+ "footerLength": 88,
"rowCount": 5000
},
"streams": [
{
"columnId": 0,
"section": "ROW_INDEX",
- "startOffset": 129227,
+ "startOffset": 129238,
"length": 17
},
{
"columnId": 1,
"section": "ROW_INDEX",
- "startOffset": 129244,
+ "startOffset": 129255,
"length": 164
},
{
"columnId": 2,
"section": "ROW_INDEX",
- "startOffset": 129408,
+ "startOffset": 129419,
"length": 167
},
{
"columnId": 3,
"section": "ROW_INDEX",
- "startOffset": 129575,
+ "startOffset": 129586,
"length": 102
},
{
"columnId": 3,
"section": "BLOOM_FILTER_UTF8",
- "startOffset": 129677,
- "length": 304
+ "startOffset": 129688,
+ "length": 310
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 129981,
+ "startOffset": 129998,
"length": 20035
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 150016,
+ "startOffset": 150033,
"length": 40050
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 190066,
+ "startOffset": 190083,
"length": 17
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 190083,
+ "startOffset": 190100,
"length": 3510
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 193593,
+ "startOffset": 193610,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 193618,
+ "startOffset": 193635,
"length": 133
}
],
@@ -976,8 +976,8 @@
{
"stripeNumber": 4,
"stripeInformation": {
- "offset": 193840,
- "indexLength": 765,
+ "offset": 193856,
+ "indexLength": 771,
"dataLength": 63756,
"footerLength": 89,
"rowCount": 5000
@@ -986,67 +986,67 @@
{
"columnId": 0,
"section": "ROW_INDEX",
- "startOffset": 193840,
+ "startOffset": 193856,
"length": 17
},
{
"columnId": 1,
"section": "ROW_INDEX",
- "startOffset": 193857,
+ "startOffset": 193873,
"length": 166
},
{
"columnId": 2,
"section": "ROW_INDEX",
- "startOffset": 194023,
+ "startOffset": 194039,
"length": 171
},
{
"columnId": 3,
"section": "ROW_INDEX",
- "startOffset": 194194,
+ "startOffset": 194210,
"length": 107
},
{
"columnId": 3,
"section": "BLOOM_FILTER_UTF8",
- "startOffset": 194301,
- "length": 304
+ "startOffset": 194317,
+ "length": 310
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 194605,
+ "startOffset": 194627,
"length": 20035
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 214640,
+ "startOffset": 214662,
"length": 40050
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 254690,
+ "startOffset": 254712,
"length": 17
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 254707,
+ "startOffset": 254729,
"length": 3496
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 258203,
+ "startOffset": 258225,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 258228,
+ "startOffset": 258250,
"length": 133
}
],
@@ -1217,8 +1217,8 @@
{
"stripeNumber": 5,
"stripeInformation": {
- "offset": 258450,
- "indexLength": 383,
+ "offset": 258472,
+ "indexLength": 376,
"dataLength": 12943,
"footerLength": 83,
"rowCount": 1000
@@ -1227,67 +1227,67 @@
{
"columnId": 0,
"section": "ROW_INDEX",
- "startOffset": 258450,
+ "startOffset": 258472,
"length": 12
},
{
"columnId": 1,
"section": "ROW_INDEX",
- "startOffset": 258462,
+ "startOffset": 258484,
"length": 38
},
{
"columnId": 2,
"section": "ROW_INDEX",
- "startOffset": 258500,
+ "startOffset": 258522,
"length": 41
},
{
"columnId": 3,
"section": "ROW_INDEX",
- "startOffset": 258541,
+ "startOffset": 258563,
"length": 41
},
{
"columnId": 3,
"section": "BLOOM_FILTER_UTF8",
- "startOffset": 258582,
- "length": 251
+ "startOffset": 258604,
+ "length": 244
},
{
"columnId": 1,
"section": "DATA",
- "startOffset": 258833,
+ "startOffset": 258848,
"length": 4007
},
{
"columnId": 2,
"section": "DATA",
- "startOffset": 262840,
+ "startOffset": 262855,
"length": 8010
},
{
"columnId": 3,
"section": "PRESENT",
- "startOffset": 270850,
+ "startOffset": 270865,
"length": 16
},
{
"columnId": 3,
"section": "DATA",
- "startOffset": 270866,
+ "startOffset": 270881,
"length": 752
},
{
"columnId": 3,
"section": "LENGTH",
- "startOffset": 271618,
+ "startOffset": 271633,
"length": 25
},
{
"columnId": 3,
"section": "DICTIONARY_DATA",
- "startOffset": 271643,
+ "startOffset": 271658,
"length": 133
}
],
@@ -1348,7 +1348,7 @@
}]
}
],
- "fileLength": 272409,
+ "fileLength": 272428,
"paddingLength": 0,
"paddingRatio": 0,
"status": "OK"
http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_data/releases.yml
----------------------------------------------------------------------
diff --git a/site/_data/releases.yml b/site/_data/releases.yml
index 3331688..1282115 100644
--- a/site/_data/releases.yml
+++ b/site/_data/releases.yml
@@ -9,6 +9,7 @@
sha256: 5c394c7ed3a31d20726ded55ed9c5a0eeff1bd5b85b1cb2ee6c3c1a94560578c
known-issues:
ORC-40: Predicate push down is not implemented in C++.
+ ORC-101: Bloom filters for string and decimal use inconsistent encoding
1.1.2:
date: 2016-07-08
@@ -19,6 +20,7 @@
known-issues:
HIVE-14214: Schema evolution and predicate pushdown don't work together.
ORC-40: Predicate push down is not implemented in C++.
+ ORC-101: Bloom filters for string and decimal use inconsistent encoding
1.1.1:
date: 2016-06-13
@@ -29,6 +31,7 @@
known-issues:
HIVE-14214: Schema evolution and predicate pushdown don't work together.
ORC-40: Predicate push down is not implemented in C++.
+ ORC-101: Bloom filters for string and decimal use inconsistent encoding
1.1.0:
date: 2016-06-10
@@ -39,6 +42,7 @@
known-issues:
HIVE-14214: Schema evolution and predicate pushdown don't work together.
ORC-40: Predicate push down is not implemented in C++.
+ ORC-101: Bloom filters for string and decimal use inconsistent encoding
1.0.0:
date: 2016-01-25
http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_docs/spec-index.md
----------------------------------------------------------------------
diff --git a/site/_docs/spec-index.md b/site/_docs/spec-index.md
index 009df59..263c9a8 100644
--- a/site/_docs/spec-index.md
+++ b/site/_docs/spec-index.md
@@ -57,14 +57,17 @@ group (default to 10,000 rows) in a column. Only the row groups that
satisfy min/max row index evaluation will be evaluated against the
bloom filter index.
-Each BloomFilterEntry stores the number of hash functions ('k') used and
-the bitset backing the bloom filter. The bitset is serialized as repeated
-longs from which the number of bits ('m') for the bloom filter can be derived.
-m = bitset.length * 64.
+Each BloomFilterEntry stores the number of hash functions ('k') used
+and the bitset backing the bloom filter. The original encoding (pre
+ORC-101) of bloom filters stored the bits as a repeating sequence of
+longs in the bitset field with a little endian encoding (0x1 is bit 0
+and 0x2 is bit 1). After ORC-101, the encoding is a sequence of bytes
+with a little endian encoding in the utf8bitset field.
```message BloomFilter {
optional uint32 numHashFunctions = 1;
repeated fixed64 bitset = 2;
+ optional bytes utf8bitset = 3;
}
```
http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_docs/stripes.md
----------------------------------------------------------------------
diff --git a/site/_docs/stripes.md b/site/_docs/stripes.md
index d53f709..cc85feb 100644
--- a/site/_docs/stripes.md
+++ b/site/_docs/stripes.md
@@ -56,6 +56,10 @@ depends on the type and encoding of the column.
SECONDARY = 5;
// the index for seeking to particular row groups
ROW_INDEX = 6;
+ // original bloom filters used before ORC-101
+ BLOOM_FILTER = 7;
+ // bloom filters that consistently use utf8
+ BLOOM_FILTER_UTF8 = 8;
}
required Kind kind = 1;
// the column id
[3/4] orc git commit: ORC-101 Correct bloom filters for strings and
decimals to use utf8 encoding.
Posted by om...@apache.org.
ORC-101 Correct bloom filters for strings and decimals to use utf8 encoding.
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/9d39cb80
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/9d39cb80
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/9d39cb80
Branch: refs/heads/master
Commit: 9d39cb80f455f7c341bd4a9421651badb1d137f3
Parents: 7118e96
Author: Owen O'Malley <om...@apache.org>
Authored: Tue Sep 13 13:28:44 2016 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue Sep 20 15:12:57 2016 -0500
----------------------------------------------------------------------
c++/include/orc/Reader.hh | 1 +
c++/src/Reader.cc | 2 +
.../src/java/org/apache/orc/BloomFilterIO.java | 50 --
.../src/java/org/apache/orc/DataReader.java | 7 +-
java/core/src/java/org/apache/orc/OrcConf.java | 10 +
java/core/src/java/org/apache/orc/OrcFile.java | 51 +-
.../java/org/apache/orc/TypeDescription.java | 26 +
.../orc/impl/ConvertTreeReaderFactory.java | 12 +-
.../src/java/org/apache/orc/impl/OrcIndex.java | 10 +-
.../org/apache/orc/impl/RecordReaderImpl.java | 70 ++-
.../org/apache/orc/impl/RecordReaderUtils.java | 192 ++++++--
.../org/apache/orc/impl/SchemaEvolution.java | 4 +
.../java/org/apache/orc/impl/StreamName.java | 1 +
.../java/org/apache/orc/impl/WriterImpl.java | 228 +++++++--
.../java/org/apache/orc/util/BloomFilter.java | 312 ++++++++++++
.../java/org/apache/orc/util/BloomFilterIO.java | 93 ++++
.../org/apache/orc/util/BloomFilterUtf8.java | 55 +++
.../test/org/apache/orc/TestVectorOrcFile.java | 4 +-
.../apache/orc/impl/TestRecordReaderImpl.java | 484 +++++++++++++------
.../test/org/apache/orc/util/TestMurmur3.java | 225 +++++++++
java/core/src/test/resources/log4j.properties | 3 +
.../src/test/resources/log4j.properties | 3 +
.../apache/hive/common/util/BloomFilter.java | 313 ------------
.../org/apache/hive/common/util/Murmur3.java | 335 -------------
.../src/java/org/apache/orc/util/Murmur3.java | 335 +++++++++++++
.../apache/hive/common/util/TestMurmur3.java | 224 ---------
.../src/java/org/apache/orc/tools/FileDump.java | 20 +-
.../java/org/apache/orc/tools/JsonFileDump.java | 27 +-
.../test/org/apache/orc/tools/TestFileDump.java | 6 +-
java/tools/src/test/resources/log4j.properties | 21 +
.../resources/orc-file-dump-bloomfilter.out | 106 ++--
.../resources/orc-file-dump-bloomfilter2.out | 121 +++--
.../orc-file-dump-dictionary-threshold.out | 2 +-
.../tools/src/test/resources/orc-file-dump.json | 150 +++---
java/tools/src/test/resources/orc-file-dump.out | 2 +-
.../src/test/resources/orc-file-has-null.out | 2 +-
proto/orc_proto.proto | 2 +
37 files changed, 2115 insertions(+), 1394 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/c++/include/orc/Reader.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 25a0a17..eacbd80 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -53,6 +53,7 @@ namespace orc {
WriterVersion_HIVE_4243 = 2,
WriterVersion_HIVE_12055 = 3,
WriterVersion_HIVE_13083 = 4,
+ WriterVersion_ORC_101 = 5,
WriterVersion_MAX = INT64_MAX
};
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/c++/src/Reader.cc
----------------------------------------------------------------------
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 9b1f1b9..91f4ea1 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -72,6 +72,8 @@ namespace orc {
return "HIVE-12055";
case WriterVersion_HIVE_13083:
return "HIVE-13083";
+ case WriterVersion_ORC_101:
+ return "ORC-101";
}
std::stringstream buffer;
buffer << "future - " << version;
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/BloomFilterIO.java b/java/core/src/java/org/apache/orc/BloomFilterIO.java
deleted file mode 100644
index 106227d..0000000
--- a/java/core/src/java/org/apache/orc/BloomFilterIO.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import org.apache.hive.common.util.BloomFilter;
-
-public class BloomFilterIO extends BloomFilter {
-
- public BloomFilterIO(long expectedEntries) {
- super(expectedEntries, DEFAULT_FPP);
- }
-
- public BloomFilterIO(long expectedEntries, double fpp) {
- super(expectedEntries, fpp);
- }
-
- static long[] toArray(OrcProto.BloomFilter filter) {
- long[] result = new long[filter.getBitsetCount()];
- int i =0;
- for(Long l: filter.getBitsetList()) {
- result[i++] = l;
- }
- return result;
- }
-
-/**
- * Initializes the BloomFilter from the given Orc BloomFilter
- */
- public BloomFilterIO(OrcProto.BloomFilter bloomFilter) {
- this.bitSet = new BitSet(toArray(bloomFilter));
- this.numHashFunctions = bloomFilter.getNumHashFunctions();
- this.numBits = (int) this.bitSet.bitSize();
- }
-}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/DataReader.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/DataReader.java b/java/core/src/java/org/apache/orc/DataReader.java
index a5dbb76..b3f91f2 100644
--- a/java/core/src/java/org/apache/orc/DataReader.java
+++ b/java/core/src/java/org/apache/orc/DataReader.java
@@ -31,9 +31,14 @@ public interface DataReader extends AutoCloseable {
void open() throws IOException;
OrcIndex readRowIndex(StripeInformation stripe,
+ TypeDescription fileSchema,
OrcProto.StripeFooter footer,
- boolean[] included, OrcProto.RowIndex[] indexes,
+ boolean ignoreNonUtf8BloomFilter,
+ boolean[] included,
+ OrcProto.RowIndex[] indexes,
boolean[] sargColumns,
+ OrcFile.WriterVersion version,
+ OrcProto.Stream.Kind[] bloomFilterKinds,
OrcProto.BloomFilterIndex[] bloomFilterIndices
) throws IOException;
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/OrcConf.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java
index ac8e3f0..05ab13b 100644
--- a/java/core/src/java/org/apache/orc/OrcConf.java
+++ b/java/core/src/java/org/apache/orc/OrcConf.java
@@ -105,6 +105,16 @@ public enum OrcConf {
"dictionary or not will be retained thereafter."),
BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
"", "List of columns to create bloom filters for when writing."),
+ BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version",
+ "orc.bloom.filter.write.version", OrcFile.BloomFilterVersion.UTF8.toString(),
+ "Which version of the bloom filters should we write.\n" +
+ "The choices are:\n" +
+ " original - writes two versions of the bloom filters for use by\n" +
+ " both old and new readers.\n" +
+ " utf8 - writes just the new bloom filters."),
+ IGNORE_NON_UTF8_BLOOM_FILTERS("orc.bloom.filter.ignore.non-utf8",
+ "orc.bloom.filter.ignore.non-utf8", false,
+ "Should the reader ignore the obsolete non-UTF8 bloom filters."),
MAX_FILE_LENGTH("orc.max.file.length", "orc.max.file.length", Long.MAX_VALUE,
"The maximum size of the file to read for finding the file tail. This\n" +
"is primarily used for streaming ingest to read intermediate\n" +
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java
index ddfa9f7..6b2d48e 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -108,6 +108,7 @@ public class OrcFile {
HIVE_4243(2), // use real column names from Hive tables
HIVE_12055(3), // vectorized writer
HIVE_13083(4), // decimal writer updating present stream wrongly
+ ORC_101(5), // bloom filters use utf8
// Don't use any magic numbers here except for the below:
FUTURE(Integer.MAX_VALUE); // a version from a future writer
@@ -144,8 +145,12 @@ public class OrcFile {
if (val == FUTURE.id) return FUTURE; // Special handling for the magic value.
return values[val];
}
+
+ public boolean includes(WriterVersion other) {
+ return id >= other.id;
+ }
}
- public static final WriterVersion CURRENT_WRITER = WriterVersion.HIVE_13083;
+ public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_101;
public enum EncodingStrategy {
SPEED, COMPRESSION
@@ -231,6 +236,33 @@ public class OrcFile {
void preFooterWrite(WriterContext context) throws IOException;
}
+ /**
+ * The bloom filter stream layouts that a writer can be asked to produce.
+ * Each constant carries the string id returned by toString() and
+ * accepted by fromString(String).
+ */
+ public enum BloomFilterVersion {
+ // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support
+ // both old and new readers.
+ ORIGINAL("original"),
+ // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8.
+ // See ORC-101
+ UTF8("utf8");
+
+ private final String id;
+ BloomFilterVersion(String id) {
+ this.id = id;
+ }
+
+ @Override
+ public String toString() {
+ return id;
+ }
+
+ /**
+ * Find the version whose id exactly matches the given string.
+ * @param s the id to look up
+ * @return the matching version
+ * @throws IllegalArgumentException if no version has that id
+ */
+ public static BloomFilterVersion fromString(String s) {
+ for (BloomFilterVersion version: values()) {
+ if (version.id.equals(s)) {
+ return version;
+ }
+ }
+ throw new IllegalArgumentException("Unknown BloomFilterVersion " + s);
+ }
+ }
+
/**
* Options for creating ORC file writers.
*/
@@ -253,6 +285,7 @@ public class OrcFile {
private double paddingTolerance;
private String bloomFilterColumns;
private double bloomFilterFpp;
+ private BloomFilterVersion bloomFilterVersion;
protected WriterOptions(Properties tableProperties, Configuration conf) {
configuration = conf;
@@ -286,6 +319,10 @@ public class OrcFile {
conf);
bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties,
conf);
+ bloomFilterVersion =
+ BloomFilterVersion.fromString(
+ OrcConf.BLOOM_FILTER_WRITE_VERSION.getString(tableProperties,
+ conf));
}
/**
@@ -430,6 +467,14 @@ public class OrcFile {
}
/**
+ * Set the version of the bloom filters to write.
+ */
+ public WriterOptions bloomFilterVersion(BloomFilterVersion version) {
+ this.bloomFilterVersion = version;
+ return this;
+ }
+
+ /**
* A package local option to set the memory manager.
*/
protected WriterOptions memory(MemoryManager value) {
@@ -508,6 +553,10 @@ public class OrcFile {
public double getBloomFilterFpp() {
return bloomFilterFpp;
}
+
+ public BloomFilterVersion getBloomFilterVersion() {
+ return bloomFilterVersion;
+ }
}
/**
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index da9fe49..bc6787d 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -842,4 +842,30 @@ public class TypeDescription
printJsonToBuffer("", buffer, 0);
return buffer.toString();
}
+
+ /**
+ * Locate a subtype by its id, recursing into the child that holds it.
+ * @param goal the column id to look for
+ * @return the subtype
+ * @throws IllegalArgumentException if goal is not in this type's id range
+ */
+ public TypeDescription findSubtype(int goal) {
+ // call getId method to make sure the ids are assigned
+ int id = getId();
+ if (goal < id || goal > maxId) {
+ throw new IllegalArgumentException("Unknown type id " + goal + " in " +
+ toJson());
+ }
+ if (goal == id) {
+ return this;
+ }
+ TypeDescription prev = null;
+ for(TypeDescription next: children) {
+ if (next.id > goal) {
+ break; // prev is the last child whose id does not exceed goal
+ }
+ prev = next;
+ }
+ return prev.findSubtype(goal);
+ }
}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
index 36b9a20..20e0faa 100644
--- a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
@@ -1408,7 +1408,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
public void setConvertVectorElement(int elementNum) {
long longValue = longColVector.vector[elementNum];
String string = anyIntegerAsLongTreeReader.getString(longValue);
- byte[] bytes = string.getBytes();
+ byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
}
@@ -1450,7 +1450,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
float floatValue = (float) doubleColVector.vector[elementNum];
if (!Float.isNaN(floatValue)) {
String string = String.valueOf(floatValue);
- byte[] bytes = string.getBytes();
+ byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
} else {
bytesColVector.noNulls = false;
@@ -1495,7 +1495,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
double doubleValue = doubleColVector.vector[elementNum];
if (!Double.isNaN(doubleValue)) {
String string = String.valueOf(doubleValue);
- byte[] bytes = string.getBytes();
+ byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
} else {
bytesColVector.noNulls = false;
@@ -1544,7 +1544,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
@Override
public void setConvertVectorElement(int elementNum) {
String string = decimalColVector.vector[elementNum].getHiveDecimal().toString();
- byte[] bytes = string.getBytes();
+ byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
}
@@ -1584,7 +1584,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
public void setConvertVectorElement(int elementNum) throws IOException {
String string =
timestampColVector.asScratchTimestamp(elementNum).toString();
- byte[] bytes = string.getBytes();
+ byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
}
@@ -1626,7 +1626,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
public void setConvertVectorElement(int elementNum) throws IOException {
date.setTime(DateWritable.daysToMillis((int) longColVector.vector[elementNum]));
String string = date.toString();
- byte[] bytes = string.getBytes();
+ byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/OrcIndex.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/OrcIndex.java b/java/core/src/java/org/apache/orc/impl/OrcIndex.java
index 50a15f2..edcb3ba 100644
--- a/java/core/src/java/org/apache/orc/impl/OrcIndex.java
+++ b/java/core/src/java/org/apache/orc/impl/OrcIndex.java
@@ -22,10 +22,14 @@ import org.apache.orc.OrcProto;
public final class OrcIndex {
OrcProto.RowIndex[] rowGroupIndex;
+ OrcProto.Stream.Kind[] bloomFilterKinds;
OrcProto.BloomFilterIndex[] bloomFilterIndex;
- public OrcIndex(OrcProto.RowIndex[] rgIndex, OrcProto.BloomFilterIndex[] bfIndex) {
+ public OrcIndex(OrcProto.RowIndex[] rgIndex,
+ OrcProto.Stream.Kind[] bloomFilterKinds,
+ OrcProto.BloomFilterIndex[] bfIndex) {
this.rowGroupIndex = rgIndex;
+ this.bloomFilterKinds = bloomFilterKinds;
this.bloomFilterIndex = bfIndex;
}
@@ -37,6 +41,10 @@ public final class OrcIndex {
return bloomFilterIndex;
}
+ public OrcProto.Stream.Kind[] getBloomFilterKinds() {
+ return bloomFilterKinds;
+ }
+
public void setRowGroupIndex(OrcProto.RowIndex[] rowGroupIndex) {
this.rowGroupIndex = rowGroupIndex;
}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index e8ad54d..c7ce2bb 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -27,7 +27,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.OrcFile;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
import org.apache.orc.BooleanColumnStatistics;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.CompressionCodec;
@@ -88,10 +90,13 @@ public class RecordReaderImpl implements RecordReader {
private final TreeReaderFactory.TreeReader reader;
private final OrcProto.RowIndex[] indexes;
private final OrcProto.BloomFilterIndex[] bloomFilterIndices;
+ private final OrcProto.Stream.Kind[] bloomFilterKind;
private final SargApplier sargApp;
// an array about which row groups aren't skipped
private boolean[] includedRowGroups = null;
private final DataReader dataReader;
+ private final boolean ignoreNonUtf8BloomFilter;
+ private final OrcFile.WriterVersion writerVersion;
/**
* Given a list of column names, find the given column and return the index.
@@ -134,6 +139,7 @@ public class RecordReaderImpl implements RecordReader {
protected RecordReaderImpl(ReaderImpl fileReader,
Reader.Options options) throws IOException {
this.included = options.getInclude();
+ this.writerVersion = fileReader.getWriterVersion();
included[0] = true;
if (options.getSchema() == null) {
if (LOG.isInfoEnabled()) {
@@ -162,11 +168,14 @@ public class RecordReaderImpl implements RecordReader {
this.types = fileReader.types;
this.bufferSize = fileReader.bufferSize;
this.rowIndexStride = fileReader.rowIndexStride;
+ this.ignoreNonUtf8BloomFilter =
+ OrcConf.IGNORE_NON_UTF8_BLOOM_FILTERS.getBoolean(fileReader.conf);
SearchArgument sarg = options.getSearchArgument();
if (sarg != null && rowIndexStride != 0) {
sargApp = new SargApplier(sarg, options.getColumnNames(),
rowIndexStride,
- included.length, evolution);
+ included.length, evolution,
+ writerVersion);
} else {
sargApp = null;
}
@@ -218,6 +227,7 @@ public class RecordReaderImpl implements RecordReader {
writerIncluded = evolution.getFileIncluded();
indexes = new OrcProto.RowIndex[types.size()];
bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
+ bloomFilterKind = new OrcProto.Stream.Kind[types.size()];
advanceToNextRow(reader, 0L, true);
}
@@ -339,20 +349,23 @@ public class RecordReaderImpl implements RecordReader {
* that is referenced in the predicate.
* @param statsProto the statistics for the column mentioned in the predicate
* @param predicate the leaf predicate we need to evaluation
- * @param bloomFilter
+ * @param bloomFilter the bloom filter
+ * @param writerVersion the version of software that wrote the file
+ * @param type the category of this column's type
* @return the set of truth values that may be returned for the given
* predicate.
*/
static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
- PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) {
+ PredicateLeaf predicate,
+ OrcProto.Stream.Kind kind,
+ OrcProto.BloomFilter bloomFilter,
+ OrcFile.WriterVersion writerVersion,
+ TypeDescription.Category type) {
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto);
Object minValue = getMin(cs);
Object maxValue = getMax(cs);
- BloomFilterIO bf = null;
- if (bloomFilter != null) {
- bf = new BloomFilterIO(bloomFilter);
- }
- return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf);
+ return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(),
+ BloomFilterIO.deserialize(kind, writerVersion, type, bloomFilter));
}
/**
@@ -365,14 +378,14 @@ public class RecordReaderImpl implements RecordReader {
*/
public static TruthValue evaluatePredicate(ColumnStatistics stats,
PredicateLeaf predicate,
- BloomFilterIO bloomFilter) {
+ BloomFilter bloomFilter) {
Object minValue = getMin(stats);
Object maxValue = getMax(stats);
return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter);
}
static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min,
- Object max, boolean hasNull, BloomFilterIO bloomFilter) {
+ Object max, boolean hasNull, BloomFilter bloomFilter) {
// if we didn't have any values, everything must have been null
if (min == null) {
if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) {
@@ -421,7 +434,7 @@ public class RecordReaderImpl implements RecordReader {
}
private static boolean shouldEvaluateBloomFilter(PredicateLeaf predicate,
- TruthValue result, BloomFilterIO bloomFilter) {
+ TruthValue result, BloomFilter bloomFilter) {
// evaluate bloom filter only when
// 1) Bloom filter is available
// 2) Min/Max evaluation yield YES or MAYBE
@@ -531,7 +544,7 @@ public class RecordReaderImpl implements RecordReader {
}
private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate,
- final Object predObj, BloomFilterIO bloomFilter, boolean hasNull) {
+ final Object predObj, BloomFilter bloomFilter, boolean hasNull) {
switch (predicate.getOperator()) {
case NULL_SAFE_EQUALS:
// null safe equals does not return *_NULL variant. So set hasNull to false
@@ -553,7 +566,7 @@ public class RecordReaderImpl implements RecordReader {
}
}
- private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) {
+ private static TruthValue checkInBloomFilter(BloomFilter bf, Object predObj, boolean hasNull) {
TruthValue result = hasNull ? TruthValue.NO_NULL : TruthValue.NO;
if (predObj instanceof Long) {
@@ -708,6 +721,7 @@ public class RecordReaderImpl implements RecordReader {
public final static boolean[] READ_ALL_RGS = null;
public final static boolean[] READ_NO_RGS = new boolean[0];
+ private final OrcFile.WriterVersion writerVersion;
private final SearchArgument sarg;
private final List<PredicateLeaf> sargLeaves;
private final int[] filterColumns;
@@ -716,10 +730,13 @@ public class RecordReaderImpl implements RecordReader {
private final boolean[] sargColumns;
private SchemaEvolution evolution;
- public SargApplier(SearchArgument sarg, String[] columnNames,
+ public SargApplier(SearchArgument sarg,
+ String[] columnNames,
long rowIndexStride,
int includedCount,
- SchemaEvolution evolution) {
+ SchemaEvolution evolution,
+ OrcFile.WriterVersion writerVersion) {
+ this.writerVersion = writerVersion;
this.sarg = sarg;
sargLeaves = sarg.getLeaves();
filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves,
@@ -745,8 +762,11 @@ public class RecordReaderImpl implements RecordReader {
* row groups must be read.
* @throws IOException
*/
- public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes,
- OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException {
+ public boolean[] pickRowGroups(StripeInformation stripe,
+ OrcProto.RowIndex[] indexes,
+ OrcProto.Stream.Kind[] bloomFilterKinds,
+ OrcProto.BloomFilterIndex[] bloomFilterIndices,
+ boolean returnNone) throws IOException {
long rowsInStripe = stripe.getNumberOfRows();
int groupsInStripe = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
boolean[] result = new boolean[groupsInStripe]; // TODO: avoid alloc?
@@ -765,11 +785,15 @@ public class RecordReaderImpl implements RecordReader {
}
OrcProto.ColumnStatistics stats = entry.getStatistics();
OrcProto.BloomFilter bf = null;
+ OrcProto.Stream.Kind bfk = null;
if (bloomFilterIndices != null && bloomFilterIndices[columnIx] != null) {
+ bfk = bloomFilterKinds[columnIx];
bf = bloomFilterIndices[columnIx].getBloomFilter(rowGroup);
}
if (evolution != null && evolution.isPPDSafeConversion(columnIx)) {
- leafValues[pred] = evaluatePredicateProto(stats, sargLeaves.get(pred), bf);
+ leafValues[pred] = evaluatePredicateProto(stats,
+ sargLeaves.get(pred), bfk, bf, writerVersion,
+ evolution.getFileSchema().findSubtype(columnIx).getCategory());
} else {
leafValues[pred] = TruthValue.YES_NO_NULL;
}
@@ -809,7 +833,8 @@ public class RecordReaderImpl implements RecordReader {
return null;
}
readRowIndex(currentStripe, writerIncluded, sargApp.sargColumns);
- return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, bloomFilterIndices, false);
+ return sargApp.pickRowGroups(stripes.get(currentStripe), indexes,
+ bloomFilterKind, bloomFilterIndices, false);
}
private void clearStreams() {
@@ -1168,8 +1193,9 @@ public class RecordReaderImpl implements RecordReader {
sargColumns = sargColumns == null ?
(sargApp == null ? null : sargApp.sargColumns) : sargColumns;
}
- return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, sargColumns,
- bloomFilterIndex);
+ return dataReader.readRowIndex(stripe, evolution.getFileType(0), stripeFooter,
+ ignoreNonUtf8BloomFilter, included, indexes, sargColumns, writerVersion,
+ bloomFilterKind, bloomFilterIndex);
}
private void seekToRowEntry(TreeReaderFactory.TreeReader reader, int rowEntry)
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java
index 3d57732..cadee35 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java
@@ -30,13 +30,13 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.io.DiskRange;
import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
-import org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper;
import org.apache.orc.CompressionCodec;
import org.apache.orc.DataReader;
+import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
/**
* Stateless methods shared between RecordReaderImpl and EncodedReaderImpl.
@@ -44,6 +44,100 @@ import org.apache.orc.StripeInformation;
public class RecordReaderUtils {
private static final HadoopShims SHIMS = HadoopShims.Factory.get();
+ static boolean hadBadBloomFilters(TypeDescription.Category category,
+ OrcFile.WriterVersion version) {
+ switch(category) {
+ case STRING:
+ case CHAR:
+ case VARCHAR: // text types: bad unless the writer version includes HIVE_12055
+ return !version.includes(OrcFile.WriterVersion.HIVE_12055);
+ case DECIMAL:
+ return true; // decimal: reported as bad for every writer version
+ default: // every other category is reported as fine
+ return false;
+ }
+ }
+
+ /**
+ * Plans the list of disk ranges that must be read to load the indexes of
+ * a stripe. All of the positions are relative to the start of the stripe.
+ * @param fileSchema the schema for the file
+ * @param footer the stripe footer
+ * @param ignoreNonUtf8BloomFilter should the reader ignore non-utf8
+ * encoded bloom filters
+ * @param fileIncluded the columns (indexed by file columns) that should be
+ * read, or null to consider every column
+ * @param sargColumns true for the columns (indexed by file columns) that
+ * we need bloom filters for, or null to skip choosing bloom filters
+ * @param version the version of the software that wrote the file
+ * @param bloomFilterKinds (output) the bloom filter stream kind per column
+ * @return a list of merged disk ranges to read
+ */
+ static DiskRangeList planIndexReading(TypeDescription fileSchema,
+ OrcProto.StripeFooter footer,
+ boolean ignoreNonUtf8BloomFilter,
+ boolean[] fileIncluded,
+ boolean[] sargColumns,
+ OrcFile.WriterVersion version,
+ OrcProto.Stream.Kind[] bloomFilterKinds) {
+ DiskRangeList.CreateHelper result = new DiskRangeList.CreateHelper();
+ List<OrcProto.Stream> streams = footer.getStreamsList();
+ // first pass: figure out which kind of bloom filter we want for each column
+ // picks bloom_filter_utf8 if it is available, otherwise bloom_filter
+ if (sargColumns != null) {
+ for (OrcProto.Stream stream : streams) {
+ if (stream.hasKind() && stream.hasColumn()) {
+ int column = stream.getColumn();
+ if (sargColumns[column]) {
+ switch (stream.getKind()) {
+ case BLOOM_FILTER:
+ if (bloomFilterKinds[column] == null &&
+ !(ignoreNonUtf8BloomFilter &&
+ hadBadBloomFilters(fileSchema.findSubtype(column)
+ .getCategory(), version))) {
+ bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER;
+ }
+ break;
+ case BLOOM_FILTER_UTF8:
+ bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER_UTF8;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+ }
+ long offset = 0; // start of the current stream, relative to the stripe
+ for(OrcProto.Stream stream: footer.getStreamsList()) {
+ if (stream.hasKind() && stream.hasColumn()) {
+ int column = stream.getColumn();
+ if (fileIncluded == null || fileIncluded[column]) {
+ boolean needStream = false;
+ switch (stream.getKind()) {
+ case ROW_INDEX:
+ needStream = true;
+ break;
+ case BLOOM_FILTER:
+ needStream = bloomFilterKinds[column] == OrcProto.Stream.Kind.BLOOM_FILTER;
+ break;
+ case BLOOM_FILTER_UTF8:
+ needStream = bloomFilterKinds[column] == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8;
+ break;
+ default:
+ // any other stream kind is left out of the plan
+ break;
+ }
+ if (needStream) {
+ result.addOrMerge(offset, offset + stream.getLength(), true, false);
+ }
+ }
+ }
+ offset += stream.getLength(); // advance past every stream, planned or not
+ }
+ return result.get();
+ }
+
private static class DefaultDataReader implements DataReader {
private FSDataInputStream file = null;
private final ByteBufferAllocatorPool pool;
@@ -91,10 +185,14 @@ public class RecordReaderUtils {
@Override
public OrcIndex readRowIndex(StripeInformation stripe,
+ TypeDescription fileSchema,
OrcProto.StripeFooter footer,
+ boolean ignoreNonUtf8BloomFilter,
boolean[] included,
OrcProto.RowIndex[] indexes,
boolean[] sargColumns,
+ OrcFile.WriterVersion version,
+ OrcProto.Stream.Kind[] bloomFilterKinds,
OrcProto.BloomFilterIndex[] bloomFilterIndices
) throws IOException {
if (file == null) {
@@ -106,49 +204,61 @@ public class RecordReaderUtils {
if (indexes == null) {
indexes = new OrcProto.RowIndex[typeCount];
}
+ if (bloomFilterKinds == null) {
+ bloomFilterKinds = new OrcProto.Stream.Kind[typeCount];
+ }
if (bloomFilterIndices == null) {
bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
}
- long offset = stripe.getOffset();
- List<OrcProto.Stream> streams = footer.getStreamsList();
- for (int i = 0; i < streams.size(); i++) {
- OrcProto.Stream stream = streams.get(i);
- OrcProto.Stream nextStream = null;
- if (i < streams.size() - 1) {
- nextStream = streams.get(i+1);
+ DiskRangeList ranges = planIndexReading(fileSchema, footer,
+ ignoreNonUtf8BloomFilter, included, sargColumns, version,
+ bloomFilterKinds);
+ ranges = readDiskRanges(file, zcr, stripe.getOffset(), ranges, false);
+ long offset = 0;
+ DiskRangeList range = ranges;
+ for(OrcProto.Stream stream: footer.getStreamsList()) {
+ // advance to find the next range
+ while (range != null && range.getEnd() <= offset) {
+ range = range.next;
}
- int col = stream.getColumn();
- int len = (int) stream.getLength();
- // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
- // filter and combine the io to read row index and bloom filters for that column together
- if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
- boolean readBloomFilter = false;
- if (sargColumns != null && sargColumns[col] &&
- nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
- len += nextStream.getLength();
- i += 1;
- readBloomFilter = true;
- }
- if ((included == null || included[col]) && indexes[col] == null) {
- byte[] buffer = new byte[len];
- file.readFully(offset, buffer, 0, buffer.length);
- ByteBuffer bb = ByteBuffer.wrap(buffer);
- indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
- ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
- codec, bufferSize));
- if (readBloomFilter) {
- bb.position((int) stream.getLength());
- bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
- "bloom_filter", ReaderImpl.singleton(new BufferChunk(bb, 0)),
- nextStream.getLength(), codec, bufferSize));
- }
+ // no more ranges, so we are done
+ if (range == null) {
+ break;
+ }
+ int column = stream.getColumn();
+ if (stream.hasKind() && range.getOffset() <= offset) {
+ switch (stream.getKind()) {
+ case ROW_INDEX:
+ if (included == null || included[column]) {
+ ByteBuffer bb = range.getData().duplicate();
+ bb.position((int) (offset - range.getOffset()));
+ bb.limit((int) (bb.position() + stream.getLength()));
+ indexes[column] = OrcProto.RowIndex.parseFrom(
+ InStream.createCodedInputStream("index",
+ ReaderImpl.singleton(new BufferChunk(bb, 0)),
+ stream.getLength(),
+ codec, bufferSize));
+ }
+ break;
+ case BLOOM_FILTER:
+ case BLOOM_FILTER_UTF8:
+ if (sargColumns != null && sargColumns[column]) {
+ ByteBuffer bb = range.getData().duplicate();
+ bb.position((int) (offset - range.getOffset()));
+ bb.limit((int) (bb.position() + stream.getLength()));
+ bloomFilterIndices[column] = OrcProto.BloomFilterIndex.parseFrom
+ (InStream.createCodedInputStream("bloom_filter",
+ ReaderImpl.singleton(new BufferChunk(bb, 0)),
+ stream.getLength(), codec, bufferSize));
+ }
+ break;
+ default:
+ break;
}
}
- offset += len;
+ offset += stream.getLength();
}
-
- OrcIndex index = new OrcIndex(indexes, bloomFilterIndices);
- return index;
+ return new OrcIndex(indexes, bloomFilterKinds, bloomFilterIndices);
}
@Override
@@ -234,14 +344,14 @@ public class RecordReaderUtils {
}
public static void addEntireStreamToRanges(
- long offset, long length, CreateHelper list, boolean doMergeBuffers) {
+ long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) {
list.addOrMerge(offset, offset + length, doMergeBuffers, false);
}
public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index,
OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, boolean hasNull,
- long offset, long length, CreateHelper list, boolean doMergeBuffers) {
+ long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) {
for (int group = 0; group < includedRowGroups.length; ++group) {
if (!includedRowGroups[group]) continue;
int posn = getIndexPosition(
@@ -399,7 +509,7 @@ public class RecordReaderUtils {
if (range == null) return null;
DiskRangeList prev = range.prev;
if (prev == null) {
- prev = new MutateHelper(range);
+ prev = new DiskRangeList.MutateHelper(range);
}
while (range != null) {
if (range.hasData()) {
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
index 1e11728..20adfd8 100644
--- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
+++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
@@ -153,6 +153,10 @@ public class SchemaEvolution {
return hasConversion;
}
+ public TypeDescription getFileSchema() {
+ return fileSchema;
+ }
+
public TypeDescription getFileType(TypeDescription readerType) {
return getFileType(readerType.getId());
}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/StreamName.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/StreamName.java b/java/core/src/java/org/apache/orc/impl/StreamName.java
index b3fd145..e3561bf 100644
--- a/java/core/src/java/org/apache/orc/impl/StreamName.java
+++ b/java/core/src/java/org/apache/orc/impl/StreamName.java
@@ -78,6 +78,7 @@ public class StreamName implements Comparable<StreamName> {
case ROW_INDEX:
case DICTIONARY_COUNT:
case BLOOM_FILTER:
+ case BLOOM_FILTER_UTF8:
return Area.INDEX;
default:
return Area.DATA;
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/WriterImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index 3df1b76..940ef59 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -21,6 +21,7 @@ package org.apache.orc.impl;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
@@ -34,12 +35,10 @@ import io.airlift.compress.lz4.Lz4Compressor;
import io.airlift.compress.lz4.Lz4Decompressor;
import io.airlift.compress.lzo.LzoCompressor;
import io.airlift.compress.lzo.LzoDecompressor;
-import io.airlift.compress.snappy.SnappyCompressor;
-import io.airlift.compress.snappy.SnappyDecompressor;
-import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.orc.BinaryColumnStatistics;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
import org.apache.orc.CompressionCodec;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcConf;
@@ -50,6 +49,7 @@ import org.apache.orc.StringColumnStatistics;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
+import org.apache.orc.util.BloomFilterUtf8;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -147,6 +147,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
private final OrcFile.CompressionStrategy compressionStrategy;
private final boolean[] bloomFilterColumns;
private final double bloomFilterFpp;
+ private final OrcFile.BloomFilterVersion bloomFilterVersion;
private boolean writeTimeZone;
public WriterImpl(FileSystem fs,
@@ -157,6 +158,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
this.conf = opts.getConfiguration();
this.callback = opts.getCallback();
this.schema = opts.getSchema();
+ bloomFilterVersion = opts.getBloomFilterVersion();
if (callback != null) {
callbackContext = new OrcFile.WriterContext(){
@@ -426,6 +428,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
case BLOOM_FILTER:
case DATA:
case DICTIONARY_DATA:
+ case BLOOM_FILTER_UTF8:
if (getCompressionStrategy() == OrcFile.CompressionStrategy.SPEED) {
modifiers = EnumSet.of(CompressionCodec.Modifier.FAST,
CompressionCodec.Modifier.TEXT);
@@ -543,6 +546,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
public boolean hasWriterTimeZone() {
return writeTimeZone;
}
+
+ public OrcFile.BloomFilterVersion getBloomFilterVersion() {
+ return bloomFilterVersion;
+ }
}
/**
@@ -564,9 +571,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
private final OrcProto.RowIndexEntry.Builder rowIndexEntry;
private final PositionedOutputStream rowIndexStream;
private final PositionedOutputStream bloomFilterStream;
- protected final BloomFilterIO bloomFilter;
+ private final PositionedOutputStream bloomFilterStreamUtf8;
+ protected final BloomFilter bloomFilter;
+ protected final BloomFilterUtf8 bloomFilterUtf8;
protected final boolean createBloomFilter;
private final OrcProto.BloomFilterIndex.Builder bloomFilterIndex;
+ private final OrcProto.BloomFilterIndex.Builder bloomFilterIndexUtf8;
private final OrcProto.BloomFilter.Builder bloomFilterEntry;
private boolean foundNulls;
private OutStream isPresentOutStream;
@@ -612,15 +622,30 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
}
if (createBloomFilter) {
bloomFilterEntry = OrcProto.BloomFilter.newBuilder();
- bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder();
- bloomFilterStream = streamFactory.createStream(id, OrcProto.Stream.Kind.BLOOM_FILTER);
- bloomFilter = new BloomFilterIO(streamFactory.getRowIndexStride(),
+ if (streamFactory.getBloomFilterVersion() == OrcFile.BloomFilterVersion.ORIGINAL) {
+ bloomFilter = new BloomFilter(streamFactory.getRowIndexStride(),
+ streamFactory.getBloomFilterFPP());
+ bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder();
+ bloomFilterStream = streamFactory.createStream(id,
+ OrcProto.Stream.Kind.BLOOM_FILTER);;
+ } else {
+ bloomFilter = null;
+ bloomFilterIndex = null;
+ bloomFilterStream = null;
+ }
+ bloomFilterUtf8 = new BloomFilterUtf8(streamFactory.getRowIndexStride(),
streamFactory.getBloomFilterFPP());
+ bloomFilterIndexUtf8 = OrcProto.BloomFilterIndex.newBuilder();
+ bloomFilterStreamUtf8 = streamFactory.createStream(id,
+ OrcProto.Stream.Kind.BLOOM_FILTER_UTF8);;
} else {
bloomFilterEntry = null;
bloomFilterIndex = null;
+ bloomFilterIndexUtf8 = null;
+ bloomFilterStreamUtf8 = null;
bloomFilterStream = null;
bloomFilter = null;
+ bloomFilterUtf8 = null;
}
}
@@ -788,7 +813,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
bloomFilterIndex.build().writeTo(bloomFilterStream);
bloomFilterStream.flush();
bloomFilterIndex.clear();
- bloomFilterEntry.clear();
+ }
+ // write the utf8 bloom filter index to its out stream
+ if (bloomFilterStreamUtf8 != null) {
+ bloomFilterIndexUtf8.build().writeTo(bloomFilterStreamUtf8);
+ bloomFilterStreamUtf8.flush();
+ bloomFilterIndexUtf8.clear();
}
}
@@ -837,12 +867,16 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
void addBloomFilterEntry() {
if (createBloomFilter) {
- bloomFilterEntry.setNumHashFunctions(bloomFilter.getNumHashFunctions());
- bloomFilterEntry.addAllBitset(Arrays.asList(ArrayUtils.toObject(
- bloomFilter.getBitSet())));
- bloomFilterIndex.addBloomFilter(bloomFilterEntry.build());
- bloomFilter.reset();
- bloomFilterEntry.clear();
+ if (bloomFilter != null) {
+ BloomFilterIO.serialize(bloomFilterEntry, bloomFilter);
+ bloomFilterIndex.addBloomFilter(bloomFilterEntry.build());
+ bloomFilter.reset();
+ }
+ if (bloomFilterUtf8 != null) {
+ BloomFilterIO.serialize(bloomFilterEntry, bloomFilterUtf8);
+ bloomFilterIndexUtf8.addBloomFilter(bloomFilterEntry.build());
+ bloomFilterUtf8.reset();
+ }
}
}
@@ -946,7 +980,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
byte value = (byte) vec.vector[0];
indexStatistics.updateInteger(value, length);
if (createBloomFilter) {
- bloomFilter.addLong(value);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(value);
+ }
+ bloomFilterUtf8.addLong(value);
}
for(int i=0; i < length; ++i) {
writer.write(value);
@@ -959,7 +996,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
writer.write(value);
indexStatistics.updateInteger(value, 1);
if (createBloomFilter) {
- bloomFilter.addLong(value);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(value);
+ }
+ bloomFilterUtf8.addLong(value);
}
}
}
@@ -1017,7 +1057,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
long value = vec.vector[0];
indexStatistics.updateInteger(value, length);
if (createBloomFilter) {
- bloomFilter.addLong(value);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(value);
+ }
+ bloomFilterUtf8.addLong(value);
}
for(int i=0; i < length; ++i) {
writer.write(value);
@@ -1030,7 +1073,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
writer.write(value);
indexStatistics.updateInteger(value, 1);
if (createBloomFilter) {
- bloomFilter.addLong(value);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(value);
+ }
+ bloomFilterUtf8.addLong(value);
}
}
}
@@ -1077,7 +1123,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
float value = (float) vec.vector[0];
indexStatistics.updateDouble(value);
if (createBloomFilter) {
- bloomFilter.addDouble(value);
+ if (bloomFilter != null) {
+ bloomFilter.addDouble(value);
+ }
+ bloomFilterUtf8.addDouble(value);
}
for(int i=0; i < length; ++i) {
utils.writeFloat(stream, value);
@@ -1090,7 +1139,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
utils.writeFloat(stream, value);
indexStatistics.updateDouble(value);
if (createBloomFilter) {
- bloomFilter.addDouble(value);
+ if (bloomFilter != null) {
+ bloomFilter.addDouble(value);
+ }
+ bloomFilterUtf8.addDouble(value);
}
}
}
@@ -1138,7 +1190,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
double value = vec.vector[0];
indexStatistics.updateDouble(value);
if (createBloomFilter) {
- bloomFilter.addDouble(value);
+ if (bloomFilter != null) {
+ bloomFilter.addDouble(value);
+ }
+ bloomFilterUtf8.addDouble(value);
}
for(int i=0; i < length; ++i) {
utils.writeDouble(stream, value);
@@ -1151,7 +1206,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
utils.writeDouble(stream, value);
indexStatistics.updateDouble(value);
if (createBloomFilter) {
- bloomFilter.addDouble(value);
+ if (bloomFilter != null) {
+ bloomFilter.addDouble(value);
+ }
+ bloomFilterUtf8.addDouble(value);
}
}
}
@@ -1430,7 +1488,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
indexStatistics.updateString(vec.vector[0], vec.start[0],
vec.length[0], length);
if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+ if (bloomFilter != null) {
+ // translate from UTF-8 to the default charset
+ bloomFilter.addString(new String(vec.vector[0], vec.start[0],
+ vec.length[0], StandardCharsets.UTF_8));
+ }
+ bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
}
}
} else {
@@ -1447,7 +1510,13 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
indexStatistics.updateString(vec.vector[offset + i],
vec.start[offset + i], vec.length[offset + i], 1);
if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[offset + i],
+ if (bloomFilter != null) {
+ // translate from UTF-8 to the default charset
+ bloomFilter.addString(new String(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i],
+ StandardCharsets.UTF_8));
+ }
+ bloomFilterUtf8.addBytes(vec.vector[offset + i],
vec.start[offset + i], vec.length[offset + i]);
}
}
@@ -1504,7 +1573,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
}
indexStatistics.updateString(ptr, ptrOffset, itemLength, length);
if (createBloomFilter) {
- bloomFilter.addBytes(ptr, ptrOffset, itemLength);
+ if (bloomFilter != null) {
+ // translate from UTF-8 to the default charset
+ bloomFilter.addString(new String(vec.vector[0], vec.start[0],
+ vec.length[0], StandardCharsets.UTF_8));
+ }
+ bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
}
}
} else {
@@ -1531,7 +1605,14 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
}
indexStatistics.updateString(ptr, ptrOffset, itemLength, 1);
if (createBloomFilter) {
- bloomFilter.addBytes(ptr, ptrOffset, itemLength);
+ if (bloomFilter != null) {
+ // translate from UTF-8 to the default charset
+ bloomFilter.addString(new String(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i],
+ StandardCharsets.UTF_8));
+ }
+ bloomFilterUtf8.addBytes(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
}
}
}
@@ -1576,7 +1657,14 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
indexStatistics.updateString(vec.vector[0], vec.start[0],
itemLength, length);
if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[0], vec.start[0], itemLength);
+ if (bloomFilter != null) {
+ // translate from UTF-8 to the default charset
+ bloomFilter.addString(new String(vec.vector[0],
+ vec.start[0], itemLength,
+ StandardCharsets.UTF_8));
+ }
+ bloomFilterUtf8.addBytes(vec.vector[0],
+ vec.start[0], itemLength);
}
}
} else {
@@ -1594,7 +1682,13 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
indexStatistics.updateString(vec.vector[offset + i],
vec.start[offset + i], itemLength, 1);
if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[offset + i],
+ if (bloomFilter != null) {
+ // translate from UTF-8 to the default charset
+ bloomFilter.addString(new String(vec.vector[offset + i],
+ vec.start[offset + i], itemLength,
+ StandardCharsets.UTF_8));
+ }
+ bloomFilterUtf8.addBytes(vec.vector[offset + i],
vec.start[offset + i], itemLength);
}
}
@@ -1646,7 +1740,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
indexStatistics.updateBinary(vec.vector[0], vec.start[0],
vec.length[0], length);
if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+ if (bloomFilter != null) {
+ bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+ }
+ bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
}
}
} else {
@@ -1658,7 +1755,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
indexStatistics.updateBinary(vec.vector[offset + i],
vec.start[offset + i], vec.length[offset + i], 1);
if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[offset + i],
+ if (bloomFilter != null) {
+ bloomFilter.addBytes(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
+ }
+ bloomFilterUtf8.addBytes(vec.vector[offset + i],
vec.start[offset + i], vec.length[offset + i]);
}
}
@@ -1734,7 +1835,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
long millis = val.getTime();
indexStatistics.updateTimestamp(millis);
if (createBloomFilter) {
- bloomFilter.addLong(millis);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(millis);
+ }
+ bloomFilterUtf8.addLong(millis);
}
final long secs = millis / MILLIS_PER_SECOND - base_timestamp;
final long nano = formatNanos(val.getNanos());
@@ -1753,7 +1857,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
nanos.write(formatNanos(val.getNanos()));
indexStatistics.updateTimestamp(millis);
if (createBloomFilter) {
- bloomFilter.addLong(millis);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(millis);
+ }
+ bloomFilterUtf8.addLong(millis);
}
}
}
@@ -1819,7 +1926,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
int value = (int) vec.vector[0];
indexStatistics.updateDate(value);
if (createBloomFilter) {
- bloomFilter.addLong(value);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(value);
+ }
+ bloomFilterUtf8.addLong(value);
}
for(int i=0; i < length; ++i) {
writer.write(value);
@@ -1832,7 +1942,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
writer.write(value);
indexStatistics.updateDate(value);
if (createBloomFilter) {
- bloomFilter.addLong(value);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(value);
+ }
+ bloomFilterUtf8.addLong(value);
}
}
}
@@ -1901,7 +2014,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
HiveDecimal value = vec.vector[0].getHiveDecimal();
indexStatistics.updateDecimal(value);
if (createBloomFilter) {
- bloomFilter.addString(value.toString());
+ String str = value.toString();
+ if (bloomFilter != null) {
+ bloomFilter.addString(str);
+ }
+ bloomFilterUtf8.addString(str);
}
for(int i=0; i < length; ++i) {
SerializationUtils.writeBigInteger(valueStream,
@@ -1918,7 +2035,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
scaleStream.write(value.scale());
indexStatistics.updateDecimal(value);
if (createBloomFilter) {
- bloomFilter.addString(value.toString());
+ String str = value.toString();
+ if (bloomFilter != null) {
+ bloomFilter.addString(str);
+ }
+ bloomFilterUtf8.addString(str);
}
}
}
@@ -2065,7 +2186,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
childrenWriters[0].writeBatch(vec.child, childOffset, childLength);
}
if (createBloomFilter) {
- bloomFilter.addLong(childLength);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(childLength);
+ }
+ bloomFilterUtf8.addLong(childLength);
}
}
} else {
@@ -2088,6 +2212,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
} else {
currentLength += nextLength;
}
+ if (createBloomFilter) {
+ if (bloomFilter != null) {
+ bloomFilter.addLong(nextLength);
+ }
+ bloomFilterUtf8.addLong(nextLength);
+ }
}
}
if (currentLength != 0) {
@@ -2161,7 +2291,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
childrenWriters[1].writeBatch(vec.values, childOffset, childLength);
}
if (createBloomFilter) {
- bloomFilter.addLong(childLength);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(childLength);
+ }
+ bloomFilterUtf8.addLong(childLength);
}
}
} else {
@@ -2186,6 +2319,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
} else {
currentLength += nextLength;
}
+ if (createBloomFilter) {
+ if (bloomFilter != null) {
+ bloomFilter.addLong(nextLength);
+ }
+ bloomFilterUtf8.addLong(nextLength);
+ }
}
}
if (currentLength != 0) {
@@ -2247,7 +2386,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
tags.write(tag);
}
if (createBloomFilter) {
- bloomFilter.addLong(tag);
+ if (bloomFilter != null) {
+ bloomFilter.addLong(tag);
+ }
+ bloomFilterUtf8.addLong(tag);
}
childrenWriters[tag].writeBatch(vec.fields[tag], offset, length);
}
@@ -2275,6 +2417,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
currentStart[tag] = i + offset;
currentLength[tag] = 1;
}
+ if (createBloomFilter) {
+ if (bloomFilter != null) {
+ bloomFilter.addLong(tag);
+ }
+ bloomFilterUtf8.addLong(tag);
+ }
}
}
// write out any left over sequences
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/util/BloomFilter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilter.java b/java/core/src/java/org/apache/orc/util/BloomFilter.java
new file mode 100644
index 0000000..a6ff741
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/util/BloomFilter.java
@@ -0,0 +1,312 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * BloomFilter is a probabilistic data structure for set membership checks. BloomFilters are
+ * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of
+ * bloom filters, false positives (element not present in bloom filter but test() says true) are
+ * possible, but false negatives are not possible (if element is present then test() will never
+ * say false). The false positive probability is configurable (default: 5%), and the storage
+ * requirement increases or decreases with it. The lower the false positive probability, the
+ * greater the space requirement.
+ * Bloom filters are sensitive to the number of elements that will be inserted in the bloom filter.
+ * When a bloom filter is created, the expected number of entries must be specified. If the number
+ * of insertions exceeds the specified number of entries, the false positive probability will
+ * increase accordingly.
+ *
+ * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash
+ * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash
+ * collisions for specific sequence of repeating bytes. Check the following link for more info
+ * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
+ *
+ * Note that this class is here for backwards compatibility, because it uses
+ * the JVM default character set for strings. All new users should use
+ * BloomFilterUtf8, which always uses UTF-8 for the encoding.
+ */
+public class BloomFilter {
+ public static final double DEFAULT_FPP = 0.05;
+ private final BitSet bitSet;
+ private final int numBits;
+ private final int numHashFunctions;
+
+ static void checkArgument(boolean expression, String message) {
+ if (!expression) {
+ throw new IllegalArgumentException(message);
+ }
+ }
+
+ public BloomFilter(long expectedEntries) {
+ this(expectedEntries, DEFAULT_FPP);
+ }
+
+ public BloomFilter(long expectedEntries, double fpp) {
+ checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
+ checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
+ int nb = optimalNumOfBits(expectedEntries, fpp);
+ // make 'm' multiple of 64
+ this.numBits = nb + (Long.SIZE - (nb % Long.SIZE));
+ this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits);
+ this.bitSet = new BitSet(numBits);
+ }
+
+ /**
+ * A constructor to support rebuilding the BloomFilter from a serialized representation.
+ * @param bits the serialized bits
+ * @param numFuncs the number of functions used
+ */
+ public BloomFilter(long[] bits, int numFuncs) {
+ super();
+ bitSet = new BitSet(bits);
+ this.numBits = (int) bitSet.bitSize();
+ numHashFunctions = numFuncs;
+ }
+
+ static int optimalNumOfHashFunctions(long n, long m) {
+ return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
+ }
+
+ static int optimalNumOfBits(long n, double p) {
+ return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
+ }
+
+ public void add(byte[] val) {
+ if (val == null) {
+ addBytes(val, -1, -1);
+ } else {
+ addBytes(val, 0, val.length);
+ }
+ }
+
+ public void addBytes(byte[] val, int offset, int length) {
+ // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter"
+ // by Kirsch et al. From the abstract: 'only two hash functions are necessary to effectively
+ // implement a Bloom filter without any loss in the asymptotic false positive probability'
+
+ // Let's split up the 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned
+ // in the above paper
+ long hash64 = val == null ? Murmur3.NULL_HASHCODE :
+ Murmur3.hash64(val, offset, length);
+ addHash(hash64);
+ }
+
+ private void addHash(long hash64) {
+ int hash1 = (int) hash64;
+ int hash2 = (int) (hash64 >>> 32);
+
+ for (int i = 1; i <= numHashFunctions; i++) {
+ int combinedHash = hash1 + (i * hash2);
+ // hashcode should be positive, flip all the bits if it's negative
+ if (combinedHash < 0) {
+ combinedHash = ~combinedHash;
+ }
+ int pos = combinedHash % numBits;
+ bitSet.set(pos);
+ }
+ }
+
+ public void addString(String val) {
+ if (val == null) {
+ add(null);
+ } else {
+ add(val.getBytes(Charset.defaultCharset()));
+ }
+ }
+
+ public void addLong(long val) {
+ addHash(getLongHash(val));
+ }
+
+ public void addDouble(double val) {
+ addLong(Double.doubleToLongBits(val));
+ }
+
+ public boolean test(byte[] val) {
+ if (val == null) {
+ return testBytes(val, -1, -1);
+ }
+ return testBytes(val, 0, val.length);
+ }
+
+ public boolean testBytes(byte[] val, int offset, int length) {
+ long hash64 = val == null ? Murmur3.NULL_HASHCODE :
+ Murmur3.hash64(val, offset, length);
+ return testHash(hash64);
+ }
+
+ private boolean testHash(long hash64) {
+ int hash1 = (int) hash64;
+ int hash2 = (int) (hash64 >>> 32);
+
+ for (int i = 1; i <= numHashFunctions; i++) {
+ int combinedHash = hash1 + (i * hash2);
+ // hashcode should be positive, flip all the bits if it's negative
+ if (combinedHash < 0) {
+ combinedHash = ~combinedHash;
+ }
+ int pos = combinedHash % numBits;
+ if (!bitSet.get(pos)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ public boolean testString(String val) {
+ if (val == null) {
+ return test(null);
+ } else {
+ return test(val.getBytes(Charset.defaultCharset()));
+ }
+ }
+
+ public boolean testLong(long val) {
+ return testHash(getLongHash(val));
+ }
+
+ // Thomas Wang's integer hash function
+ // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
+ private long getLongHash(long key) {
+ key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+ key = key ^ (key >> 24);
+ key = (key + (key << 3)) + (key << 8); // key * 265
+ key = key ^ (key >> 14);
+ key = (key + (key << 2)) + (key << 4); // key * 21
+ key = key ^ (key >> 28);
+ key = key + (key << 31);
+ return key;
+ }
+
+ public boolean testDouble(double val) {
+ return testLong(Double.doubleToLongBits(val));
+ }
+
+ public long sizeInBytes() {
+ return getBitSize() / 8;
+ }
+
+ public int getBitSize() {
+ return bitSet.getData().length * Long.SIZE;
+ }
+
+ public int getNumHashFunctions() {
+ return numHashFunctions;
+ }
+
+ public long[] getBitSet() {
+ return bitSet.getData();
+ }
+
+ @Override
+ public String toString() {
+ return "m: " + numBits + " k: " + numHashFunctions;
+ }
+
+ /**
+ * Merge the specified bloom filter with current bloom filter.
+ *
+ * @param that - bloom filter to merge
+ */
+ public void merge(BloomFilter that) {
+ if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) {
+ this.bitSet.putAll(that.bitSet);
+ } else {
+ throw new IllegalArgumentException("BloomFilters are not compatible for merging." +
+ " this - " + this.toString() + " that - " + that.toString());
+ }
+ }
+
+ public void reset() {
+ this.bitSet.clear();
+ }
+
+ /**
+ * Bare metal bit set implementation. For performance reasons, this implementation does not check
+ * for index bounds nor expand the bit set size if the specified index is greater than the size.
+ */
+ public static class BitSet {
+ private final long[] data;
+
+ public BitSet(long bits) {
+ this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
+ }
+
+ /**
+ * Deserialize long array as bit set.
+ *
+ * @param data - bit array
+ */
+ public BitSet(long[] data) {
+ assert data.length > 0 : "data length is zero!";
+ this.data = data;
+ }
+
+ /**
+ * Sets the bit at specified index.
+ *
+ * @param index - position
+ */
+ public void set(int index) {
+ data[index >>> 6] |= (1L << index);
+ }
+
+ /**
+ * Returns true if the bit is set in the specified index.
+ *
+ * @param index - position
+ * @return - value at the bit position
+ */
+ public boolean get(int index) {
+ return (data[index >>> 6] & (1L << index)) != 0;
+ }
+
+ /**
+ * Number of bits
+ */
+ public long bitSize() {
+ return (long) data.length * Long.SIZE;
+ }
+
+ public long[] getData() {
+ return data;
+ }
+
+ /**
+ * Combines the two BitArrays using bitwise OR.
+ */
+ public void putAll(BitSet array) {
+ assert data.length == array.data.length :
+ "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")";
+ for (int i = 0; i < data.length; i++) {
+ data[i] |= array.data[i];
+ }
+ }
+
+ /**
+ * Clear the bit set.
+ */
+ public void clear() {
+ Arrays.fill(data, 0);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
new file mode 100644
index 0000000..ebd8c49
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+import com.google.protobuf.ByteString;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcProto;
+import org.apache.orc.TypeDescription;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+public class BloomFilterIO {
+
+ private BloomFilterIO() {
+ // never called
+ }
+
+ /**
+ * Deserialize a bloom filter from the ORC file.
+ */
+ public static BloomFilter deserialize(OrcProto.Stream.Kind kind,
+ OrcFile.WriterVersion fileVersion,
+ TypeDescription.Category type,
+ OrcProto.BloomFilter bloomFilter) {
+ if (bloomFilter == null) {
+ return null;
+ }
+ int numFuncs = bloomFilter.getNumHashFunctions();
+ switch (kind) {
+ case BLOOM_FILTER: {
+ long values[] = new long[bloomFilter.getBitsetCount()];
+ for (int i = 0; i < values.length; ++i) {
+ values[i] = bloomFilter.getBitset(i);
+ }
+ // After HIVE-12055 the bloom filters for strings correctly use
+ // UTF8.
+ if (fileVersion.includes(OrcFile.WriterVersion.HIVE_12055) &&
+ (type == TypeDescription.Category.STRING ||
+ type == TypeDescription.Category.CHAR ||
+ type == TypeDescription.Category.VARCHAR)) {
+ return new BloomFilterUtf8(values, numFuncs);
+ }
+ return new BloomFilter(values, numFuncs);
+ }
+ case BLOOM_FILTER_UTF8: {
+ ByteString bits = bloomFilter.getUtf8Bitset();
+ long[] values = new long[bits.size() / 8];
+ bits.asReadOnlyByteBuffer().asLongBuffer().get(values);
+ return new BloomFilterUtf8(values, numFuncs);
+ }
+ default:
+ throw new IllegalArgumentException("Unknown bloom filter kind " + kind);
+ }
+ }
+
+ /**
+ * Serialize the BloomFilter to the ORC file.
+ * @param builder the builder to write to
+ * @param bloomFilter the bloom filter to serialize
+ */
+ public static void serialize(OrcProto.BloomFilter.Builder builder,
+ BloomFilter bloomFilter) {
+ builder.clear();
+ builder.setNumHashFunctions(bloomFilter.getNumHashFunctions());
+ long[] bitset = bloomFilter.getBitSet();
+ if (bloomFilter instanceof BloomFilterUtf8) {
+ ByteBuffer buffer = ByteBuffer.allocate(bitset.length * 8);
+ buffer.asLongBuffer().put(bitset);
+ builder.setUtf8Bitset(ByteString.copyFrom(buffer));
+ } else {
+ for(int i=0; i < bitset.length; ++i) {
+ builder.addBitset(bitset[i]);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java b/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java
new file mode 100644
index 0000000..aad4fab
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * This class represents the fix from ORC-101 where we fixed the bloom filter
+ * from using the JVM's default character set to always using UTF-8.
+ */
+public class BloomFilterUtf8 extends BloomFilter {
+
+ public BloomFilterUtf8(long expectedEntries, double fpp) {
+ super(expectedEntries, fpp);
+ }
+
+ public BloomFilterUtf8(long[] bits, int numFuncs) {
+ super(bits, numFuncs);
+ }
+
+
+ public void addString(String val) {
+ if (val == null) {
+ add(null);
+ } else {
+ add(val.getBytes(StandardCharsets.UTF_8));
+ }
+ }
+
+ public boolean testString(String val) {
+ if (val == null) {
+ return test(null);
+ } else {
+ return test(val.getBytes(StandardCharsets.UTF_8));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index af20d1f..5ef0ced 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -1904,8 +1904,8 @@ public class TestVectorOrcFile {
.withZeroCopy(false)
.build());
OrcIndex index =
- meta.readRowIndex(reader.getStripes().get(0), null, null, null, null,
- null);
+ meta.readRowIndex(reader.getStripes().get(0), null, null, false, null, null,
+ null, OrcFile.WriterVersion.ORC_101, null, null);
// check the primitive columns to make sure they have the right number of
// items in the first row group
for(int c=1; c < 9; ++c) {