You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@orc.apache.org by om...@apache.org on 2016/09/22 19:21:53 UTC

[1/4] orc git commit: ORC-101 Correct bloom filters for strings and decimals to use utf8 encoding.

Repository: orc
Updated Branches:
  refs/heads/master 7118e968b -> 604dcc801


http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java b/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
deleted file mode 100644
index 88c3514..0000000
--- a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
+++ /dev/null
@@ -1,335 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-/**
- * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms.
- *
- * Murmur3 32 and 128 bit variants.
- * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
- * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
- *
- * This is a public domain code with no copyrights.
- * From homepage of MurmurHash (https://code.google.com/p/smhasher/),
- * "All MurmurHash versions are public domain software, and the author disclaims all copyright
- * to their code."
- */
-public class Murmur3 {
-  // from 64-bit linear congruential generator
-  public static final long NULL_HASHCODE = 2862933555777941757L;
-
-  // Constants for 32 bit variant
-  private static final int C1_32 = 0xcc9e2d51;
-  private static final int C2_32 = 0x1b873593;
-  private static final int R1_32 = 15;
-  private static final int R2_32 = 13;
-  private static final int M_32 = 5;
-  private static final int N_32 = 0xe6546b64;
-
-  // Constants for 128 bit variant
-  private static final long C1 = 0x87c37b91114253d5L;
-  private static final long C2 = 0x4cf5ad432745937fL;
-  private static final int R1 = 31;
-  private static final int R2 = 27;
-  private static final int R3 = 33;
-  private static final int M = 5;
-  private static final int N1 = 0x52dce729;
-  private static final int N2 = 0x38495ab5;
-
-  private static final int DEFAULT_SEED = 104729;
-
-  /**
-   * Murmur3 32-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode
-   */
-  public static int hash32(byte[] data) {
-    return hash32(data, data.length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 32-bit variant.
-   *
-   * @param data   - input byte array
-   * @param length - length of array
-   * @param seed   - seed. (default 0)
-   * @return - hashcode
-   */
-  public static int hash32(byte[] data, int length, int seed) {
-    int hash = seed;
-    final int nblocks = length >> 2;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      int i_4 = i << 2;
-      int k = (data[i_4] & 0xff)
-          | ((data[i_4 + 1] & 0xff) << 8)
-          | ((data[i_4 + 2] & 0xff) << 16)
-          | ((data[i_4 + 3] & 0xff) << 24);
-
-      // mix functions
-      k *= C1_32;
-      k = Integer.rotateLeft(k, R1_32);
-      k *= C2_32;
-      hash ^= k;
-      hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
-    }
-
-    // tail
-    int idx = nblocks << 2;
-    int k1 = 0;
-    switch (length - idx) {
-      case 3:
-        k1 ^= data[idx + 2] << 16;
-      case 2:
-        k1 ^= data[idx + 1] << 8;
-      case 1:
-        k1 ^= data[idx];
-
-        // mix functions
-        k1 *= C1_32;
-        k1 = Integer.rotateLeft(k1, R1_32);
-        k1 *= C2_32;
-        hash ^= k1;
-    }
-
-    // finalization
-    hash ^= length;
-    hash ^= (hash >>> 16);
-    hash *= 0x85ebca6b;
-    hash ^= (hash >>> 13);
-    hash *= 0xc2b2ae35;
-    hash ^= (hash >>> 16);
-
-    return hash;
-  }
-
-  /**
-   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode
-   */
-  public static long hash64(byte[] data) {
-    return hash64(data, 0, data.length, DEFAULT_SEED);
-  }
-
-  public static long hash64(byte[] data, int offset, int length) {
-    return hash64(data, offset, length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
-   *
-   * @param data   - input byte array
-   * @param length - length of array
-   * @param seed   - seed. (default is 0)
-   * @return - hashcode
-   */
-  public static long hash64(byte[] data, int offset, int length, int seed) {
-    long hash = seed;
-    final int nblocks = length >> 3;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      final int i8 = i << 3;
-      long k = ((long) data[offset + i8] & 0xff)
-          | (((long) data[offset + i8 + 1] & 0xff) << 8)
-          | (((long) data[offset + i8 + 2] & 0xff) << 16)
-          | (((long) data[offset + i8 + 3] & 0xff) << 24)
-          | (((long) data[offset + i8 + 4] & 0xff) << 32)
-          | (((long) data[offset + i8 + 5] & 0xff) << 40)
-          | (((long) data[offset + i8 + 6] & 0xff) << 48)
-          | (((long) data[offset + i8 + 7] & 0xff) << 56);
-
-      // mix functions
-      k *= C1;
-      k = Long.rotateLeft(k, R1);
-      k *= C2;
-      hash ^= k;
-      hash = Long.rotateLeft(hash, R2) * M + N1;
-    }
-
-    // tail
-    long k1 = 0;
-    int tailStart = nblocks << 3;
-    switch (length - tailStart) {
-      case 7:
-        k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
-      case 6:
-        k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
-      case 5:
-        k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
-      case 4:
-        k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
-      case 3:
-        k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
-      case 2:
-        k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
-      case 1:
-        k1 ^= ((long) data[offset + tailStart] & 0xff);
-        k1 *= C1;
-        k1 = Long.rotateLeft(k1, R1);
-        k1 *= C2;
-        hash ^= k1;
-    }
-
-    // finalization
-    hash ^= length;
-    hash = fmix64(hash);
-
-    return hash;
-  }
-
-  /**
-   * Murmur3 128-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode (2 longs)
-   */
-  public static long[] hash128(byte[] data) {
-    return hash128(data, 0, data.length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 128-bit variant.
-   *
-   * @param data   - input byte array
-   * @param offset - the first element of array
-   * @param length - length of array
-   * @param seed   - seed. (default is 0)
-   * @return - hashcode (2 longs)
-   */
-  public static long[] hash128(byte[] data, int offset, int length, int seed) {
-    long h1 = seed;
-    long h2 = seed;
-    final int nblocks = length >> 4;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      final int i16 = i << 4;
-      long k1 = ((long) data[offset + i16] & 0xff)
-          | (((long) data[offset + i16 + 1] & 0xff) << 8)
-          | (((long) data[offset + i16 + 2] & 0xff) << 16)
-          | (((long) data[offset + i16 + 3] & 0xff) << 24)
-          | (((long) data[offset + i16 + 4] & 0xff) << 32)
-          | (((long) data[offset + i16 + 5] & 0xff) << 40)
-          | (((long) data[offset + i16 + 6] & 0xff) << 48)
-          | (((long) data[offset + i16 + 7] & 0xff) << 56);
-
-      long k2 = ((long) data[offset + i16 + 8] & 0xff)
-          | (((long) data[offset + i16 + 9] & 0xff) << 8)
-          | (((long) data[offset + i16 + 10] & 0xff) << 16)
-          | (((long) data[offset + i16 + 11] & 0xff) << 24)
-          | (((long) data[offset + i16 + 12] & 0xff) << 32)
-          | (((long) data[offset + i16 + 13] & 0xff) << 40)
-          | (((long) data[offset + i16 + 14] & 0xff) << 48)
-          | (((long) data[offset + i16 + 15] & 0xff) << 56);
-
-      // mix functions for k1
-      k1 *= C1;
-      k1 = Long.rotateLeft(k1, R1);
-      k1 *= C2;
-      h1 ^= k1;
-      h1 = Long.rotateLeft(h1, R2);
-      h1 += h2;
-      h1 = h1 * M + N1;
-
-      // mix functions for k2
-      k2 *= C2;
-      k2 = Long.rotateLeft(k2, R3);
-      k2 *= C1;
-      h2 ^= k2;
-      h2 = Long.rotateLeft(h2, R1);
-      h2 += h1;
-      h2 = h2 * M + N2;
-    }
-
-    // tail
-    long k1 = 0;
-    long k2 = 0;
-    int tailStart = nblocks << 4;
-    switch (length - tailStart) {
-      case 15:
-        k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
-      case 14:
-        k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
-      case 13:
-        k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
-      case 12:
-        k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
-      case 11:
-        k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
-      case 10:
-        k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
-      case 9:
-        k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
-        k2 *= C2;
-        k2 = Long.rotateLeft(k2, R3);
-        k2 *= C1;
-        h2 ^= k2;
-
-      case 8:
-        k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
-      case 7:
-        k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
-      case 6:
-        k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
-      case 5:
-        k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
-      case 4:
-        k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
-      case 3:
-        k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
-      case 2:
-        k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
-      case 1:
-        k1 ^= (long) (data[offset + tailStart] & 0xff);
-        k1 *= C1;
-        k1 = Long.rotateLeft(k1, R1);
-        k1 *= C2;
-        h1 ^= k1;
-    }
-
-    // finalization
-    h1 ^= length;
-    h2 ^= length;
-
-    h1 += h2;
-    h2 += h1;
-
-    h1 = fmix64(h1);
-    h2 = fmix64(h2);
-
-    h1 += h2;
-    h2 += h1;
-
-    return new long[]{h1, h2};
-  }
-
-  private static long fmix64(long h) {
-    h ^= (h >>> 33);
-    h *= 0xff51afd7ed558ccdL;
-    h ^= (h >>> 33);
-    h *= 0xc4ceb9fe1a85ec53L;
-    h ^= (h >>> 33);
-    return h;
-  }
-}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/orc/util/Murmur3.java b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
new file mode 100644
index 0000000..838681c
--- /dev/null
+++ b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
@@ -0,0 +1,335 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+/**
+ * Murmur3 is the successor to Murmur2, a fast non-cryptographic hash algorithm.
+ *
+ * Murmur3 32 and 128 bit variants.
+ * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
+ * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
+ *
+ * This code is in the public domain and carries no copyright.
+ * From homepage of MurmurHash (https://code.google.com/p/smhasher/),
+ * "All MurmurHash versions are public domain software, and the author disclaims all copyright
+ * to their code."
+ */
+public class Murmur3 {
+  // from 64-bit linear congruential generator
+  public static final long NULL_HASHCODE = 2862933555777941757L;
+
+  // Constants for 32 bit variant
+  private static final int C1_32 = 0xcc9e2d51;
+  private static final int C2_32 = 0x1b873593;
+  private static final int R1_32 = 15;
+  private static final int R2_32 = 13;
+  private static final int M_32 = 5;
+  private static final int N_32 = 0xe6546b64;
+
+  // Constants for 128 bit variant
+  private static final long C1 = 0x87c37b91114253d5L;
+  private static final long C2 = 0x4cf5ad432745937fL;
+  private static final int R1 = 31;
+  private static final int R2 = 27;
+  private static final int R3 = 33;
+  private static final int M = 5;
+  private static final int N1 = 0x52dce729;
+  private static final int N2 = 0x38495ab5;
+
+  private static final int DEFAULT_SEED = 104729;
+
+  /**
+   * Murmur3 32-bit variant.
+   *
+   * @param data - input byte array
+   * @return - hashcode
+   */
+  public static int hash32(byte[] data) {
+    return hash32(data, data.length, DEFAULT_SEED);
+  }
+
+  /**
+   * Murmur3 32-bit variant.
+   *
+   * @param data   - input byte array
+   * @param length - number of bytes to hash, starting at index 0 of data
+   * @param seed   - seed (the one-argument overload passes DEFAULT_SEED, 104729)
+   * @return - hashcode
+   */
+  public static int hash32(byte[] data, int length, int seed) {
+    int hash = seed;
+    final int nblocks = length >> 2;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+      int i_4 = i << 2;
+      int k = (data[i_4] & 0xff)
+          | ((data[i_4 + 1] & 0xff) << 8)
+          | ((data[i_4 + 2] & 0xff) << 16)
+          | ((data[i_4 + 3] & 0xff) << 24);
+
+      // mix functions
+      k *= C1_32;
+      k = Integer.rotateLeft(k, R1_32);
+      k *= C2_32;
+      hash ^= k;
+      hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+    }
+
+    // tail
+    int idx = nblocks << 2;
+    int k1 = 0;
+    switch (length - idx) {
+      case 3:
+        k1 ^= data[idx + 2] << 16;
+      case 2:
+        k1 ^= data[idx + 1] << 8;
+      case 1:
+        k1 ^= data[idx];
+
+        // mix functions
+        k1 *= C1_32;
+        k1 = Integer.rotateLeft(k1, R1_32);
+        k1 *= C2_32;
+        hash ^= k1;
+    }
+
+    // finalization
+    hash ^= length;
+    hash ^= (hash >>> 16);
+    hash *= 0x85ebca6b;
+    hash ^= (hash >>> 13);
+    hash *= 0xc2b2ae35;
+    hash ^= (hash >>> 16);
+
+    return hash;
+  }
+
+  /**
+   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
+   *
+   * @param data - input byte array
+   * @return - hashcode
+   */
+  public static long hash64(byte[] data) {
+    return hash64(data, 0, data.length, DEFAULT_SEED);
+  }
+
+  public static long hash64(byte[] data, int offset, int length) {
+    return hash64(data, offset, length, DEFAULT_SEED);
+  }
+
+  /**
+   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant.
+   *
+   * @param data   - input byte array
+   * @param length - number of bytes to hash, starting at index offset of data
+   * @param seed   - seed (the overloads without a seed pass DEFAULT_SEED, 104729)
+   * @return - hashcode
+   */
+  public static long hash64(byte[] data, int offset, int length, int seed) {
+    long hash = seed;
+    final int nblocks = length >> 3;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+      final int i8 = i << 3;
+      long k = ((long) data[offset + i8] & 0xff)
+          | (((long) data[offset + i8 + 1] & 0xff) << 8)
+          | (((long) data[offset + i8 + 2] & 0xff) << 16)
+          | (((long) data[offset + i8 + 3] & 0xff) << 24)
+          | (((long) data[offset + i8 + 4] & 0xff) << 32)
+          | (((long) data[offset + i8 + 5] & 0xff) << 40)
+          | (((long) data[offset + i8 + 6] & 0xff) << 48)
+          | (((long) data[offset + i8 + 7] & 0xff) << 56);
+
+      // mix functions
+      k *= C1;
+      k = Long.rotateLeft(k, R1);
+      k *= C2;
+      hash ^= k;
+      hash = Long.rotateLeft(hash, R2) * M + N1;
+    }
+
+    // tail
+    long k1 = 0;
+    int tailStart = nblocks << 3;
+    switch (length - tailStart) {
+      case 7:
+        k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
+      case 6:
+        k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
+      case 5:
+        k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
+      case 4:
+        k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
+      case 3:
+        k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
+      case 2:
+        k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
+      case 1:
+        k1 ^= ((long) data[offset + tailStart] & 0xff);
+        k1 *= C1;
+        k1 = Long.rotateLeft(k1, R1);
+        k1 *= C2;
+        hash ^= k1;
+    }
+
+    // finalization
+    hash ^= length;
+    hash = fmix64(hash);
+
+    return hash;
+  }
+
+  /**
+   * Murmur3 128-bit variant.
+   *
+   * @param data - input byte array
+   * @return - hashcode (2 longs)
+   */
+  public static long[] hash128(byte[] data) {
+    return hash128(data, 0, data.length, DEFAULT_SEED);
+  }
+
+  /**
+   * Murmur3 128-bit variant.
+   *
+   * @param data   - input byte array
+   * @param offset - index of the first byte of data to hash
+   * @param length - number of bytes to hash, starting at offset
+   * @param seed   - seed (the one-argument overload passes DEFAULT_SEED, 104729)
+   * @return - hashcode (2 longs)
+   */
+  public static long[] hash128(byte[] data, int offset, int length, int seed) {
+    long h1 = seed;
+    long h2 = seed;
+    final int nblocks = length >> 4;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+      final int i16 = i << 4;
+      long k1 = ((long) data[offset + i16] & 0xff)
+          | (((long) data[offset + i16 + 1] & 0xff) << 8)
+          | (((long) data[offset + i16 + 2] & 0xff) << 16)
+          | (((long) data[offset + i16 + 3] & 0xff) << 24)
+          | (((long) data[offset + i16 + 4] & 0xff) << 32)
+          | (((long) data[offset + i16 + 5] & 0xff) << 40)
+          | (((long) data[offset + i16 + 6] & 0xff) << 48)
+          | (((long) data[offset + i16 + 7] & 0xff) << 56);
+
+      long k2 = ((long) data[offset + i16 + 8] & 0xff)
+          | (((long) data[offset + i16 + 9] & 0xff) << 8)
+          | (((long) data[offset + i16 + 10] & 0xff) << 16)
+          | (((long) data[offset + i16 + 11] & 0xff) << 24)
+          | (((long) data[offset + i16 + 12] & 0xff) << 32)
+          | (((long) data[offset + i16 + 13] & 0xff) << 40)
+          | (((long) data[offset + i16 + 14] & 0xff) << 48)
+          | (((long) data[offset + i16 + 15] & 0xff) << 56);
+
+      // mix functions for k1
+      k1 *= C1;
+      k1 = Long.rotateLeft(k1, R1);
+      k1 *= C2;
+      h1 ^= k1;
+      h1 = Long.rotateLeft(h1, R2);
+      h1 += h2;
+      h1 = h1 * M + N1;
+
+      // mix functions for k2
+      k2 *= C2;
+      k2 = Long.rotateLeft(k2, R3);
+      k2 *= C1;
+      h2 ^= k2;
+      h2 = Long.rotateLeft(h2, R1);
+      h2 += h1;
+      h2 = h2 * M + N2;
+    }
+
+    // tail
+    long k1 = 0;
+    long k2 = 0;
+    int tailStart = nblocks << 4;
+    switch (length - tailStart) {
+      case 15:
+        k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
+      case 14:
+        k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
+      case 13:
+        k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
+      case 12:
+        k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
+      case 11:
+        k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
+      case 10:
+        k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
+      case 9:
+        k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
+        k2 *= C2;
+        k2 = Long.rotateLeft(k2, R3);
+        k2 *= C1;
+        h2 ^= k2;
+
+      case 8:
+        k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
+      case 7:
+        k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
+      case 6:
+        k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
+      case 5:
+        k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
+      case 4:
+        k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
+      case 3:
+        k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
+      case 2:
+        k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
+      case 1:
+        k1 ^= (long) (data[offset + tailStart] & 0xff);
+        k1 *= C1;
+        k1 = Long.rotateLeft(k1, R1);
+        k1 *= C2;
+        h1 ^= k1;
+    }
+
+    // finalization
+    h1 ^= length;
+    h2 ^= length;
+
+    h1 += h2;
+    h2 += h1;
+
+    h1 = fmix64(h1);
+    h2 = fmix64(h2);
+
+    h1 += h2;
+    h2 += h1;
+
+    return new long[]{h1, h2};
+  }
+
+  private static long fmix64(long h) {
+    h ^= (h >>> 33);
+    h *= 0xff51afd7ed558ccdL;
+    h ^= (h >>> 33);
+    h *= 0xc4ceb9fe1a85ec53L;
+    h ^= (h >>> 33);
+    return h;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java b/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
deleted file mode 100644
index 5facc7c..0000000
--- a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
+++ /dev/null
@@ -1,224 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import static org.junit.Assert.assertEquals;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-import org.junit.Test;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.util.Arrays;
-import java.util.Random;
-
-/**
- * Tests for Murmur3 variants.
- */
-public class TestMurmur3 {
-
-  @Test
-  public void testHashCodesM3_32_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_32(seed);
-    int hc1 = hf.hashBytes(key.getBytes()).asInt();
-    int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-
-    key = "testkey";
-    hc1 = hf.hashBytes(key.getBytes()).asInt();
-    hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-  }
-
-  @Test
-  public void testHashCodesM3_32_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_128(seed);
-    // guava stores the hashcodes in little endian order
-    ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    long gl1 = buf.getLong();
-    long gl2 = buf.getLong(8);
-    long[] hc = Murmur3.hash128(key.getBytes(), 0, key.getBytes().length, seed);
-    long m1 = hc[0];
-    long m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-
-    key = "testkey128_testkey128";
-    buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    gl1 = buf.getLong();
-    gl2 = buf.getLong(8);
-    byte[] keyBytes = key.getBytes();
-    hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed);
-    m1 = hc[0];
-    m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-
-    byte[] offsetKeyBytes = new byte[keyBytes.length + 35];
-    Arrays.fill(offsetKeyBytes, (byte) -1);
-    System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length);
-    hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed);
-    assertEquals(gl1, hc[0]);
-    assertEquals(gl2, hc[1]);
-  }
-
-  @Test
-  public void testHashCodeM3_64() {
-    byte[] origin = ("It was the best of times, it was the worst of times," +
-        " it was the age of wisdom, it was the age of foolishness," +
-        " it was the epoch of belief, it was the epoch of incredulity," +
-        " it was the season of Light, it was the season of Darkness," +
-        " it was the spring of hope, it was the winter of despair," +
-        " we had everything before us, we had nothing before us," +
-        " we were all going direct to Heaven," +
-        " we were all going direct the other way.").getBytes();
-    long hash = Murmur3.hash64(origin, 0, origin.length);
-    assertEquals(305830725663368540L, hash);
-
-    byte[] originOffset = new byte[origin.length + 150];
-    Arrays.fill(originOffset, (byte) 123);
-    System.arraycopy(origin, 0, originOffset, 150, origin.length);
-    hash = Murmur3.hash64(originOffset, 150, origin.length);
-    assertEquals(305830725663368540L, hash);
-  }
-
-  @Test
-  public void testHashCodesM3_128_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-
-      byte[] offsetData = new byte[data.length + 50];
-      System.arraycopy(data, 0, offsetData, 50, data.length);
-      hc = Murmur3.hash128(offsetData, 50, data.length, seed);
-      assertEquals(gl1, hc[0]);
-      assertEquals(gl2, hc[1]);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/java/org/apache/orc/tools/FileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 876070b..7206503 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -37,7 +37,8 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
 import org.apache.orc.ColumnStatistics;
 import org.apache.orc.CompressionKind;
 import org.apache.orc.OrcFile;
@@ -383,7 +384,9 @@ public final class FileDump {
           StringBuilder buf = new StringBuilder();
           String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
           buf.append(rowIdxString);
-          String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
+          String bloomFilString = getFormattedBloomFilters(col, indices,
+              reader.getWriterVersion(),
+              reader.getSchema().findSubtype(col).getCategory());
           buf.append(bloomFilString);
           System.out.println(buf);
         }
@@ -604,15 +607,18 @@ public final class FileDump {
     return -1;
   }
 
-  private static String getFormattedBloomFilters(int col,
-      OrcProto.BloomFilterIndex[] bloomFilterIndex) {
+  private static String getFormattedBloomFilters(int col, OrcIndex index,
+                                                 OrcFile.WriterVersion version,
+                                                 TypeDescription.Category type) {
+    OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
     StringBuilder buf = new StringBuilder();
-    BloomFilterIO stripeLevelBF = null;
+    BloomFilter stripeLevelBF = null;
     if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
       int idx = 0;
       buf.append("\n    Bloom filters for column ").append(col).append(":");
       for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
-        BloomFilterIO toMerge = new BloomFilterIO(bf);
+        BloomFilter toMerge = BloomFilterIO.deserialize(
+            index.getBloomFilterKinds()[col], version, type, bf);
         buf.append("\n      Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
         if (stripeLevelBF == null) {
           stripeLevelBF = toMerge;
@@ -626,7 +632,7 @@ public final class FileDump {
     return buf.toString();
   }
 
-  private static String getBloomFilterStats(BloomFilterIO bf) {
+  private static String getBloomFilterStats(BloomFilter bf) {
     StringBuilder sb = new StringBuilder();
     int bitCount = bf.getBitSize();
     int popCount = 0;

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index e2048ea..aa3072c 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -20,18 +20,20 @@ package org.apache.orc.tools;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
 import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
 import org.apache.orc.impl.AcidStats;
 import org.apache.orc.impl.OrcAcidUtils;
 import org.apache.orc.impl.RecordReaderImpl;
+import org.apache.orc.util.BloomFilter;
 import org.codehaus.jettison.json.JSONArray;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilterIO;
 import org.apache.orc.BinaryColumnStatistics;
 import org.apache.orc.BooleanColumnStatistics;
 import org.apache.orc.ColumnStatistics;
@@ -50,12 +52,16 @@ import org.codehaus.jettison.json.JSONException;
 import org.codehaus.jettison.json.JSONObject;
 import org.codehaus.jettison.json.JSONStringer;
 import org.codehaus.jettison.json.JSONWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * File dump tool with json formatted output.
  */
 public class JsonFileDump {
 
+  private static final Logger LOG = LoggerFactory.getLogger(JsonFileDump.class);
+
   public static void printJsonMetaData(List<String> files,
       Configuration conf,
       List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
@@ -185,7 +191,9 @@ public class JsonFileDump {
               writer.object();
               writer.key("columnId").value(col);
               writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
-              writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
+              writeBloomFilterIndexes(writer, col, indices,
+                  reader.getWriterVersion(),
+                  reader.getSchema().findSubtype(col).getCategory());
               writer.endObject();
             }
             writer.endArray();
@@ -334,16 +342,21 @@ public class JsonFileDump {
   }
 
   private static void writeBloomFilterIndexes(JSONWriter writer, int col,
-      OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException {
+                                              OrcIndex index,
+                                              OrcFile.WriterVersion version,
+                                              TypeDescription.Category type
+                                              ) throws JSONException {
 
-    BloomFilterIO stripeLevelBF = null;
+    BloomFilter stripeLevelBF = null;
+    OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
     if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
       int entryIx = 0;
       writer.key("bloomFilterIndexes").array();
       for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
         writer.object();
         writer.key("entryId").value(entryIx++);
-        BloomFilterIO toMerge = new BloomFilterIO(bf);
+        BloomFilter toMerge = BloomFilterIO.deserialize(
+            index.getBloomFilterKinds()[col], version, type, bf);
         writeBloomFilterStats(writer, toMerge);
         if (stripeLevelBF == null) {
           stripeLevelBF = toMerge;
@@ -362,7 +375,7 @@ public class JsonFileDump {
     }
   }
 
-  private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf)
+  private static void writeBloomFilterStats(JSONWriter writer, BloomFilter bf)
       throws JSONException {
     int bitCount = bf.getBitSize();
     int popCount = 0;

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index 10cc87d..65ff404 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -445,8 +445,9 @@ public class TestFileDump {
         .compress(CompressionKind.ZLIB)
         .bufferSize(10000)
         .rowIndexStride(1000)
-        .bloomFilterColumns("l")
-        .bloomFilterFpp(0.01);
+        .bloomFilterColumns("l,s")
+        .bloomFilterFpp(0.01)
+        .bloomFilterVersion(OrcFile.BloomFilterVersion.ORIGINAL);
     VectorizedRowBatch batch = schema.createRowBatch(1000);
     Writer writer = OrcFile.createWriter(testFilePath, options);
     Random r1 = new Random(1);
@@ -483,7 +484,6 @@ public class TestFileDump {
     System.out.flush();
     System.setOut(origOut);
 
-
     checkOutput(outputFilename, workDir + File.separator + outputFilename);
   }
 

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/log4j.properties b/java/tools/src/test/resources/log4j.properties
new file mode 100644
index 0000000..8224baf
--- /dev/null
+++ b/java/tools/src/test/resources/log4j.properties
@@ -0,0 +1,21 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=WARN,stdout
+
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target   = System.err
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n
+
+# Suppress the warnings about native io not being available
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index 18fd2fb..b879bed 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096
@@ -39,17 +39,17 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 951
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 743
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
     Stream: column 3 section ROW_INDEX start: 355 length 87
-    Stream: column 3 section BLOOM_FILTER start: 442 length 512
-    Stream: column 1 section DATA start: 954 length 20035
-    Stream: column 2 section DATA start: 20989 length 40050
-    Stream: column 3 section DATA start: 61039 length 3543
-    Stream: column 3 section LENGTH start: 64582 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 64607 length 133
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 304
+    Stream: column 1 section DATA start: 746 length 20035
+    Stream: column 2 section DATA start: 20781 length 40050
+    Stream: column 3 section DATA start: 60831 length 3543
+    Stream: column 3 section LENGTH start: 64374 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 64399 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -67,17 +67,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 64826 data: 63775 rows: 5000 tail: 86 index: 944
-    Stream: column 0 section ROW_INDEX start: 64826 length 17
-    Stream: column 1 section ROW_INDEX start: 64843 length 164
-    Stream: column 2 section ROW_INDEX start: 65007 length 168
-    Stream: column 3 section ROW_INDEX start: 65175 length 83
-    Stream: column 3 section BLOOM_FILTER start: 65258 length 512
-    Stream: column 1 section DATA start: 65770 length 20035
-    Stream: column 2 section DATA start: 85805 length 40050
-    Stream: column 3 section DATA start: 125855 length 3532
-    Stream: column 3 section LENGTH start: 129387 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 129412 length 133
+  Stripe: offset: 64618 data: 63775 rows: 5000 tail: 86 index: 736
+    Stream: column 0 section ROW_INDEX start: 64618 length 17
+    Stream: column 1 section ROW_INDEX start: 64635 length 164
+    Stream: column 2 section ROW_INDEX start: 64799 length 168
+    Stream: column 3 section ROW_INDEX start: 64967 length 83
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 65050 length 304
+    Stream: column 1 section DATA start: 65354 length 20035
+    Stream: column 2 section DATA start: 85389 length 40050
+    Stream: column 3 section DATA start: 125439 length 3532
+    Stream: column 3 section LENGTH start: 128971 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 128996 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -95,17 +95,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 129631 data: 63787 rows: 5000 tail: 86 index: 950
-    Stream: column 0 section ROW_INDEX start: 129631 length 17
-    Stream: column 1 section ROW_INDEX start: 129648 length 163
-    Stream: column 2 section ROW_INDEX start: 129811 length 168
-    Stream: column 3 section ROW_INDEX start: 129979 length 90
-    Stream: column 3 section BLOOM_FILTER start: 130069 length 512
-    Stream: column 1 section DATA start: 130581 length 20035
-    Stream: column 2 section DATA start: 150616 length 40050
-    Stream: column 3 section DATA start: 190666 length 3544
-    Stream: column 3 section LENGTH start: 194210 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 194235 length 133
+  Stripe: offset: 129215 data: 63787 rows: 5000 tail: 86 index: 742
+    Stream: column 0 section ROW_INDEX start: 129215 length 17
+    Stream: column 1 section ROW_INDEX start: 129232 length 163
+    Stream: column 2 section ROW_INDEX start: 129395 length 168
+    Stream: column 3 section ROW_INDEX start: 129563 length 90
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 129653 length 304
+    Stream: column 1 section DATA start: 129957 length 20035
+    Stream: column 2 section DATA start: 149992 length 40050
+    Stream: column 3 section DATA start: 190042 length 3544
+    Stream: column 3 section LENGTH start: 193586 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 193611 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -123,17 +123,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 194454 data: 63817 rows: 5000 tail: 86 index: 952
-    Stream: column 0 section ROW_INDEX start: 194454 length 17
-    Stream: column 1 section ROW_INDEX start: 194471 length 165
-    Stream: column 2 section ROW_INDEX start: 194636 length 167
-    Stream: column 3 section ROW_INDEX start: 194803 length 91
-    Stream: column 3 section BLOOM_FILTER start: 194894 length 512
-    Stream: column 1 section DATA start: 195406 length 20035
-    Stream: column 2 section DATA start: 215441 length 40050
-    Stream: column 3 section DATA start: 255491 length 3574
-    Stream: column 3 section LENGTH start: 259065 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 259090 length 133
+  Stripe: offset: 193830 data: 63817 rows: 5000 tail: 85 index: 744
+    Stream: column 0 section ROW_INDEX start: 193830 length 17
+    Stream: column 1 section ROW_INDEX start: 193847 length 165
+    Stream: column 2 section ROW_INDEX start: 194012 length 167
+    Stream: column 3 section ROW_INDEX start: 194179 length 91
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 194270 length 304
+    Stream: column 1 section DATA start: 194574 length 20035
+    Stream: column 2 section DATA start: 214609 length 40050
+    Stream: column 3 section DATA start: 254659 length 3574
+    Stream: column 3 section LENGTH start: 258233 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 258258 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -151,17 +151,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 259309 data: 12943 rows: 1000 tail: 78 index: 432
-    Stream: column 0 section ROW_INDEX start: 259309 length 12
-    Stream: column 1 section ROW_INDEX start: 259321 length 38
-    Stream: column 2 section ROW_INDEX start: 259359 length 41
-    Stream: column 3 section ROW_INDEX start: 259400 length 40
-    Stream: column 3 section BLOOM_FILTER start: 259440 length 301
-    Stream: column 1 section DATA start: 259741 length 4007
-    Stream: column 2 section DATA start: 263748 length 8010
-    Stream: column 3 section DATA start: 271758 length 768
-    Stream: column 3 section LENGTH start: 272526 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 272551 length 133
+  Stripe: offset: 258476 data: 12943 rows: 1000 tail: 78 index: 382
+    Stream: column 0 section ROW_INDEX start: 258476 length 12
+    Stream: column 1 section ROW_INDEX start: 258488 length 38
+    Stream: column 2 section ROW_INDEX start: 258526 length 41
+    Stream: column 3 section ROW_INDEX start: 258567 length 40
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 258607 length 251
+    Stream: column 1 section DATA start: 258858 length 4007
+    Stream: column 2 section DATA start: 262865 length 8010
+    Stream: column 3 section DATA start: 270875 length 768
+    Stream: column 3 section LENGTH start: 271643 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 271668 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -172,7 +172,7 @@ Stripes:
       Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
 
-File length: 273307 bytes
+File length: 272427 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 ________________________________________________________________________________________________________________________

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index fa5cc2d..75cd5f4 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096
@@ -39,17 +39,20 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 85 index: 6974
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14950
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
     Stream: column 2 section BLOOM_FILTER start: 355 length 6535
-    Stream: column 3 section ROW_INDEX start: 6890 length 87
-    Stream: column 1 section DATA start: 6977 length 20035
-    Stream: column 2 section DATA start: 27012 length 40050
-    Stream: column 3 section DATA start: 67062 length 3543
-    Stream: column 3 section LENGTH start: 70605 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 70630 length 133
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 6890 length 6046
+    Stream: column 3 section ROW_INDEX start: 12936 length 87
+    Stream: column 3 section BLOOM_FILTER start: 13023 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 892
+    Stream: column 1 section DATA start: 14953 length 20035
+    Stream: column 2 section DATA start: 34988 length 40050
+    Stream: column 3 section DATA start: 75038 length 3543
+    Stream: column 3 section LENGTH start: 78581 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 78606 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -67,17 +70,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482
-  Stripe: offset: 70848 data: 63775 rows: 5000 tail: 85 index: 6965
-    Stream: column 0 section ROW_INDEX start: 70848 length 17
-    Stream: column 1 section ROW_INDEX start: 70865 length 164
-    Stream: column 2 section ROW_INDEX start: 71029 length 168
-    Stream: column 2 section BLOOM_FILTER start: 71197 length 6533
-    Stream: column 3 section ROW_INDEX start: 77730 length 83
-    Stream: column 1 section DATA start: 77813 length 20035
-    Stream: column 2 section DATA start: 97848 length 40050
-    Stream: column 3 section DATA start: 137898 length 3532
-    Stream: column 3 section LENGTH start: 141430 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 141455 length 133
+  Stripe: offset: 78843 data: 63775 rows: 5000 tail: 103 index: 14941
+    Stream: column 0 section ROW_INDEX start: 78843 length 17
+    Stream: column 1 section ROW_INDEX start: 78860 length 164
+    Stream: column 2 section ROW_INDEX start: 79024 length 168
+    Stream: column 2 section BLOOM_FILTER start: 79192 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 85725 length 6046
+    Stream: column 3 section ROW_INDEX start: 91771 length 83
+    Stream: column 3 section BLOOM_FILTER start: 91854 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 92892 length 892
+    Stream: column 1 section DATA start: 93784 length 20035
+    Stream: column 2 section DATA start: 113819 length 40050
+    Stream: column 3 section DATA start: 153869 length 3532
+    Stream: column 3 section LENGTH start: 157401 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 157426 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -95,17 +101,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205
-  Stripe: offset: 141673 data: 63787 rows: 5000 tail: 85 index: 6971
-    Stream: column 0 section ROW_INDEX start: 141673 length 17
-    Stream: column 1 section ROW_INDEX start: 141690 length 163
-    Stream: column 2 section ROW_INDEX start: 141853 length 168
-    Stream: column 2 section BLOOM_FILTER start: 142021 length 6533
-    Stream: column 3 section ROW_INDEX start: 148554 length 90
-    Stream: column 1 section DATA start: 148644 length 20035
-    Stream: column 2 section DATA start: 168679 length 40050
-    Stream: column 3 section DATA start: 208729 length 3544
-    Stream: column 3 section LENGTH start: 212273 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 212298 length 133
+  Stripe: offset: 157662 data: 63787 rows: 5000 tail: 104 index: 14947
+    Stream: column 0 section ROW_INDEX start: 157662 length 17
+    Stream: column 1 section ROW_INDEX start: 157679 length 163
+    Stream: column 2 section ROW_INDEX start: 157842 length 168
+    Stream: column 2 section BLOOM_FILTER start: 158010 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 164543 length 6046
+    Stream: column 3 section ROW_INDEX start: 170589 length 90
+    Stream: column 3 section BLOOM_FILTER start: 170679 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 171717 length 892
+    Stream: column 1 section DATA start: 172609 length 20035
+    Stream: column 2 section DATA start: 192644 length 40050
+    Stream: column 3 section DATA start: 232694 length 3544
+    Stream: column 3 section LENGTH start: 236238 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 236263 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -123,17 +132,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444
-  Stripe: offset: 212516 data: 63817 rows: 5000 tail: 85 index: 6964
-    Stream: column 0 section ROW_INDEX start: 212516 length 17
-    Stream: column 1 section ROW_INDEX start: 212533 length 165
-    Stream: column 2 section ROW_INDEX start: 212698 length 167
-    Stream: column 2 section BLOOM_FILTER start: 212865 length 6524
-    Stream: column 3 section ROW_INDEX start: 219389 length 91
-    Stream: column 1 section DATA start: 219480 length 20035
-    Stream: column 2 section DATA start: 239515 length 40050
-    Stream: column 3 section DATA start: 279565 length 3574
-    Stream: column 3 section LENGTH start: 283139 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 283164 length 133
+  Stripe: offset: 236500 data: 63817 rows: 5000 tail: 103 index: 14940
+    Stream: column 0 section ROW_INDEX start: 236500 length 17
+    Stream: column 1 section ROW_INDEX start: 236517 length 165
+    Stream: column 2 section ROW_INDEX start: 236682 length 167
+    Stream: column 2 section BLOOM_FILTER start: 236849 length 6524
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 243373 length 6046
+    Stream: column 3 section ROW_INDEX start: 249419 length 91
+    Stream: column 3 section BLOOM_FILTER start: 249510 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 250548 length 892
+    Stream: column 1 section DATA start: 251440 length 20035
+    Stream: column 2 section DATA start: 271475 length 40050
+    Stream: column 3 section DATA start: 311525 length 3574
+    Stream: column 3 section LENGTH start: 315099 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 315124 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -151,17 +163,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165
-  Stripe: offset: 283382 data: 12943 rows: 1000 tail: 78 index: 1468
-    Stream: column 0 section ROW_INDEX start: 283382 length 12
-    Stream: column 1 section ROW_INDEX start: 283394 length 38
-    Stream: column 2 section ROW_INDEX start: 283432 length 41
-    Stream: column 2 section BLOOM_FILTER start: 283473 length 1337
-    Stream: column 3 section ROW_INDEX start: 284810 length 40
-    Stream: column 1 section DATA start: 284850 length 4007
-    Stream: column 2 section DATA start: 288857 length 8010
-    Stream: column 3 section DATA start: 296867 length 768
-    Stream: column 3 section LENGTH start: 297635 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 297660 length 133
+  Stripe: offset: 315360 data: 12943 rows: 1000 tail: 96 index: 3542
+    Stream: column 0 section ROW_INDEX start: 315360 length 12
+    Stream: column 1 section ROW_INDEX start: 315372 length 38
+    Stream: column 2 section ROW_INDEX start: 315410 length 41
+    Stream: column 2 section BLOOM_FILTER start: 315451 length 1337
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 316788 length 1211
+    Stream: column 3 section ROW_INDEX start: 317999 length 40
+    Stream: column 3 section BLOOM_FILTER start: 318039 length 472
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 318511 length 391
+    Stream: column 1 section DATA start: 318902 length 4007
+    Stream: column 2 section DATA start: 322909 length 8010
+    Stream: column 3 section DATA start: 330919 length 768
+    Stream: column 3 section LENGTH start: 331687 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 331712 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -172,7 +187,7 @@ Stripes:
       Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
 
-File length: 298416 bytes
+File length: 332489 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 ________________________________________________________________________________________________________________________

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
index 17a964b..4b0822f 100644
--- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json
index bf654a1..3dd0dc0 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -1,7 +1,7 @@
 {
   "fileName": "TestFileDump.testDump.orc",
   "fileVersion": "0.12",
-  "writerVersion": "HIVE_13083",
+  "writerVersion": "ORC_101",
   "numberOfRows": 21000,
   "compression": "ZLIB",
   "compressionBufferSize": 4096,
@@ -254,9 +254,9 @@
       "stripeNumber": 1,
       "stripeInformation": {
         "offset": 3,
-        "indexLength": 970,
+        "indexLength": 762,
         "dataLength": 63770,
-        "footerLength": 90,
+        "footerLength": 89,
         "rowCount": 5000
       },
       "streams": [
@@ -286,44 +286,44 @@
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
+          "section": "BLOOM_FILTER_UTF8",
           "startOffset": 461,
-          "length": 512
+          "length": 304
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 973,
+          "startOffset": 765,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 21008,
+          "startOffset": 20800,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 61058,
+          "startOffset": 60850,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 61075,
+          "startOffset": 60867,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 64585,
+          "startOffset": 64377,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 64610,
+          "startOffset": 64402,
           "length": 133
         }
       ],
@@ -494,77 +494,77 @@
     {
       "stripeNumber": 2,
       "stripeInformation": {
-        "offset": 64833,
-        "indexLength": 961,
+        "offset": 64624,
+        "indexLength": 753,
         "dataLength": 63763,
-        "footerLength": 88,
+        "footerLength": 87,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 64833,
+          "startOffset": 64624,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 64850,
+          "startOffset": 64641,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 65016,
+          "startOffset": 64807,
           "length": 166
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 65182,
+          "startOffset": 64973,
           "length": 100
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 65282,
-          "length": 512
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 65073,
+          "length": 304
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 65794,
+          "startOffset": 65377,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 85829,
+          "startOffset": 85412,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 125879,
+          "startOffset": 125462,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 125896,
+          "startOffset": 125479,
           "length": 3503
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 129399,
+          "startOffset": 128982,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 129424,
+          "startOffset": 129007,
           "length": 133
         }
       ],
@@ -735,77 +735,77 @@
     {
       "stripeNumber": 3,
       "stripeInformation": {
-        "offset": 129645,
-        "indexLength": 962,
+        "offset": 129227,
+        "indexLength": 754,
         "dataLength": 63770,
-        "footerLength": 91,
+        "footerLength": 89,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 129645,
+          "startOffset": 129227,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 129662,
+          "startOffset": 129244,
           "length": 164
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 129826,
+          "startOffset": 129408,
           "length": 167
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 129993,
+          "startOffset": 129575,
           "length": 102
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 130095,
-          "length": 512
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 129677,
+          "length": 304
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 130607,
+          "startOffset": 129981,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 150642,
+          "startOffset": 150016,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 190692,
+          "startOffset": 190066,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 190709,
+          "startOffset": 190083,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 194219,
+          "startOffset": 193593,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 194244,
+          "startOffset": 193618,
           "length": 133
         }
       ],
@@ -976,77 +976,77 @@
     {
       "stripeNumber": 4,
       "stripeInformation": {
-        "offset": 194468,
-        "indexLength": 973,
+        "offset": 193840,
+        "indexLength": 765,
         "dataLength": 63756,
-        "footerLength": 91,
+        "footerLength": 89,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 194468,
+          "startOffset": 193840,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 194485,
+          "startOffset": 193857,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 194651,
+          "startOffset": 194023,
           "length": 171
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 194822,
+          "startOffset": 194194,
           "length": 107
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 194929,
-          "length": 512
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 194301,
+          "length": 304
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 195441,
+          "startOffset": 194605,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 215476,
+          "startOffset": 214640,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 255526,
+          "startOffset": 254690,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 255543,
+          "startOffset": 254707,
           "length": 3496
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 259039,
+          "startOffset": 258203,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 259064,
+          "startOffset": 258228,
           "length": 133
         }
       ],
@@ -1217,8 +1217,8 @@
     {
       "stripeNumber": 5,
       "stripeInformation": {
-        "offset": 259288,
-        "indexLength": 433,
+        "offset": 258450,
+        "indexLength": 383,
         "dataLength": 12943,
         "footerLength": 83,
         "rowCount": 1000
@@ -1227,67 +1227,67 @@
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 259288,
+          "startOffset": 258450,
           "length": 12
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 259300,
+          "startOffset": 258462,
           "length": 38
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 259338,
+          "startOffset": 258500,
           "length": 41
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 259379,
+          "startOffset": 258541,
           "length": 41
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 259420,
-          "length": 301
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 258582,
+          "length": 251
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 259721,
+          "startOffset": 258833,
           "length": 4007
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 263728,
+          "startOffset": 262840,
           "length": 8010
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 271738,
+          "startOffset": 270850,
           "length": 16
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 271754,
+          "startOffset": 270866,
           "length": 752
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 272506,
+          "startOffset": 271618,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 272531,
+          "startOffset": 271643,
           "length": 133
         }
       ],
@@ -1348,7 +1348,7 @@
       }]
     }
   ],
-  "fileLength": 273300,
+  "fileLength": 272409,
   "paddingLength": 0,
   "paddingRatio": 0,
   "status": "OK"

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.out b/java/tools/src/test/resources/orc-file-dump.out
index 70f7fbd..ae8195e 100644
--- a/java/tools/src/test/resources/orc-file-dump.out
+++ b/java/tools/src/test/resources/orc-file-dump.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-has-null.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-has-null.out b/java/tools/src/test/resources/orc-file-has-null.out
index df075d5..c02f803 100644
--- a/java/tools/src/test/resources/orc-file-has-null.out
+++ b/java/tools/src/test/resources/orc-file-has-null.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 20000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/proto/orc_proto.proto
----------------------------------------------------------------------
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index dbc34ab..de6974e 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -91,6 +91,7 @@ message RowIndex {
 message BloomFilter {
   optional uint32 numHashFunctions = 1;
   repeated fixed64 bitset = 2;
+  optional bytes utf8bitset = 3;
 }
 
 message BloomFilterIndex {
@@ -109,6 +110,7 @@ message Stream {
     SECONDARY = 5;
     ROW_INDEX = 6;
     BLOOM_FILTER = 7;
+    BLOOM_FILTER_UTF8 = 8;
   }
   optional Kind kind = 1;
   optional uint32 column = 2;

[2/4] orc git commit: ORC-101 Correct bloom filters for strings and decimals to use utf8 encoding.

Posted by om...@apache.org.

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index 6d1955d..f159eef 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -19,9 +19,11 @@
 package org.apache.orc.impl;
 
 import static junit.framework.Assert.assertEquals;
+import static junit.framework.TestCase.fail;
 import static org.hamcrest.core.Is.is;
-import static org.junit.Assert.*;
-import static org.mockito.Mockito.any;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
 import static org.mockito.Mockito.atLeastOnce;
 import static org.mockito.Mockito.doThrow;
 import static org.mockito.Mockito.mock;
@@ -33,9 +35,9 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.sql.Timestamp;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
-import junit.framework.Assert;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileStatus;
@@ -46,7 +48,7 @@ import org.apache.hadoop.fs.Seekable;
 import org.apache.hadoop.hive.common.io.DiskRangeList;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilter;
 import org.apache.orc.DataReader;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
@@ -62,6 +64,7 @@ import org.apache.orc.OrcFile;
 import org.apache.orc.Reader;
 import org.apache.orc.OrcProto;
 
+import org.junit.Assert;
 import org.junit.Test;
 import org.mockito.MockSettings;
 import org.mockito.Mockito;
@@ -375,23 +378,23 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
 
     pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
 
     pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", false, null);
     assertEquals(TruthValue.NO,
-      RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+      RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
     assertEquals(TruthValue.YES_NO,
-      RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+      RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.BOOLEAN));
   }
 
   @Test
@@ -399,34 +402,34 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     // Stats gets converted to column type. "15" is outside of "10" and "100"
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     // Integer stats will not be converted date because of days/seconds/millis ambiguity
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
     assertEquals(TruthValue.YES_NO,
-      RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+      RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -434,39 +437,39 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     // Stats gets converted to column type. "15.0" is outside of "10.0" and "100.0"
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
 
     // Double is not converted to date type because of days/seconds/millis ambiguity
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15*1000L), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150*1000L), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.DOUBLE));
   }
 
   @Test
@@ -474,33 +477,33 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 100L, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 100.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "100", null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     // IllegalArgumentException is thrown when converting String to Date, hence YES_NO
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(100).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 1000), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 1000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("100"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(100), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -509,69 +512,69 @@ public class TestRecordReaderImpl {
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     // Date to Integer conversion is not possible.
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     // Date to Float conversion is also not possible.
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "1970-01-11", null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15.1", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "__a15__1", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "2000-01-16", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "1970-01-16", null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(150).get(), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     // Date to Decimal conversion is also not possible.
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15L * 24L * 60L * 60L * 1000L), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -579,39 +582,39 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     // "15" out of range of "10.0" and "100.0"
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     // Decimal to Date not possible.
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15 * 1000L), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150 * 1000L), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -619,46 +622,46 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.FLOAT, "x", 15.0, null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", "15", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.STRING, "x", new Timestamp(15).toString(), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO,
         RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10 * 24L * 60L * 60L * 1000L,
-          100 * 24L * 60L * 60L * 1000L), pred, null));
+          100 * 24L * 60L * 60L * 1000L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
         PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -667,17 +670,17 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG,
             "x", 15L, null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -686,17 +689,17 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG,
             "x", 15L, null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -705,15 +708,15 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.LONG,
             "x", 15L, null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), lessThan, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), lessThan, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), lessThan, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), lessThan, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), lessThan, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), lessThan, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -722,15 +725,15 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.LONG,
             "x", 15L, null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -742,13 +745,13 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
             "x", null, args);
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 20L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 20L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -760,19 +763,19 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG,
             "x", null, args);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 5L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 5L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO_NULL,
-      RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 40L), pred, null));
+      RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 40L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-      RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 15L), pred, null));
+      RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 15L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 25L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 25L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 25L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 25L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 20L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 20L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -781,7 +784,7 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.LONG,
             "x", null, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
 
@@ -791,17 +794,17 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING,
             "x", "c", null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
   }
 
   @Test
@@ -810,17 +813,17 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING,
             "x", "c", null);
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
   }
 
   @Test
@@ -829,17 +832,17 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.STRING,
             "x", "c", null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
     assertEquals(TruthValue.NO_NULL, // min, same stats
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -848,17 +851,17 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.STRING,
             "x", "c", null);
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // before
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
   }
 
   @Test
@@ -870,17 +873,17 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
             "x", null, args);
     assertEquals(TruthValue.NO_NULL, // before & after
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null)); // max
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // min
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
     assertEquals(TruthValue.YES_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // same
   }
 
   @Test
@@ -892,31 +895,31 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.STRING,
             "x", null, args);
     assertEquals(TruthValue.YES_NULL, // before & after
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NULL, // before & max
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO_NULL, // before & before
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("h", "g", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("h", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL, // before & min
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("f", "g", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("f", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL, // before & middle
-      RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "g", true), pred, null));
+      RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     assertEquals(TruthValue.YES_NULL, // min & after
-      RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "e", true), pred, null));
+      RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "e", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NULL, // min & max
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "f", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "f", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.YES_NO_NULL, // min & middle
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "g", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "g", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
 
     assertEquals(TruthValue.NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // after
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "c", true), pred, null)); // max
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // max
     assertEquals(TruthValue.YES_NO_NULL,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG)); // middle
     assertEquals(TruthValue.YES_NULL, // min & after, same stats
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -925,9 +928,9 @@ public class TestRecordReaderImpl {
         (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.STRING,
             "x", null, null);
     assertEquals(TruthValue.YES_NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
     assertEquals(TruthValue.NO,
-        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", false), pred, null));
+        RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", false), pred, null, null, OrcFile.WriterVersion.ORC_101, TypeDescription.Category.LONG));
   }
 
   @Test
@@ -1304,7 +1307,7 @@ public class TestRecordReaderImpl {
   public void testIntNullSafeEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong(i);
     }
@@ -1319,7 +1322,7 @@ public class TestRecordReaderImpl {
   public void testIntEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong(i);
     }
@@ -1338,7 +1341,7 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
             "x", null, args);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong(i);
     }
@@ -1356,7 +1359,7 @@ public class TestRecordReaderImpl {
   public void testDoubleNullSafeEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addDouble(i);
     }
@@ -1371,7 +1374,7 @@ public class TestRecordReaderImpl {
   public void testDoubleEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addDouble(i);
     }
@@ -1390,7 +1393,7 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.FLOAT,
             "x", null, args);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addDouble(i);
     }
@@ -1408,7 +1411,7 @@ public class TestRecordReaderImpl {
   public void testStringNullSafeEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString("str_" + i);
     }
@@ -1423,7 +1426,7 @@ public class TestRecordReaderImpl {
   public void testStringEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString("str_" + i);
     }
@@ -1442,7 +1445,7 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
             "x", null, args);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString("str_" + i);
     }
@@ -1461,7 +1464,7 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x",
         new DateWritable(15).get(), null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new DateWritable(i)).getDays());
     }
@@ -1477,7 +1480,7 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DATE, "x",
         new DateWritable(15).get(), null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new DateWritable(i)).getDays());
     }
@@ -1496,7 +1499,7 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DATE,
             "x", null, args);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new DateWritable(i)).getDays());
     }
@@ -1516,7 +1519,7 @@ public class TestRecordReaderImpl {
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
         new Timestamp(15),
         null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new Timestamp(i)).getTime());
     }
@@ -1531,7 +1534,7 @@ public class TestRecordReaderImpl {
   public void testTimestampEqualsBloomFilter() throws Exception {
     PredicateLeaf pred = createPredicateLeaf(
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new Timestamp(i)).getTime());
     }
@@ -1550,7 +1553,7 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.TIMESTAMP,
             "x", null, args);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addLong((new Timestamp(i)).getTime());
     }
@@ -1570,7 +1573,7 @@ public class TestRecordReaderImpl {
         PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x",
         new HiveDecimalWritable("15"),
         null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString(HiveDecimal.create(i).toString());
     }
@@ -1587,7 +1590,7 @@ public class TestRecordReaderImpl {
         PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DECIMAL, "x",
         new HiveDecimalWritable("15"),
         null);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString(HiveDecimal.create(i).toString());
     }
@@ -1606,7 +1609,7 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
             "x", null, args);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString(HiveDecimal.create(i).toString());
     }
@@ -1629,7 +1632,7 @@ public class TestRecordReaderImpl {
     PredicateLeaf pred = createPredicateLeaf
         (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
             "x", null, args);
-    BloomFilterIO bf = new BloomFilterIO(10000);
+    BloomFilter bf = new BloomFilter(10000);
     for (int i = 20; i < 1000; i++) {
       bf.addString(HiveDecimal.create(i).toString());
     }
@@ -1692,4 +1695,171 @@ public class TestRecordReaderImpl {
 
     recordReader.close();
   }
+
+  @Test
+  public void TestOldBloomFilters() throws Exception {
+    OrcProto.StripeFooter footer =
+        OrcProto.StripeFooter.newBuilder()
+            .addStreams(OrcProto.Stream.newBuilder()
+               .setColumn(1).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(1).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(2).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(2).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(3).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(3).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+        .build();
+    TypeDescription schema = TypeDescription.fromString("struct<x:int,y:decimal(10,2),z:string>");
+    OrcProto.Stream.Kind[] bloomFilterKinds = new OrcProto.Stream.Kind[4];
+
+    // normal read
+    DiskRangeList ranges = RecordReaderUtils.planIndexReading(schema, footer,
+        false, new boolean[]{true, true, false, true},
+        new boolean[]{false, true, false, true},
+        OrcFile.WriterVersion.HIVE_4243,
+        bloomFilterKinds);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[3]);
+    assertEquals("range start: 0 end: 2000", ranges.toString());
+    assertEquals("range start: 4000 end: 6000", ranges.next.toString());
+    assertEquals(null, ranges.next.next);
+
+    // ignore non-utf8 bloom filter
+    Arrays.fill(bloomFilterKinds, null);
+    ranges = RecordReaderUtils.planIndexReading(schema, footer,
+        true, new boolean[]{true, true, false, true},
+        new boolean[]{false, true, false, true},
+        OrcFile.WriterVersion.HIVE_4243,
+        bloomFilterKinds);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+    assertEquals(null, bloomFilterKinds[3]);
+    assertEquals("range start: 0 end: 2000", ranges.toString());
+    assertEquals("range start: 4000 end: 5000", ranges.next.toString());
+    assertEquals(null, ranges.next.next);
+
+    // check that we are handling the post hive-12055 strings correctly
+    Arrays.fill(bloomFilterKinds, null);
+    ranges = RecordReaderUtils.planIndexReading(schema, footer,
+        true, null, new boolean[]{false, true, true, true},
+        OrcFile.WriterVersion.HIVE_12055, bloomFilterKinds);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+    assertEquals(null, bloomFilterKinds[2]);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[3]);
+    assertEquals("range start: 0 end: 3000", ranges.toString());
+    assertEquals("range start: 4000 end: 6000", ranges.next.toString());
+    assertEquals(null, ranges.next.next);
+
+    // ignore non-utf8 bloom filter on decimal
+    Arrays.fill(bloomFilterKinds, null);
+    ranges = RecordReaderUtils.planIndexReading(schema, footer,
+        true, null,
+        new boolean[]{false, false, true, false},
+        OrcFile.WriterVersion.HIVE_4243,
+        bloomFilterKinds);
+    assertEquals(null, bloomFilterKinds[2]);
+    assertEquals("range start: 0 end: 1000", ranges.toString());
+    assertEquals("range start: 2000 end: 3000", ranges.next.toString());
+    assertEquals("range start: 4000 end: 5000", ranges.next.next.toString());
+    assertEquals(null, ranges.next.next.next);
+  }
+
+  @Test
+  public void TestCompatibleBloomFilters() throws Exception {
+    OrcProto.StripeFooter footer =
+        OrcProto.StripeFooter.newBuilder()
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(1).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(1).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(2).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(2).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(2).setKind(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(3).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(3).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(3).setKind(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).setLength(1000).build())
+            .build();
+    TypeDescription schema = TypeDescription.fromString("struct<x:int,y:decimal(10,2),z:string>");
+    OrcProto.Stream.Kind[] bloomFilterKinds = new OrcProto.Stream.Kind[4];
+
+    // normal read
+    DiskRangeList ranges = RecordReaderUtils.planIndexReading(schema, footer,
+        false, new boolean[]{true, true, false, true},
+        new boolean[]{false, true, false, true},
+        OrcFile.WriterVersion.HIVE_4243,
+        bloomFilterKinds);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[3]);
+    assertEquals("range start: 0 end: 2000", ranges.toString());
+    assertEquals("range start: 5000 end: 6000", ranges.next.toString());
+    assertEquals("range start: 7000 end: 8000", ranges.next.next.toString());
+    assertEquals(null, ranges.next.next.next);
+
+    // ignore non-utf8 bloom filters: decimal reads BLOOM_FILTER_UTF8, skipping the old stream
+    Arrays.fill(bloomFilterKinds, null);
+    ranges = RecordReaderUtils.planIndexReading(schema, footer,
+        true, null,
+        new boolean[]{false, true, true, false},
+        OrcFile.WriterVersion.HIVE_4243,
+        bloomFilterKinds);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[2]);
+    assertEquals("range start: 0 end: 3000", ranges.toString());
+    assertEquals("range start: 4000 end: 6000", ranges.next.toString());
+    assertEquals(null, ranges.next.next);
+  }
+
+  @Test
+  public void TestNewBloomFilters() throws Exception {
+    OrcProto.StripeFooter footer =
+        OrcProto.StripeFooter.newBuilder()
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(1).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(1).setKind(OrcProto.Stream.Kind.BLOOM_FILTER).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(2).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(2).setKind(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(3).setKind(OrcProto.Stream.Kind.ROW_INDEX).setLength(1000).build())
+            .addStreams(OrcProto.Stream.newBuilder()
+                .setColumn(3).setKind(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).setLength(1000).build())
+            .build();
+    TypeDescription schema = TypeDescription.fromString("struct<x:int,y:decimal(10,2),z:string>");
+    OrcProto.Stream.Kind[] bloomFilterKinds = new OrcProto.Stream.Kind[4];
+
+    // normal read
+    DiskRangeList ranges = RecordReaderUtils.planIndexReading(schema, footer,
+        false, new boolean[]{true, true, false, true},
+        new boolean[]{false, true, false, true},
+        OrcFile.WriterVersion.HIVE_4243,
+        bloomFilterKinds);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[3]);
+    assertEquals("range start: 0 end: 2000", ranges.toString());
+    assertEquals("range start: 4000 end: 6000", ranges.next.toString());
+    assertEquals(null, ranges.next.next);
+
+    // ignore non-utf8 bloom filters: decimal has only a utf8 stream, so all reads merge into one
+    Arrays.fill(bloomFilterKinds, null);
+    ranges = RecordReaderUtils.planIndexReading(schema, footer,
+        true, null,
+        new boolean[]{false, true, true, false},
+        OrcFile.WriterVersion.HIVE_4243,
+        bloomFilterKinds);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
+    assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[2]);
+    assertEquals("range start: 0 end: 5000", ranges.toString());
+    assertEquals(null, ranges.next);
+  }
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/test/org/apache/orc/util/TestMurmur3.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/util/TestMurmur3.java b/java/core/src/test/org/apache/orc/util/TestMurmur3.java
new file mode 100644
index 0000000..575e250
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/util/TestMurmur3.java
@@ -0,0 +1,225 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+import static org.junit.Assert.assertEquals;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+
+import org.apache.orc.util.Murmur3;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * Tests for Murmur3 variants.
+ */
+public class TestMurmur3 {
+
+  @Test
+  public void testHashCodesM3_32_string() {
+    String key = "test";
+    int seed = 123;
+    HashFunction hf = Hashing.murmur3_32(seed);
+    int hc1 = hf.hashBytes(key.getBytes()).asInt();
+    int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
+    assertEquals(hc1, hc2);
+
+    key = "testkey";
+    hc1 = hf.hashBytes(key.getBytes()).asInt();
+    hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
+    assertEquals(hc1, hc2);
+  }
+
+  @Test
+  public void testHashCodesM3_32_ints() {
+    int seed = 123;
+    Random rand = new Random(seed);
+    HashFunction hf = Hashing.murmur3_32(seed);
+    for (int i = 0; i < 1000; i++) {
+      int val = rand.nextInt();
+      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
+      int hc1 = hf.hashBytes(data).asInt();
+      int hc2 = Murmur3.hash32(data, data.length, seed);
+      assertEquals(hc1, hc2);
+    }
+  }
+
+  @Test
+  public void testHashCodesM3_32_longs() {
+    int seed = 123;
+    Random rand = new Random(seed);
+    HashFunction hf = Hashing.murmur3_32(seed);
+    for (int i = 0; i < 1000; i++) {
+      long val = rand.nextLong();
+      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
+      int hc1 = hf.hashBytes(data).asInt();
+      int hc2 = Murmur3.hash32(data, data.length, seed);
+      assertEquals(hc1, hc2);
+    }
+  }
+
+  @Test
+  public void testHashCodesM3_32_double() {
+    int seed = 123;
+    Random rand = new Random(seed);
+    HashFunction hf = Hashing.murmur3_32(seed);
+    for (int i = 0; i < 1000; i++) {
+      double val = rand.nextDouble();
+      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
+      int hc1 = hf.hashBytes(data).asInt();
+      int hc2 = Murmur3.hash32(data, data.length, seed);
+      assertEquals(hc1, hc2);
+    }
+  }
+
+  @Test
+  public void testHashCodesM3_128_string() {
+    String key = "test";
+    int seed = 123;
+    HashFunction hf = Hashing.murmur3_128(seed);
+    // guava stores the hashcodes in little endian order
+    ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
+    buf.put(hf.hashBytes(key.getBytes()).asBytes());
+    buf.flip();
+    long gl1 = buf.getLong();
+    long gl2 = buf.getLong(8);
+    long[] hc = Murmur3.hash128(key.getBytes(), 0, key.getBytes().length, seed);
+    long m1 = hc[0];
+    long m2 = hc[1];
+    assertEquals(gl1, m1);
+    assertEquals(gl2, m2);
+
+    key = "testkey128_testkey128";
+    buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
+    buf.put(hf.hashBytes(key.getBytes()).asBytes());
+    buf.flip();
+    gl1 = buf.getLong();
+    gl2 = buf.getLong(8);
+    byte[] keyBytes = key.getBytes();
+    hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed);
+    m1 = hc[0];
+    m2 = hc[1];
+    assertEquals(gl1, m1);
+    assertEquals(gl2, m2);
+
+    byte[] offsetKeyBytes = new byte[keyBytes.length + 35];
+    Arrays.fill(offsetKeyBytes, (byte) -1);
+    System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length);
+    hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed);
+    assertEquals(gl1, hc[0]);
+    assertEquals(gl2, hc[1]);
+  }
+
+  @Test
+  public void testHashCodeM3_64() {
+    byte[] origin = ("It was the best of times, it was the worst of times," +
+        " it was the age of wisdom, it was the age of foolishness," +
+        " it was the epoch of belief, it was the epoch of incredulity," +
+        " it was the season of Light, it was the season of Darkness," +
+        " it was the spring of hope, it was the winter of despair," +
+        " we had everything before us, we had nothing before us," +
+        " we were all going direct to Heaven," +
+        " we were all going direct the other way.").getBytes();
+    long hash = Murmur3.hash64(origin, 0, origin.length);
+    assertEquals(305830725663368540L, hash);
+
+    byte[] originOffset = new byte[origin.length + 150];
+    Arrays.fill(originOffset, (byte) 123);
+    System.arraycopy(origin, 0, originOffset, 150, origin.length);
+    hash = Murmur3.hash64(originOffset, 150, origin.length);
+    assertEquals(305830725663368540L, hash);
+  }
+
+  @Test
+  public void testHashCodesM3_128_ints() {
+    int seed = 123;
+    Random rand = new Random(seed);
+    HashFunction hf = Hashing.murmur3_128(seed);
+    for (int i = 0; i < 1000; i++) {
+      int val = rand.nextInt();
+      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
+      // guava stores the hashcodes in little endian order
+      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
+      buf.put(hf.hashBytes(data).asBytes());
+      buf.flip();
+      long gl1 = buf.getLong();
+      long gl2 = buf.getLong(8);
+      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
+      long m1 = hc[0];
+      long m2 = hc[1];
+      assertEquals(gl1, m1);
+      assertEquals(gl2, m2);
+
+      byte[] offsetData = new byte[data.length + 50];
+      System.arraycopy(data, 0, offsetData, 50, data.length);
+      hc = Murmur3.hash128(offsetData, 50, data.length, seed);
+      assertEquals(gl1, hc[0]);
+      assertEquals(gl2, hc[1]);
+    }
+  }
+
+  @Test
+  public void testHashCodesM3_128_longs() {
+    int seed = 123;
+    Random rand = new Random(seed);
+    HashFunction hf = Hashing.murmur3_128(seed);
+    for (int i = 0; i < 1000; i++) {
+      long val = rand.nextLong();
+      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
+      // guava stores the hashcodes in little endian order
+      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
+      buf.put(hf.hashBytes(data).asBytes());
+      buf.flip();
+      long gl1 = buf.getLong();
+      long gl2 = buf.getLong(8);
+      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
+      long m1 = hc[0];
+      long m2 = hc[1];
+      assertEquals(gl1, m1);
+      assertEquals(gl2, m2);
+    }
+  }
+
+  @Test
+  public void testHashCodesM3_128_double() {
+    int seed = 123;
+    Random rand = new Random(seed);
+    HashFunction hf = Hashing.murmur3_128(seed);
+    for (int i = 0; i < 1000; i++) {
+      double val = rand.nextDouble();
+      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
+      // guava stores the hashcodes in little endian order
+      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
+      buf.put(hf.hashBytes(data).asBytes());
+      buf.flip();
+      long gl1 = buf.getLong();
+      long gl2 = buf.getLong(8);
+      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
+      long m1 = hc[0];
+      long m2 = hc[1];
+      assertEquals(gl1, m1);
+      assertEquals(gl2, m2);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/core/src/test/resources/log4j.properties b/java/core/src/test/resources/log4j.properties
index d2c063d..fae44b6 100644
--- a/java/core/src/test/resources/log4j.properties
+++ b/java/core/src/test/resources/log4j.properties
@@ -15,3 +15,6 @@ log4j.rootLogger=WARN,stdout
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
 log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
 log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n
+
+# Suppress the warnings about native io not being available
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/mapreduce/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/mapreduce/src/test/resources/log4j.properties b/java/mapreduce/src/test/resources/log4j.properties
index d2c063d..fae44b6 100644
--- a/java/mapreduce/src/test/resources/log4j.properties
+++ b/java/mapreduce/src/test/resources/log4j.properties
@@ -15,3 +15,6 @@ log4j.rootLogger=WARN,stdout
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
 log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
 log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n
+
+# Suppress the warnings about native io not being available
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java b/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
deleted file mode 100644
index e60690d..0000000
--- a/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are
- * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of
- * bloom filter false positive (element not present in bloom filter but test() says true) are
- * possible but false negatives are not possible (if element is present then test() will never
- * say false). The false positive probability is configurable (default: 5%) depending on which
- * storage requirement may increase or decrease. Lower the false positive probability greater
- * is the space requirement.
- * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter.
- * During the creation of bloom filter expected number of entries must be specified. If the number
- * of insertions exceed the specified initial number of entries then false positive probability will
- * increase accordingly.
- *
- * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash
- * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash
- * collisions for specific sequence of repeating bytes. Check the following link for more info
- * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
- */
-public class BloomFilter {
-  public static final double DEFAULT_FPP = 0.05;
-  protected BitSet bitSet;
-  protected int numBits;
-  protected int numHashFunctions;
-
-  public BloomFilter() {
-  }
-
-  public BloomFilter(long expectedEntries) {
-    this(expectedEntries, DEFAULT_FPP);
-  }
-
-  static void checkArgument(boolean expression, String message) {
-    if (!expression) {
-      throw new IllegalArgumentException(message);
-    }
-  }
-
-  public BloomFilter(long expectedEntries, double fpp) {
-    checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
-    checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
-    int nb = optimalNumOfBits(expectedEntries, fpp);
-    // make 'm' multiple of 64
-    this.numBits = nb + (Long.SIZE - (nb % Long.SIZE));
-    this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits);
-    this.bitSet = new BitSet(numBits);
-  }
-
-  /**
-   * A constructor to support rebuilding the BloomFilter from a serialized representation.
-   * @param bits
-   * @param numBits
-   * @param numFuncs
-   */
-  public BloomFilter(List<Long> bits, int numBits, int numFuncs) {
-    super();
-    long[] copied = new long[bits.size()];
-    for (int i = 0; i < bits.size(); i++) copied[i] = bits.get(i);
-    bitSet = new BitSet(copied);
-    this.numBits = numBits;
-    numHashFunctions = numFuncs;
-  }
-
-  static int optimalNumOfHashFunctions(long n, long m) {
-    return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
-  }
-
-  static int optimalNumOfBits(long n, double p) {
-    return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
-  }
-
-  public void add(byte[] val) {
-    if (val == null) {
-      addBytes(val, -1, -1);
-    } else {
-      addBytes(val, 0, val.length);
-    }
-  }
-
-  public void addBytes(byte[] val, int offset, int length) {
-    // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter"
-    // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively
-    // implement a Bloom filter without any loss in the asymptotic false positive probability'
-
-    // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned
-    // in the above paper
-    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
-        Murmur3.hash64(val, offset, length);
-    addHash(hash64);
-  }
-
-  private void addHash(long hash64) {
-    int hash1 = (int) hash64;
-    int hash2 = (int) (hash64 >>> 32);
-
-    for (int i = 1; i <= numHashFunctions; i++) {
-      int combinedHash = hash1 + (i * hash2);
-      // hashcode should be positive, flip all the bits if it's negative
-      if (combinedHash < 0) {
-        combinedHash = ~combinedHash;
-      }
-      int pos = combinedHash % numBits;
-      bitSet.set(pos);
-    }
-  }
-
-  public void addString(String val) {
-    if (val == null) {
-      add(null);
-    } else {
-      add(val.getBytes());
-    }
-  }
-
-  public void addLong(long val) {
-    addHash(getLongHash(val));
-  }
-
-  public void addDouble(double val) {
-    addLong(Double.doubleToLongBits(val));
-  }
-
-  public boolean test(byte[] val) {
-    if (val == null) {
-      return testBytes(val, -1, -1);
-    }
-    return testBytes(val, 0, val.length);
-  }
-
-  public boolean testBytes(byte[] val, int offset, int length) {
-    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
-        Murmur3.hash64(val, offset, length);
-    return testHash(hash64);
-  }
-
-  /**
-   * Checks the bits that addHash() would set for the given 64-bit hash: probe i (1-based) is
-   * low + i * high in int arithmetic, complemented when negative, modulo numBits.
-   *
-   * @param hash64 - 64-bit hash of the value being looked up
-   * @return - false as soon as one probed bit is unset, true if all of them are set
-   */
-  private boolean testHash(long hash64) {
-    final int low = (int) hash64;
-    final int high = (int) (hash64 >>> 32);
-
-    int round = 1;
-    while (round <= numHashFunctions) {
-      final int combined = low + (round * high);
-      // a negative value is bitwise complemented, which always yields a non-negative int
-      final int nonNegative = (combined < 0) ? ~combined : combined;
-      if (!bitSet.get(nonNegative % numBits)) {
-        return false;
-      }
-      round++;
-    }
-    return true;
-  }
-
-  /**
-   * Tests a string against the filter via its byte representation.
-   *
-   * NOTE(review): the no-argument String.getBytes() encodes with the platform default charset,
-   * so a lookup only matches what addString() stored if both ran with the same charset.
-   *
-   * @param val - string to look up; null is forwarded to test() as a null array
-   * @return - the result of test() for the encoded bytes
-   */
-  public boolean testString(String val) {
-    return test(val == null ? null : val.getBytes());
-  }
-
-  /**
-   * Tests a long against the filter, using getLongHash() of the value as the 64-bit hash.
-   *
-   * @param val - value to look up
-   * @return - the result of testHash for that hash
-   */
-  public boolean testLong(long val) {
-    final long hash64 = getLongHash(val);
-    return testHash(hash64);
-  }
-
-  // Thomas Wang's integer hash function
-  // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
-  //
-  // Mixes the 64-bit key through a fixed sequence of shift/add/xor steps and returns the
-  // result. addLong() and testLong() both feed the result to the bit-position logic, so the
-  // exact steps below decide which bits a long value maps to.
-  //
-  // NOTE(review): the right shifts here are arithmetic (>>), i.e. sign-propagating. The
-  // reference hash64shift on the page above appears to use logical shifts (>>>) - confirm,
-  // but do not "fix" this without considering filters already written with this variant.
-  private long getLongHash(long key) {
-    key = (~key) + (key << 21); // key = (key << 21) - key - 1;
-    key = key ^ (key >> 24);
-    key = (key + (key << 3)) + (key << 8); // key * 265
-    key = key ^ (key >> 14);
-    key = (key + (key << 2)) + (key << 4); // key * 21
-    key = key ^ (key >> 28);
-    key = key + (key << 31);
-    return key;
-  }
-
-  /**
-   * Tests a double against the filter by looking up its Double.doubleToLongBits()
-   * representation as a long.
-   *
-   * @param val - value to look up
-   * @return - the result of testLong for those bits
-   */
-  public boolean testDouble(double val) {
-    final long bits = Double.doubleToLongBits(val);
-    return testLong(bits);
-  }
-
-  /**
-   * Size of the bit set in bytes, computed as getBitSize() divided by eight.
-   */
-  public long sizeInBytes() {
-    final int bits = getBitSize();
-    return bits / Byte.SIZE;
-  }
-
-  /**
-   * Number of bits held by the backing long array (array length times Long.SIZE),
-   * computed in int arithmetic.
-   */
-  public int getBitSize() {
-    final long[] words = bitSet.getData();
-    return words.length * Long.SIZE;
-  }
-
-  /**
-   * Number of bit positions that are set or probed for each value.
-   */
-  public int getNumHashFunctions() {
-    return this.numHashFunctions;
-  }
-
-  /**
-   * Returns the long array backing the bit set. This is the array itself rather than a copy,
-   * so writes by the caller are visible to the filter.
-   */
-  public long[] getBitSet() {
-    final long[] words = bitSet.getData();
-    return words;
-  }
-
-  /**
-   * Short description of the filter: its bit count (m) and number of hash functions (k).
-   */
-  @Override
-  public String toString() {
-    return new StringBuilder("m: ")
-        .append(numBits)
-        .append(" k: ")
-        .append(numHashFunctions)
-        .toString();
-  }
-
-  /**
-   * ORs the bits of another bloom filter into this one. The other filter must be a different
-   * instance with the same number of bits and the same number of hash functions; merging a
-   * filter with itself is rejected like any other incompatible filter.
-   *
-   * @param that - bloom filter to merge
-   * @throws IllegalArgumentException if the filters are not compatible
-   */
-  public void merge(BloomFilter that) {
-    final boolean mergeable = this != that
-        && this.numBits == that.numBits
-        && this.numHashFunctions == that.numHashFunctions;
-    if (!mergeable) {
-      throw new IllegalArgumentException("BloomFilters are not compatible for merging." +
-          " this - " + this.toString() + " that - " + that.toString());
-    }
-    this.bitSet.putAll(that.bitSet);
-  }
-
-  /**
-   * Clears every bit of the filter; numBits and numHashFunctions are not touched.
-   */
-  public void reset() {
-    bitSet.clear();
-  }
-
-  /**
-   * Bare metal bit set implementation. For performance reasons, this implementation does not check
-   * for index bounds nor expand the bit set size if the specified index is greater than the size.
-   * Construction and merging validate their arguments explicitly, because an assert is skipped
-   * unless the JVM runs with -ea.
-   */
-  public class BitSet {
-    private final long[] data;
-
-    /**
-     * Creates an all-zero bit set with room for the requested number of bits, rounded up to a
-     * whole number of 64-bit words.
-     *
-     * @param bits - requested capacity in bits
-     * @throws IllegalArgumentException if the capacity rounds to zero words
-     */
-    public BitSet(long bits) {
-      this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
-    }
-
-    /**
-     * Deserialize long array as bit set. The array is used as-is, not copied.
-     *
-     * @param data - bit array
-     * @throws IllegalArgumentException if the array is empty
-     */
-    public BitSet(long[] data) {
-      // Checked unconditionally: an empty array would otherwise only fail later, inside
-      // set()/get(), with an ArrayIndexOutOfBoundsException.
-      if (data.length == 0) {
-        throw new IllegalArgumentException("data length is zero!");
-      }
-      this.data = data;
-    }
-
-    /**
-     * Sets the bit at specified index.
-     *
-     * @param index - position
-     */
-    public void set(int index) {
-      // index >>> 6 selects the 64-bit word; the long shift uses only the low six bits of index
-      data[index >>> 6] |= (1L << index);
-    }
-
-    /**
-     * Returns true if the bit is set in the specified index.
-     *
-     * @param index - position
-     * @return - value at the bit position
-     */
-    public boolean get(int index) {
-      return (data[index >>> 6] & (1L << index)) != 0;
-    }
-
-    /**
-     * Number of bits
-     */
-    public long bitSize() {
-      return (long) data.length * Long.SIZE;
-    }
-
-    /**
-     * Returns the backing array itself, not a copy.
-     */
-    public long[] getData() {
-      return data;
-    }
-
-    /**
-     * Combines the two BitArrays using bitwise OR.
-     *
-     * @param array - bit set whose bits are ORed into this one; must have the same length
-     * @throws IllegalArgumentException if the two backing arrays differ in length
-     */
-    public void putAll(BitSet array) {
-      // Checked up front so a mismatch can neither drop bits silently nor fail halfway
-      // through with this bit set partially merged.
-      if (data.length != array.data.length) {
-        throw new IllegalArgumentException(
-            "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")");
-      }
-      for (int i = 0; i < data.length; i++) {
-        data[i] |= array.data[i];
-      }
-    }
-
-    /**
-     * Clear the bit set.
-     */
-    public void clear() {
-      Arrays.fill(data, 0);
-    }
-  }
-}

[4/4] orc git commit: ORC-101 using little endian encoding of bloom filter bitsets and update spec.

Posted by om...@apache.org.

ORC-101 using little endian encoding of bloom filter bitsets and update spec.


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/604dcc80
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/604dcc80
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/604dcc80

Branch: refs/heads/master
Commit: 604dcc801fb2cdb68fe8284c0facf66a32bfe119
Parents: 9d39cb8
Author: Owen O'Malley <om...@apache.org>
Authored: Tue Sep 20 15:51:36 2016 -0500
Committer: Owen O'Malley <om...@apache.org>
Committed: Wed Sep 21 11:38:57 2016 -0500

----------------------------------------------------------------------
 .../java/org/apache/orc/util/BloomFilterIO.java |   6 +-
 .../resources/orc-file-dump-bloomfilter.out     | 104 +++++++-------
 .../resources/orc-file-dump-bloomfilter2.out    | 116 ++++++++--------
 .../tools/src/test/resources/orc-file-dump.json | 134 +++++++++----------
 site/_data/releases.yml                         |   4 +
 site/_docs/spec-index.md                        |  11 +-
 site/_docs/stripes.md                           |   4 +
 7 files changed, 196 insertions(+), 183 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
index ebd8c49..a6c3940 100644
--- a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
+++ b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
@@ -24,7 +24,7 @@ import org.apache.orc.OrcProto;
 import org.apache.orc.TypeDescription;
 
 import java.nio.ByteBuffer;
-import java.util.Arrays;
+import java.nio.ByteOrder;
 
 public class BloomFilterIO  {
 
@@ -62,7 +62,8 @@ public class BloomFilterIO  {
       case BLOOM_FILTER_UTF8: {
         ByteString bits = bloomFilter.getUtf8Bitset();
         long[] values = new long[bits.size() / 8];
-        bits.asReadOnlyByteBuffer().asLongBuffer().get(values);
+        bits.asReadOnlyByteBuffer().order(ByteOrder.LITTLE_ENDIAN)
+            .asLongBuffer().get(values);
         return new BloomFilterUtf8(values, numFuncs);
       }
       default:
@@ -82,6 +83,7 @@ public class BloomFilterIO  {
     long[] bitset = bloomFilter.getBitSet();
     if (bloomFilter instanceof BloomFilterUtf8) {
       ByteBuffer buffer = ByteBuffer.allocate(bitset.length * 8);
+      buffer.order(ByteOrder.LITTLE_ENDIAN);
       buffer.asLongBuffer().put(bitset);
       builder.setUtf8Bitset(ByteString.copyFrom(buffer));
     } else {

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index b879bed..e23327a 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -39,17 +39,17 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 743
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 749
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
     Stream: column 3 section ROW_INDEX start: 355 length 87
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 304
-    Stream: column 1 section DATA start: 746 length 20035
-    Stream: column 2 section DATA start: 20781 length 40050
-    Stream: column 3 section DATA start: 60831 length 3543
-    Stream: column 3 section LENGTH start: 64374 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 64399 length 133
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 310
+    Stream: column 1 section DATA start: 752 length 20035
+    Stream: column 2 section DATA start: 20787 length 40050
+    Stream: column 3 section DATA start: 60837 length 3543
+    Stream: column 3 section LENGTH start: 64380 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 64405 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -67,17 +67,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 64618 data: 63775 rows: 5000 tail: 86 index: 736
-    Stream: column 0 section ROW_INDEX start: 64618 length 17
-    Stream: column 1 section ROW_INDEX start: 64635 length 164
-    Stream: column 2 section ROW_INDEX start: 64799 length 168
-    Stream: column 3 section ROW_INDEX start: 64967 length 83
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 65050 length 304
-    Stream: column 1 section DATA start: 65354 length 20035
-    Stream: column 2 section DATA start: 85389 length 40050
-    Stream: column 3 section DATA start: 125439 length 3532
-    Stream: column 3 section LENGTH start: 128971 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 128996 length 133
+  Stripe: offset: 64624 data: 63775 rows: 5000 tail: 86 index: 742
+    Stream: column 0 section ROW_INDEX start: 64624 length 17
+    Stream: column 1 section ROW_INDEX start: 64641 length 164
+    Stream: column 2 section ROW_INDEX start: 64805 length 168
+    Stream: column 3 section ROW_INDEX start: 64973 length 83
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 65056 length 310
+    Stream: column 1 section DATA start: 65366 length 20035
+    Stream: column 2 section DATA start: 85401 length 40050
+    Stream: column 3 section DATA start: 125451 length 3532
+    Stream: column 3 section LENGTH start: 128983 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 129008 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -95,17 +95,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 129215 data: 63787 rows: 5000 tail: 86 index: 742
-    Stream: column 0 section ROW_INDEX start: 129215 length 17
-    Stream: column 1 section ROW_INDEX start: 129232 length 163
-    Stream: column 2 section ROW_INDEX start: 129395 length 168
-    Stream: column 3 section ROW_INDEX start: 129563 length 90
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 129653 length 304
-    Stream: column 1 section DATA start: 129957 length 20035
-    Stream: column 2 section DATA start: 149992 length 40050
-    Stream: column 3 section DATA start: 190042 length 3544
-    Stream: column 3 section LENGTH start: 193586 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 193611 length 133
+  Stripe: offset: 129227 data: 63787 rows: 5000 tail: 86 index: 748
+    Stream: column 0 section ROW_INDEX start: 129227 length 17
+    Stream: column 1 section ROW_INDEX start: 129244 length 163
+    Stream: column 2 section ROW_INDEX start: 129407 length 168
+    Stream: column 3 section ROW_INDEX start: 129575 length 90
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 129665 length 310
+    Stream: column 1 section DATA start: 129975 length 20035
+    Stream: column 2 section DATA start: 150010 length 40050
+    Stream: column 3 section DATA start: 190060 length 3544
+    Stream: column 3 section LENGTH start: 193604 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 193629 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -123,17 +123,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 193830 data: 63817 rows: 5000 tail: 85 index: 744
-    Stream: column 0 section ROW_INDEX start: 193830 length 17
-    Stream: column 1 section ROW_INDEX start: 193847 length 165
-    Stream: column 2 section ROW_INDEX start: 194012 length 167
-    Stream: column 3 section ROW_INDEX start: 194179 length 91
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 194270 length 304
-    Stream: column 1 section DATA start: 194574 length 20035
-    Stream: column 2 section DATA start: 214609 length 40050
-    Stream: column 3 section DATA start: 254659 length 3574
-    Stream: column 3 section LENGTH start: 258233 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 258258 length 133
+  Stripe: offset: 193848 data: 63817 rows: 5000 tail: 85 index: 750
+    Stream: column 0 section ROW_INDEX start: 193848 length 17
+    Stream: column 1 section ROW_INDEX start: 193865 length 165
+    Stream: column 2 section ROW_INDEX start: 194030 length 167
+    Stream: column 3 section ROW_INDEX start: 194197 length 91
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 194288 length 310
+    Stream: column 1 section DATA start: 194598 length 20035
+    Stream: column 2 section DATA start: 214633 length 40050
+    Stream: column 3 section DATA start: 254683 length 3574
+    Stream: column 3 section LENGTH start: 258257 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 258282 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -151,17 +151,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 258476 data: 12943 rows: 1000 tail: 78 index: 382
-    Stream: column 0 section ROW_INDEX start: 258476 length 12
-    Stream: column 1 section ROW_INDEX start: 258488 length 38
-    Stream: column 2 section ROW_INDEX start: 258526 length 41
-    Stream: column 3 section ROW_INDEX start: 258567 length 40
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 258607 length 251
-    Stream: column 1 section DATA start: 258858 length 4007
-    Stream: column 2 section DATA start: 262865 length 8010
-    Stream: column 3 section DATA start: 270875 length 768
-    Stream: column 3 section LENGTH start: 271643 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 271668 length 133
+  Stripe: offset: 258500 data: 12943 rows: 1000 tail: 78 index: 375
+    Stream: column 0 section ROW_INDEX start: 258500 length 12
+    Stream: column 1 section ROW_INDEX start: 258512 length 38
+    Stream: column 2 section ROW_INDEX start: 258550 length 41
+    Stream: column 3 section ROW_INDEX start: 258591 length 40
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 258631 length 244
+    Stream: column 1 section DATA start: 258875 length 4007
+    Stream: column 2 section DATA start: 262882 length 8010
+    Stream: column 3 section DATA start: 270892 length 768
+    Stream: column 3 section LENGTH start: 271660 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 271685 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -172,7 +172,7 @@ Stripes:
       Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
 
-File length: 272427 bytes
+File length: 272444 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 ________________________________________________________________________________________________________________________

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index 75cd5f4..8296382 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -39,7 +39,7 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14950
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14949
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
@@ -47,12 +47,12 @@ Stripes:
     Stream: column 2 section BLOOM_FILTER_UTF8 start: 6890 length 6046
     Stream: column 3 section ROW_INDEX start: 12936 length 87
     Stream: column 3 section BLOOM_FILTER start: 13023 length 1038
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 892
-    Stream: column 1 section DATA start: 14953 length 20035
-    Stream: column 2 section DATA start: 34988 length 40050
-    Stream: column 3 section DATA start: 75038 length 3543
-    Stream: column 3 section LENGTH start: 78581 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 78606 length 133
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 891
+    Stream: column 1 section DATA start: 14952 length 20035
+    Stream: column 2 section DATA start: 34987 length 40050
+    Stream: column 3 section DATA start: 75037 length 3543
+    Stream: column 3 section LENGTH start: 78580 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 78605 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -70,20 +70,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482
-  Stripe: offset: 78843 data: 63775 rows: 5000 tail: 103 index: 14941
-    Stream: column 0 section ROW_INDEX start: 78843 length 17
-    Stream: column 1 section ROW_INDEX start: 78860 length 164
-    Stream: column 2 section ROW_INDEX start: 79024 length 168
-    Stream: column 2 section BLOOM_FILTER start: 79192 length 6533
-    Stream: column 2 section BLOOM_FILTER_UTF8 start: 85725 length 6046
-    Stream: column 3 section ROW_INDEX start: 91771 length 83
-    Stream: column 3 section BLOOM_FILTER start: 91854 length 1038
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 92892 length 892
-    Stream: column 1 section DATA start: 93784 length 20035
-    Stream: column 2 section DATA start: 113819 length 40050
-    Stream: column 3 section DATA start: 153869 length 3532
-    Stream: column 3 section LENGTH start: 157401 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 157426 length 133
+  Stripe: offset: 78842 data: 63775 rows: 5000 tail: 103 index: 14940
+    Stream: column 0 section ROW_INDEX start: 78842 length 17
+    Stream: column 1 section ROW_INDEX start: 78859 length 164
+    Stream: column 2 section ROW_INDEX start: 79023 length 168
+    Stream: column 2 section BLOOM_FILTER start: 79191 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 85724 length 6046
+    Stream: column 3 section ROW_INDEX start: 91770 length 83
+    Stream: column 3 section BLOOM_FILTER start: 91853 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 92891 length 891
+    Stream: column 1 section DATA start: 93782 length 20035
+    Stream: column 2 section DATA start: 113817 length 40050
+    Stream: column 3 section DATA start: 153867 length 3532
+    Stream: column 3 section LENGTH start: 157399 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 157424 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -101,20 +101,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205
-  Stripe: offset: 157662 data: 63787 rows: 5000 tail: 104 index: 14947
-    Stream: column 0 section ROW_INDEX start: 157662 length 17
-    Stream: column 1 section ROW_INDEX start: 157679 length 163
-    Stream: column 2 section ROW_INDEX start: 157842 length 168
-    Stream: column 2 section BLOOM_FILTER start: 158010 length 6533
-    Stream: column 2 section BLOOM_FILTER_UTF8 start: 164543 length 6046
-    Stream: column 3 section ROW_INDEX start: 170589 length 90
-    Stream: column 3 section BLOOM_FILTER start: 170679 length 1038
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 171717 length 892
-    Stream: column 1 section DATA start: 172609 length 20035
-    Stream: column 2 section DATA start: 192644 length 40050
-    Stream: column 3 section DATA start: 232694 length 3544
-    Stream: column 3 section LENGTH start: 236238 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 236263 length 133
+  Stripe: offset: 157660 data: 63787 rows: 5000 tail: 104 index: 14946
+    Stream: column 0 section ROW_INDEX start: 157660 length 17
+    Stream: column 1 section ROW_INDEX start: 157677 length 163
+    Stream: column 2 section ROW_INDEX start: 157840 length 168
+    Stream: column 2 section BLOOM_FILTER start: 158008 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 164541 length 6046
+    Stream: column 3 section ROW_INDEX start: 170587 length 90
+    Stream: column 3 section BLOOM_FILTER start: 170677 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 171715 length 891
+    Stream: column 1 section DATA start: 172606 length 20035
+    Stream: column 2 section DATA start: 192641 length 40050
+    Stream: column 3 section DATA start: 232691 length 3544
+    Stream: column 3 section LENGTH start: 236235 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 236260 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -132,20 +132,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444
-  Stripe: offset: 236500 data: 63817 rows: 5000 tail: 103 index: 14940
-    Stream: column 0 section ROW_INDEX start: 236500 length 17
-    Stream: column 1 section ROW_INDEX start: 236517 length 165
-    Stream: column 2 section ROW_INDEX start: 236682 length 167
-    Stream: column 2 section BLOOM_FILTER start: 236849 length 6524
-    Stream: column 2 section BLOOM_FILTER_UTF8 start: 243373 length 6046
-    Stream: column 3 section ROW_INDEX start: 249419 length 91
-    Stream: column 3 section BLOOM_FILTER start: 249510 length 1038
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 250548 length 892
-    Stream: column 1 section DATA start: 251440 length 20035
-    Stream: column 2 section DATA start: 271475 length 40050
-    Stream: column 3 section DATA start: 311525 length 3574
-    Stream: column 3 section LENGTH start: 315099 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 315124 length 133
+  Stripe: offset: 236497 data: 63817 rows: 5000 tail: 103 index: 14939
+    Stream: column 0 section ROW_INDEX start: 236497 length 17
+    Stream: column 1 section ROW_INDEX start: 236514 length 165
+    Stream: column 2 section ROW_INDEX start: 236679 length 167
+    Stream: column 2 section BLOOM_FILTER start: 236846 length 6524
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 243370 length 6046
+    Stream: column 3 section ROW_INDEX start: 249416 length 91
+    Stream: column 3 section BLOOM_FILTER start: 249507 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 250545 length 891
+    Stream: column 1 section DATA start: 251436 length 20035
+    Stream: column 2 section DATA start: 271471 length 40050
+    Stream: column 3 section DATA start: 311521 length 3574
+    Stream: column 3 section LENGTH start: 315095 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 315120 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -163,15 +163,15 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165
-  Stripe: offset: 315360 data: 12943 rows: 1000 tail: 96 index: 3542
-    Stream: column 0 section ROW_INDEX start: 315360 length 12
-    Stream: column 1 section ROW_INDEX start: 315372 length 38
-    Stream: column 2 section ROW_INDEX start: 315410 length 41
-    Stream: column 2 section BLOOM_FILTER start: 315451 length 1337
-    Stream: column 2 section BLOOM_FILTER_UTF8 start: 316788 length 1211
-    Stream: column 3 section ROW_INDEX start: 317999 length 40
-    Stream: column 3 section BLOOM_FILTER start: 318039 length 472
-    Stream: column 3 section BLOOM_FILTER_UTF8 start: 318511 length 391
+  Stripe: offset: 315356 data: 12943 rows: 1000 tail: 96 index: 3546
+    Stream: column 0 section ROW_INDEX start: 315356 length 12
+    Stream: column 1 section ROW_INDEX start: 315368 length 38
+    Stream: column 2 section ROW_INDEX start: 315406 length 41
+    Stream: column 2 section BLOOM_FILTER start: 315447 length 1337
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 316784 length 1211
+    Stream: column 3 section ROW_INDEX start: 317995 length 40
+    Stream: column 3 section BLOOM_FILTER start: 318035 length 472
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 318507 length 395
     Stream: column 1 section DATA start: 318902 length 4007
     Stream: column 2 section DATA start: 322909 length 8010
     Stream: column 3 section DATA start: 330919 length 768

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/java/tools/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json
index 3dd0dc0..b3e9d12 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -254,9 +254,9 @@
       "stripeNumber": 1,
       "stripeInformation": {
         "offset": 3,
-        "indexLength": 762,
+        "indexLength": 768,
         "dataLength": 63770,
-        "footerLength": 89,
+        "footerLength": 88,
         "rowCount": 5000
       },
       "streams": [
@@ -288,42 +288,42 @@
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
           "startOffset": 461,
-          "length": 304
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 765,
+          "startOffset": 771,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 20800,
+          "startOffset": 20806,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 60850,
+          "startOffset": 60856,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 60867,
+          "startOffset": 60873,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 64377,
+          "startOffset": 64383,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 64402,
+          "startOffset": 64408,
           "length": 133
         }
       ],
@@ -494,8 +494,8 @@
     {
       "stripeNumber": 2,
       "stripeInformation": {
-        "offset": 64624,
-        "indexLength": 753,
+        "offset": 64629,
+        "indexLength": 759,
         "dataLength": 63763,
         "footerLength": 87,
         "rowCount": 5000
@@ -504,67 +504,67 @@
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 64624,
+          "startOffset": 64629,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 64641,
+          "startOffset": 64646,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 64807,
+          "startOffset": 64812,
           "length": 166
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 64973,
+          "startOffset": 64978,
           "length": 100
         },
         {
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
-          "startOffset": 65073,
-          "length": 304
+          "startOffset": 65078,
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 65377,
+          "startOffset": 65388,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 85412,
+          "startOffset": 85423,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 125462,
+          "startOffset": 125473,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 125479,
+          "startOffset": 125490,
           "length": 3503
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 128982,
+          "startOffset": 128993,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 129007,
+          "startOffset": 129018,
           "length": 133
         }
       ],
@@ -735,77 +735,77 @@
     {
       "stripeNumber": 3,
       "stripeInformation": {
-        "offset": 129227,
-        "indexLength": 754,
+        "offset": 129238,
+        "indexLength": 760,
         "dataLength": 63770,
-        "footerLength": 89,
+        "footerLength": 88,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 129227,
+          "startOffset": 129238,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 129244,
+          "startOffset": 129255,
           "length": 164
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 129408,
+          "startOffset": 129419,
           "length": 167
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 129575,
+          "startOffset": 129586,
           "length": 102
         },
         {
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
-          "startOffset": 129677,
-          "length": 304
+          "startOffset": 129688,
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 129981,
+          "startOffset": 129998,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 150016,
+          "startOffset": 150033,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 190066,
+          "startOffset": 190083,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 190083,
+          "startOffset": 190100,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 193593,
+          "startOffset": 193610,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 193618,
+          "startOffset": 193635,
           "length": 133
         }
       ],
@@ -976,8 +976,8 @@
     {
       "stripeNumber": 4,
       "stripeInformation": {
-        "offset": 193840,
-        "indexLength": 765,
+        "offset": 193856,
+        "indexLength": 771,
         "dataLength": 63756,
         "footerLength": 89,
         "rowCount": 5000
@@ -986,67 +986,67 @@
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 193840,
+          "startOffset": 193856,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 193857,
+          "startOffset": 193873,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 194023,
+          "startOffset": 194039,
           "length": 171
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 194194,
+          "startOffset": 194210,
           "length": 107
         },
         {
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
-          "startOffset": 194301,
-          "length": 304
+          "startOffset": 194317,
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 194605,
+          "startOffset": 194627,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 214640,
+          "startOffset": 214662,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 254690,
+          "startOffset": 254712,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 254707,
+          "startOffset": 254729,
           "length": 3496
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 258203,
+          "startOffset": 258225,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 258228,
+          "startOffset": 258250,
           "length": 133
         }
       ],
@@ -1217,8 +1217,8 @@
     {
       "stripeNumber": 5,
       "stripeInformation": {
-        "offset": 258450,
-        "indexLength": 383,
+        "offset": 258472,
+        "indexLength": 376,
         "dataLength": 12943,
         "footerLength": 83,
         "rowCount": 1000
@@ -1227,67 +1227,67 @@
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 258450,
+          "startOffset": 258472,
           "length": 12
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 258462,
+          "startOffset": 258484,
           "length": 38
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 258500,
+          "startOffset": 258522,
           "length": 41
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 258541,
+          "startOffset": 258563,
           "length": 41
         },
         {
           "columnId": 3,
           "section": "BLOOM_FILTER_UTF8",
-          "startOffset": 258582,
-          "length": 251
+          "startOffset": 258604,
+          "length": 244
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 258833,
+          "startOffset": 258848,
           "length": 4007
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 262840,
+          "startOffset": 262855,
           "length": 8010
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 270850,
+          "startOffset": 270865,
           "length": 16
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 270866,
+          "startOffset": 270881,
           "length": 752
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 271618,
+          "startOffset": 271633,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 271643,
+          "startOffset": 271658,
           "length": 133
         }
       ],
@@ -1348,7 +1348,7 @@
       }]
     }
   ],
-  "fileLength": 272409,
+  "fileLength": 272428,
   "paddingLength": 0,
   "paddingRatio": 0,
   "status": "OK"

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_data/releases.yml
----------------------------------------------------------------------
diff --git a/site/_data/releases.yml b/site/_data/releases.yml
index 3331688..1282115 100644
--- a/site/_data/releases.yml
+++ b/site/_data/releases.yml
@@ -9,6 +9,7 @@
   sha256: 5c394c7ed3a31d20726ded55ed9c5a0eeff1bd5b85b1cb2ee6c3c1a94560578c
   known-issues:
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use inconsistent encoding
 
 1.1.2:
   date: 2016-07-08
@@ -19,6 +20,7 @@
   known-issues:
     HIVE-14214: Schema evolution and predicate pushdown don't work together.
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use inconsistent encoding
 
 1.1.1:
   date: 2016-06-13
@@ -29,6 +31,7 @@
   known-issues:
     HIVE-14214: Schema evolution and predicate pushdown don't work together.
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use inconsistent encoding
 
 1.1.0:
   date: 2016-06-10
@@ -39,6 +42,7 @@
   known-issues:
     HIVE-14214: Schema evolution and predicate pushdown don't work together.
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use inconsistent encoding
 
 1.0.0:
   date: 2016-01-25

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_docs/spec-index.md
----------------------------------------------------------------------
diff --git a/site/_docs/spec-index.md b/site/_docs/spec-index.md
index 009df59..263c9a8 100644
--- a/site/_docs/spec-index.md
+++ b/site/_docs/spec-index.md
@@ -57,14 +57,17 @@ group (default to 10,000 rows) in a column. Only the row groups that
 satisfy min/max row index evaluation will be evaluated against the
 bloom filter index.
 
-Each BloomFilterEntry stores the number of hash functions ('k') used and
-the bitset backing the bloom filter. The bitset is serialized as repeated
-longs from which the number of bits ('m') for the bloom filter can be derived.
-m = bitset.length * 64.
+Each BloomFilterEntry stores the number of hash functions ('k') used
+and the bitset backing the bloom filter. The original encoding (pre
+ORC-101) of bloom filters stored the bits as a repeating sequence of
+longs in the bitset field with a little endian encoding (0x1 is bit 0
+and 0x2 is bit 1). After ORC-101, the encoding is a sequence of bytes
+with a little endian encoding in the utf8bitset field.
 
 ```message BloomFilter {
  optional uint32 numHashFunctions = 1;
  repeated fixed64 bitset = 2;
+ optional bytes utf8bitset = 3;
 }
 ```
 

http://git-wip-us.apache.org/repos/asf/orc/blob/604dcc80/site/_docs/stripes.md
----------------------------------------------------------------------
diff --git a/site/_docs/stripes.md b/site/_docs/stripes.md
index d53f709..cc85feb 100644
--- a/site/_docs/stripes.md
+++ b/site/_docs/stripes.md
@@ -56,6 +56,10 @@ depends on the type and encoding of the column.
  SECONDARY = 5;
  // the index for seeking to particular row groups
  ROW_INDEX = 6;
+ // original bloom filters used before ORC-101
+ BLOOM_FILTER = 7;
+ // bloom filters that consistently use utf8
+ BLOOM_FILTER_UTF8 = 8;
  }
  required Kind kind = 1;
  // the column id

[3/4] orc git commit: ORC-101 Correct bloom filters for strings and decimals to use utf8 encoding.

Posted by om...@apache.org.

ORC-101 Correct bloom filters for strings and decimals to use utf8 encoding.


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/9d39cb80
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/9d39cb80
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/9d39cb80

Branch: refs/heads/master
Commit: 9d39cb80f455f7c341bd4a9421651badb1d137f3
Parents: 7118e96
Author: Owen O'Malley <om...@apache.org>
Authored: Tue Sep 13 13:28:44 2016 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue Sep 20 15:12:57 2016 -0500

----------------------------------------------------------------------
 c++/include/orc/Reader.hh                       |   1 +
 c++/src/Reader.cc                               |   2 +
 .../src/java/org/apache/orc/BloomFilterIO.java  |  50 --
 .../src/java/org/apache/orc/DataReader.java     |   7 +-
 java/core/src/java/org/apache/orc/OrcConf.java  |  10 +
 java/core/src/java/org/apache/orc/OrcFile.java  |  51 +-
 .../java/org/apache/orc/TypeDescription.java    |  26 +
 .../orc/impl/ConvertTreeReaderFactory.java      |  12 +-
 .../src/java/org/apache/orc/impl/OrcIndex.java  |  10 +-
 .../org/apache/orc/impl/RecordReaderImpl.java   |  70 ++-
 .../org/apache/orc/impl/RecordReaderUtils.java  | 192 ++++++--
 .../org/apache/orc/impl/SchemaEvolution.java    |   4 +
 .../java/org/apache/orc/impl/StreamName.java    |   1 +
 .../java/org/apache/orc/impl/WriterImpl.java    | 228 +++++++--
 .../java/org/apache/orc/util/BloomFilter.java   | 312 ++++++++++++
 .../java/org/apache/orc/util/BloomFilterIO.java |  93 ++++
 .../org/apache/orc/util/BloomFilterUtf8.java    |  55 +++
 .../test/org/apache/orc/TestVectorOrcFile.java  |   4 +-
 .../apache/orc/impl/TestRecordReaderImpl.java   | 484 +++++++++++++------
 .../test/org/apache/orc/util/TestMurmur3.java   | 225 +++++++++
 java/core/src/test/resources/log4j.properties   |   3 +
 .../src/test/resources/log4j.properties         |   3 +
 .../apache/hive/common/util/BloomFilter.java    | 313 ------------
 .../org/apache/hive/common/util/Murmur3.java    | 335 -------------
 .../src/java/org/apache/orc/util/Murmur3.java   | 335 +++++++++++++
 .../apache/hive/common/util/TestMurmur3.java    | 224 ---------
 .../src/java/org/apache/orc/tools/FileDump.java |  20 +-
 .../java/org/apache/orc/tools/JsonFileDump.java |  27 +-
 .../test/org/apache/orc/tools/TestFileDump.java |   6 +-
 java/tools/src/test/resources/log4j.properties  |  21 +
 .../resources/orc-file-dump-bloomfilter.out     | 106 ++--
 .../resources/orc-file-dump-bloomfilter2.out    | 121 +++--
 .../orc-file-dump-dictionary-threshold.out      |   2 +-
 .../tools/src/test/resources/orc-file-dump.json | 150 +++---
 java/tools/src/test/resources/orc-file-dump.out |   2 +-
 .../src/test/resources/orc-file-has-null.out    |   2 +-
 proto/orc_proto.proto                           |   2 +
 37 files changed, 2115 insertions(+), 1394 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/c++/include/orc/Reader.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 25a0a17..eacbd80 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -53,6 +53,7 @@ namespace orc {
     WriterVersion_HIVE_4243 = 2,
     WriterVersion_HIVE_12055 = 3,
     WriterVersion_HIVE_13083 = 4,
+    WriterVersion_ORC_101 = 5,
     WriterVersion_MAX = INT64_MAX
   };
 

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/c++/src/Reader.cc
----------------------------------------------------------------------
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 9b1f1b9..91f4ea1 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -72,6 +72,8 @@ namespace orc {
       return "HIVE-12055";
     case WriterVersion_HIVE_13083:
       return "HIVE-13083";
+    case WriterVersion_ORC_101:
+      return "ORC-101";
     }
     std::stringstream buffer;
     buffer << "future - " << version;

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/BloomFilterIO.java b/java/core/src/java/org/apache/orc/BloomFilterIO.java
deleted file mode 100644
index 106227d..0000000
--- a/java/core/src/java/org/apache/orc/BloomFilterIO.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import org.apache.hive.common.util.BloomFilter;
-
-public class BloomFilterIO extends BloomFilter {
-
-  public BloomFilterIO(long expectedEntries) {
-    super(expectedEntries, DEFAULT_FPP);
-  }
-
-  public BloomFilterIO(long expectedEntries, double fpp) {
-    super(expectedEntries, fpp);
-  }
-
-  static long[] toArray(OrcProto.BloomFilter filter) {
-    long[] result = new long[filter.getBitsetCount()];
-    int i =0;
-    for(Long l: filter.getBitsetList()) {
-      result[i++] = l;
-    }
-    return result;
-  }
-
-/**
- * Initializes the BloomFilter from the given Orc BloomFilter
- */
-  public BloomFilterIO(OrcProto.BloomFilter bloomFilter) {
-    this.bitSet = new BitSet(toArray(bloomFilter));
-    this.numHashFunctions = bloomFilter.getNumHashFunctions();
-    this.numBits = (int) this.bitSet.bitSize();
-  }
-}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/DataReader.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/DataReader.java b/java/core/src/java/org/apache/orc/DataReader.java
index a5dbb76..b3f91f2 100644
--- a/java/core/src/java/org/apache/orc/DataReader.java
+++ b/java/core/src/java/org/apache/orc/DataReader.java
@@ -31,9 +31,14 @@ public interface DataReader extends AutoCloseable {
   void open() throws IOException;
 
   OrcIndex readRowIndex(StripeInformation stripe,
+                        TypeDescription fileSchema,
                         OrcProto.StripeFooter footer,
-                        boolean[] included, OrcProto.RowIndex[] indexes,
+                        boolean ignoreNonUtf8BloomFilter,
+                        boolean[] included,
+                        OrcProto.RowIndex[] indexes,
                         boolean[] sargColumns,
+                        OrcFile.WriterVersion version,
+                        OrcProto.Stream.Kind[] bloomFilterKinds,
                         OrcProto.BloomFilterIndex[] bloomFilterIndices
                         ) throws IOException;
 

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/OrcConf.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java
index ac8e3f0..05ab13b 100644
--- a/java/core/src/java/org/apache/orc/OrcConf.java
+++ b/java/core/src/java/org/apache/orc/OrcConf.java
@@ -105,6 +105,16 @@ public enum OrcConf {
           "dictionary or not will be retained thereafter."),
   BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
       "", "List of columns to create bloom filters for when writing."),
+  BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version",
+      "orc.bloom.filter.write.version", OrcFile.BloomFilterVersion.UTF8.toString(),
+      "Which version of the bloom filters should we write.\n" +
+          "The choices are:\n" +
+          "  original - writes two versions of the bloom filters for use by\n" +
+          "             both old and new readers.\n" +
+          "  utf8 - writes just the new bloom filters."),
+  IGNORE_NON_UTF8_BLOOM_FILTERS("orc.bloom.filter.ignore.non-utf8",
+      "orc.bloom.filter.ignore.non-utf8", false,
+      "Should the reader ignore the obsolete non-UTF8 bloom filters."),
   MAX_FILE_LENGTH("orc.max.file.length", "orc.max.file.length", Long.MAX_VALUE,
       "The maximum size of the file to read for finding the file tail. This\n" +
           "is primarily used for streaming ingest to read intermediate\n" +

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java
index ddfa9f7..6b2d48e 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -108,6 +108,7 @@ public class OrcFile {
     HIVE_4243(2), // use real column names from Hive tables
     HIVE_12055(3), // vectorized writer
     HIVE_13083(4), // decimal writer updating present stream wrongly
+    ORC_101(5),    // bloom filters use utf8
 
     // Don't use any magic numbers here except for the below:
     FUTURE(Integer.MAX_VALUE); // a version from a future writer
@@ -144,8 +145,12 @@ public class OrcFile {
       if (val == FUTURE.id) return FUTURE; // Special handling for the magic value.
       return values[val];
     }
+
+    public boolean includes(WriterVersion other) {
+      return id >= other.id;
+    }
   }
-  public static final WriterVersion CURRENT_WRITER = WriterVersion.HIVE_13083;
+  public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_101;
 
   public enum EncodingStrategy {
     SPEED, COMPRESSION
@@ -231,6 +236,33 @@ public class OrcFile {
     void preFooterWrite(WriterContext context) throws IOException;
   }
 
+  public static enum BloomFilterVersion {
+    // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support
+    // both old and new readers.
+    ORIGINAL("original"),
+    // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8.
+    // See ORC-101
+    UTF8("utf8");
+
+    private final String id;
+    private BloomFilterVersion(String id) {
+      this.id = id;
+    }
+
+    public String toString() {
+      return id;
+    }
+
+    public static BloomFilterVersion fromString(String s) {
+      for (BloomFilterVersion version: values()) {
+        if (version.id.equals(s)) {
+          return version;
+        }
+      }
+      throw new IllegalArgumentException("Unknown BloomFilterVersion " + s);
+    }
+  }
+
   /**
    * Options for creating ORC file writers.
    */
@@ -253,6 +285,7 @@ public class OrcFile {
     private double paddingTolerance;
     private String bloomFilterColumns;
     private double bloomFilterFpp;
+    private BloomFilterVersion bloomFilterVersion;
 
     protected WriterOptions(Properties tableProperties, Configuration conf) {
       configuration = conf;
@@ -286,6 +319,10 @@ public class OrcFile {
           conf);
       bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties,
           conf);
+      bloomFilterVersion =
+          BloomFilterVersion.fromString(
+              OrcConf.BLOOM_FILTER_WRITE_VERSION.getString(tableProperties,
+                  conf));
     }
 
     /**
@@ -430,6 +467,14 @@ public class OrcFile {
     }
 
     /**
+     * Set the version of the bloom filters to write.
+     */
+    public WriterOptions bloomFilterVersion(BloomFilterVersion version) {
+      this.bloomFilterVersion = version;
+      return this;
+    }
+
+    /**
      * A package local option to set the memory manager.
      */
     protected WriterOptions memory(MemoryManager value) {
@@ -508,6 +553,10 @@ public class OrcFile {
     public double getBloomFilterFpp() {
       return bloomFilterFpp;
     }
+
+    public BloomFilterVersion getBloomFilterVersion() {
+      return bloomFilterVersion;
+    }
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index da9fe49..bc6787d 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -842,4 +842,30 @@ public class TypeDescription
     printJsonToBuffer("", buffer, 0);
     return buffer.toString();
   }
+
+  /**
+   * Locate a subtype (this type or one of its descendants) by its id.
+   * @param goal the column id to look for
+   * @return the subtype whose id equals goal
+   * @throws IllegalArgumentException if goal is outside this type's id range
+   */
+  public TypeDescription findSubtype(int goal) {
+    // call getId method to make sure the ids are assigned
+    int id = getId();
+    if (goal < id || goal > maxId) {
+      throw new IllegalArgumentException("Unknown type id " + goal + " in " + toJson());
+    }
+    if (goal == id) {
+      return this;
+    } else {
+      TypeDescription prev = null;
+      for(TypeDescription next: children) {
+        if (next.id > goal) {
+          return prev.findSubtype(goal);
+        }
+        prev = next;
+      }
+      return prev.findSubtype(goal);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
index 36b9a20..20e0faa 100644
--- a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
@@ -1408,7 +1408,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
     public void setConvertVectorElement(int elementNum) {
       long longValue = longColVector.vector[elementNum];
       String string = anyIntegerAsLongTreeReader.getString(longValue);
-      byte[] bytes = string.getBytes();
+      byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
       assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
     }
 
@@ -1450,7 +1450,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
       float floatValue = (float) doubleColVector.vector[elementNum];
       if (!Float.isNaN(floatValue)) {
         String string = String.valueOf(floatValue);
-        byte[] bytes = string.getBytes();
+        byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
         assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
       } else {
         bytesColVector.noNulls = false;
@@ -1495,7 +1495,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
       double doubleValue = doubleColVector.vector[elementNum];
       if (!Double.isNaN(doubleValue)) {
         String string = String.valueOf(doubleValue);
-        byte[] bytes = string.getBytes();
+        byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
         assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
       } else {
         bytesColVector.noNulls = false;
@@ -1544,7 +1544,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
     @Override
     public void setConvertVectorElement(int elementNum) {
       String string = decimalColVector.vector[elementNum].getHiveDecimal().toString();
-      byte[] bytes = string.getBytes();
+      byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
       assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
     }
 
@@ -1584,7 +1584,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
     public void setConvertVectorElement(int elementNum) throws IOException {
       String string =
           timestampColVector.asScratchTimestamp(elementNum).toString();
-      byte[] bytes = string.getBytes();
+      byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
       assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
     }
 
@@ -1626,7 +1626,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
     public void setConvertVectorElement(int elementNum) throws IOException {
       date.setTime(DateWritable.daysToMillis((int) longColVector.vector[elementNum]));
       String string = date.toString();
-      byte[] bytes = string.getBytes();
+      byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
       assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
     }
 

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/OrcIndex.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/OrcIndex.java b/java/core/src/java/org/apache/orc/impl/OrcIndex.java
index 50a15f2..edcb3ba 100644
--- a/java/core/src/java/org/apache/orc/impl/OrcIndex.java
+++ b/java/core/src/java/org/apache/orc/impl/OrcIndex.java
@@ -22,10 +22,14 @@ import org.apache.orc.OrcProto;
 
 public final class OrcIndex {
   OrcProto.RowIndex[] rowGroupIndex;
+  OrcProto.Stream.Kind[] bloomFilterKinds;
   OrcProto.BloomFilterIndex[] bloomFilterIndex;
 
-  public OrcIndex(OrcProto.RowIndex[] rgIndex, OrcProto.BloomFilterIndex[] bfIndex) {
+  public OrcIndex(OrcProto.RowIndex[] rgIndex,
+                  OrcProto.Stream.Kind[] bloomFilterKinds,
+                  OrcProto.BloomFilterIndex[] bfIndex) {
     this.rowGroupIndex = rgIndex;
+    this.bloomFilterKinds = bloomFilterKinds;
     this.bloomFilterIndex = bfIndex;
   }
 
@@ -37,6 +41,10 @@ public final class OrcIndex {
     return bloomFilterIndex;
   }
 
+  public OrcProto.Stream.Kind[] getBloomFilterKinds() {
+    return bloomFilterKinds;
+  }
+
   public void setRowGroupIndex(OrcProto.RowIndex[] rowGroupIndex) {
     this.rowGroupIndex = rowGroupIndex;
   }

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index e8ad54d..c7ce2bb 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -27,7 +27,9 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.OrcFile;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
 import org.apache.orc.BooleanColumnStatistics;
 import org.apache.orc.ColumnStatistics;
 import org.apache.orc.CompressionCodec;
@@ -88,10 +90,13 @@ public class RecordReaderImpl implements RecordReader {
   private final TreeReaderFactory.TreeReader reader;
   private final OrcProto.RowIndex[] indexes;
   private final OrcProto.BloomFilterIndex[] bloomFilterIndices;
+  private final OrcProto.Stream.Kind[] bloomFilterKind;
   private final SargApplier sargApp;
   // an array about which row groups aren't skipped
   private boolean[] includedRowGroups = null;
   private final DataReader dataReader;
+  private final boolean ignoreNonUtf8BloomFilter;
+  private final OrcFile.WriterVersion writerVersion;
 
   /**
    * Given a list of column names, find the given column and return the index.
@@ -134,6 +139,7 @@ public class RecordReaderImpl implements RecordReader {
   protected RecordReaderImpl(ReaderImpl fileReader,
                              Reader.Options options) throws IOException {
     this.included = options.getInclude();
+    this.writerVersion = fileReader.getWriterVersion();
     included[0] = true;
     if (options.getSchema() == null) {
       if (LOG.isInfoEnabled()) {
@@ -162,11 +168,14 @@ public class RecordReaderImpl implements RecordReader {
     this.types = fileReader.types;
     this.bufferSize = fileReader.bufferSize;
     this.rowIndexStride = fileReader.rowIndexStride;
+    this.ignoreNonUtf8BloomFilter =
+        OrcConf.IGNORE_NON_UTF8_BLOOM_FILTERS.getBoolean(fileReader.conf);
     SearchArgument sarg = options.getSearchArgument();
     if (sarg != null && rowIndexStride != 0) {
       sargApp = new SargApplier(sarg, options.getColumnNames(),
                                 rowIndexStride,
-                                included.length, evolution);
+                                included.length, evolution,
+                                writerVersion);
     } else {
       sargApp = null;
     }
@@ -218,6 +227,7 @@ public class RecordReaderImpl implements RecordReader {
     writerIncluded = evolution.getFileIncluded();
     indexes = new OrcProto.RowIndex[types.size()];
     bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
+    bloomFilterKind = new OrcProto.Stream.Kind[types.size()];
     advanceToNextRow(reader, 0L, true);
   }
 
@@ -339,20 +349,23 @@ public class RecordReaderImpl implements RecordReader {
    * that is referenced in the predicate.
    * @param statsProto the statistics for the column mentioned in the predicate
    * @param predicate the leaf predicate we need to evaluation
-   * @param bloomFilter
+   * @param bloomFilter the bloom filter
+   * @param writerVersion the version of software that wrote the file
+   * @param type the category of this column's type
    * @return the set of truth values that may be returned for the given
    *   predicate.
    */
   static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
-      PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) {
+                                           PredicateLeaf predicate,
+                                           OrcProto.Stream.Kind kind,
+                                           OrcProto.BloomFilter bloomFilter,
+                                           OrcFile.WriterVersion writerVersion,
+                                           TypeDescription.Category type) {
     ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto);
     Object minValue = getMin(cs);
     Object maxValue = getMax(cs);
-    BloomFilterIO bf = null;
-    if (bloomFilter != null) {
-      bf = new BloomFilterIO(bloomFilter);
-    }
-    return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf);
+    return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(),
+        BloomFilterIO.deserialize(kind, writerVersion, type, bloomFilter));
   }
 
   /**
@@ -365,14 +378,14 @@ public class RecordReaderImpl implements RecordReader {
    */
   public static TruthValue evaluatePredicate(ColumnStatistics stats,
                                              PredicateLeaf predicate,
-                                             BloomFilterIO bloomFilter) {
+                                             BloomFilter bloomFilter) {
     Object minValue = getMin(stats);
     Object maxValue = getMax(stats);
     return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter);
   }
 
   static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min,
-      Object max, boolean hasNull, BloomFilterIO bloomFilter) {
+      Object max, boolean hasNull, BloomFilter bloomFilter) {
     // if we didn't have any values, everything must have been null
     if (min == null) {
       if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) {
@@ -421,7 +434,7 @@ public class RecordReaderImpl implements RecordReader {
   }
 
   private static boolean shouldEvaluateBloomFilter(PredicateLeaf predicate,
-      TruthValue result, BloomFilterIO bloomFilter) {
+      TruthValue result, BloomFilter bloomFilter) {
     // evaluate bloom filter only when
     // 1) Bloom filter is available
     // 2) Min/Max evaluation yield YES or MAYBE
@@ -531,7 +544,7 @@ public class RecordReaderImpl implements RecordReader {
   }
 
   private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate,
-      final Object predObj, BloomFilterIO bloomFilter, boolean hasNull) {
+      final Object predObj, BloomFilter bloomFilter, boolean hasNull) {
     switch (predicate.getOperator()) {
       case NULL_SAFE_EQUALS:
         // null safe equals does not return *_NULL variant. So set hasNull to false
@@ -553,7 +566,7 @@ public class RecordReaderImpl implements RecordReader {
     }
   }
 
-  private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) {
+  private static TruthValue checkInBloomFilter(BloomFilter bf, Object predObj, boolean hasNull) {
     TruthValue result = hasNull ? TruthValue.NO_NULL : TruthValue.NO;
 
     if (predObj instanceof Long) {
@@ -708,6 +721,7 @@ public class RecordReaderImpl implements RecordReader {
     public final static boolean[] READ_ALL_RGS = null;
     public final static boolean[] READ_NO_RGS = new boolean[0];
 
+    private final OrcFile.WriterVersion writerVersion;
     private final SearchArgument sarg;
     private final List<PredicateLeaf> sargLeaves;
     private final int[] filterColumns;
@@ -716,10 +730,13 @@ public class RecordReaderImpl implements RecordReader {
     private final boolean[] sargColumns;
     private SchemaEvolution evolution;
 
-    public SargApplier(SearchArgument sarg, String[] columnNames,
+    public SargApplier(SearchArgument sarg,
+                       String[] columnNames,
                        long rowIndexStride,
                        int includedCount,
-                       SchemaEvolution evolution) {
+                       SchemaEvolution evolution,
+                       OrcFile.WriterVersion writerVersion) {
+      this.writerVersion = writerVersion;
       this.sarg = sarg;
       sargLeaves = sarg.getLeaves();
       filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves,
@@ -745,8 +762,11 @@ public class RecordReaderImpl implements RecordReader {
      * row groups must be read.
      * @throws IOException
      */
-    public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes,
-        OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException {
+    public boolean[] pickRowGroups(StripeInformation stripe,
+                                   OrcProto.RowIndex[] indexes,
+                                   OrcProto.Stream.Kind[] bloomFilterKinds,
+                                   OrcProto.BloomFilterIndex[] bloomFilterIndices,
+                                   boolean returnNone) throws IOException {
       long rowsInStripe = stripe.getNumberOfRows();
       int groupsInStripe = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
       boolean[] result = new boolean[groupsInStripe]; // TODO: avoid alloc?
@@ -765,11 +785,15 @@ public class RecordReaderImpl implements RecordReader {
             }
             OrcProto.ColumnStatistics stats = entry.getStatistics();
             OrcProto.BloomFilter bf = null;
+            OrcProto.Stream.Kind bfk = null;
             if (bloomFilterIndices != null && bloomFilterIndices[columnIx] != null) {
+              bfk = bloomFilterKinds[columnIx];
               bf = bloomFilterIndices[columnIx].getBloomFilter(rowGroup);
             }
             if (evolution != null && evolution.isPPDSafeConversion(columnIx)) {
-              leafValues[pred] = evaluatePredicateProto(stats, sargLeaves.get(pred), bf);
+              leafValues[pred] = evaluatePredicateProto(stats,
+                  sargLeaves.get(pred), bfk, bf, writerVersion,
+                  evolution.getFileSchema().findSubtype(columnIx).getCategory());
             } else {
               leafValues[pred] = TruthValue.YES_NO_NULL;
             }
@@ -809,7 +833,8 @@ public class RecordReaderImpl implements RecordReader {
       return null;
     }
     readRowIndex(currentStripe, writerIncluded, sargApp.sargColumns);
-    return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, bloomFilterIndices, false);
+    return sargApp.pickRowGroups(stripes.get(currentStripe), indexes,
+        bloomFilterKind, bloomFilterIndices, false);
   }
 
   private void clearStreams() {
@@ -1168,8 +1193,9 @@ public class RecordReaderImpl implements RecordReader {
       sargColumns = sargColumns == null ?
           (sargApp == null ? null : sargApp.sargColumns) : sargColumns;
     }
-    return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, sargColumns,
-        bloomFilterIndex);
+    return dataReader.readRowIndex(stripe, evolution.getFileType(0), stripeFooter,
+        ignoreNonUtf8BloomFilter, included, indexes, sargColumns, writerVersion,
+        bloomFilterKind, bloomFilterIndex);
   }
 
   private void seekToRowEntry(TreeReaderFactory.TreeReader reader, int rowEntry)

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java
index 3d57732..cadee35 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java
@@ -30,13 +30,13 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.io.DiskRange;
 import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
-import org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper;
 import org.apache.orc.CompressionCodec;
 import org.apache.orc.DataReader;
+import org.apache.orc.OrcFile;
 import org.apache.orc.OrcProto;
 
 import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
 
 /**
  * Stateless methods shared between RecordReaderImpl and EncodedReaderImpl.
@@ -44,6 +44,100 @@ import org.apache.orc.StripeInformation;
 public class RecordReaderUtils {
   private static final HadoopShims SHIMS = HadoopShims.Factory.get();
 
+  static boolean hadBadBloomFilters(TypeDescription.Category category,
+                                    OrcFile.WriterVersion version) {
+    switch(category) {
+      case STRING:
+      case CHAR:
+      case VARCHAR:
+        return !version.includes(OrcFile.WriterVersion.HIVE_12055);
+      case DECIMAL:
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  /**
+   * Plans the list of disk ranges that the given stripe needs to read the
+   * indexes. All of the positions are relative to the start of the stripe.
+   * @param fileSchema the schema for the file
+   * @param footer the stripe footer
+   * @param ignoreNonUtf8BloomFilter should the reader ignore non-utf8
+   *                                 encoded bloom filters
+   * @param fileIncluded the columns (indexed by file columns) that should be
+   *                     read
+   * @param sargColumns true for the columns (indexed by file columns) that
+   *                    we need bloom filters for
+   * @param version the version of the software that wrote the file
+   * @param bloomFilterKinds (output) the stream kind of the bloom filters
+   * @return a list of merged disk ranges to read
+   */
+  static DiskRangeList planIndexReading(TypeDescription fileSchema,
+                                        OrcProto.StripeFooter footer,
+                                        boolean ignoreNonUtf8BloomFilter,
+                                        boolean[] fileIncluded,
+                                        boolean[] sargColumns,
+                                        OrcFile.WriterVersion version,
+                                        OrcProto.Stream.Kind[] bloomFilterKinds) {
+    DiskRangeList.CreateHelper result = new DiskRangeList.CreateHelper();
+    List<OrcProto.Stream> streams = footer.getStreamsList();
+    // figure out which kind of bloom filter we want for each column
+    // picks bloom_filter_utf8 if it's available, otherwise bloom_filter
+    if (sargColumns != null) {
+      for (OrcProto.Stream stream : streams) {
+        if (stream.hasKind() && stream.hasColumn()) {
+          int column = stream.getColumn();
+          if (sargColumns[column]) {
+            switch (stream.getKind()) {
+              case BLOOM_FILTER:
+                if (bloomFilterKinds[column] == null &&
+                    !(ignoreNonUtf8BloomFilter &&
+                        hadBadBloomFilters(fileSchema.findSubtype(column)
+                            .getCategory(), version))) {
+                  bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER;
+                }
+                break;
+              case BLOOM_FILTER_UTF8:
+                bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER_UTF8;
+                break;
+              default:
+                break;
+            }
+          }
+        }
+      }
+    }
+    long offset = 0;
+    for(OrcProto.Stream stream: footer.getStreamsList()) {
+      if (stream.hasKind() && stream.hasColumn()) {
+        int column = stream.getColumn();
+        if (fileIncluded == null || fileIncluded[column]) {
+          boolean needStream = false;
+          switch (stream.getKind()) {
+            case ROW_INDEX:
+              needStream = true;
+              break;
+            case BLOOM_FILTER:
+              needStream = bloomFilterKinds[column] == OrcProto.Stream.Kind.BLOOM_FILTER;
+              break;
+            case BLOOM_FILTER_UTF8:
+              needStream = bloomFilterKinds[column] == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8;
+              break;
+            default:
+              // PASS
+              break;
+          }
+          if (needStream) {
+            result.addOrMerge(offset, offset + stream.getLength(), true, false);
+          }
+        }
+      }
+      offset += stream.getLength();
+    }
+    return result.get();
+  }
+
   private static class DefaultDataReader implements DataReader {
     private FSDataInputStream file = null;
     private final ByteBufferAllocatorPool pool;
@@ -91,10 +185,14 @@ public class RecordReaderUtils {
 
     @Override
     public OrcIndex readRowIndex(StripeInformation stripe,
+                                 TypeDescription fileSchema,
                                  OrcProto.StripeFooter footer,
+                                 boolean ignoreNonUtf8BloomFilter,
                                  boolean[] included,
                                  OrcProto.RowIndex[] indexes,
                                  boolean[] sargColumns,
+                                 OrcFile.WriterVersion version,
+                                 OrcProto.Stream.Kind[] bloomFilterKinds,
                                  OrcProto.BloomFilterIndex[] bloomFilterIndices
                                  ) throws IOException {
       if (file == null) {
@@ -106,49 +204,61 @@ public class RecordReaderUtils {
       if (indexes == null) {
         indexes = new OrcProto.RowIndex[typeCount];
       }
+      if (bloomFilterKinds == null) {
+        bloomFilterKinds = new OrcProto.Stream.Kind[typeCount];
+      }
       if (bloomFilterIndices == null) {
         bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
       }
-      long offset = stripe.getOffset();
-      List<OrcProto.Stream> streams = footer.getStreamsList();
-      for (int i = 0; i < streams.size(); i++) {
-        OrcProto.Stream stream = streams.get(i);
-        OrcProto.Stream nextStream = null;
-        if (i < streams.size() - 1) {
-          nextStream = streams.get(i+1);
+      DiskRangeList ranges = planIndexReading(fileSchema, footer,
+          ignoreNonUtf8BloomFilter, included, sargColumns, version,
+          bloomFilterKinds);
+      ranges = readDiskRanges(file, zcr, stripe.getOffset(), ranges, false);
+      long offset = 0;
+      DiskRangeList range = ranges;
+      for(OrcProto.Stream stream: footer.getStreamsList()) {
+        // advance to find the next range
+        while (range != null && range.getEnd() <= offset) {
+          range = range.next;
         }
-        int col = stream.getColumn();
-        int len = (int) stream.getLength();
-        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
-        // filter and combine the io to read row index and bloom filters for that column together
-        if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
-          boolean readBloomFilter = false;
-          if (sargColumns != null && sargColumns[col] &&
-              nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
-            len += nextStream.getLength();
-            i += 1;
-            readBloomFilter = true;
-          }
-          if ((included == null || included[col]) && indexes[col] == null) {
-            byte[] buffer = new byte[len];
-            file.readFully(offset, buffer, 0, buffer.length);
-            ByteBuffer bb = ByteBuffer.wrap(buffer);
-            indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
-                ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
-                codec, bufferSize));
-            if (readBloomFilter) {
-              bb.position((int) stream.getLength());
-              bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
-                  "bloom_filter", ReaderImpl.singleton(new BufferChunk(bb, 0)),
-                  nextStream.getLength(), codec, bufferSize));
-            }
+        // no more ranges, so we are done
+        if (range == null) {
+          break;
+        }
+        int column = stream.getColumn();
+        if (stream.hasKind() && range.getOffset() <= offset) {
+          switch (stream.getKind()) {
+            case ROW_INDEX:
+              if (included == null || included[column]) {
+                ByteBuffer bb = range.getData().duplicate();
+                bb.position((int) (offset - range.getOffset()));
+                bb.limit((int) (bb.position() + stream.getLength()));
+                indexes[column] = OrcProto.RowIndex.parseFrom(
+                    InStream.createCodedInputStream("index",
+                        ReaderImpl.singleton(new BufferChunk(bb, 0)),
+                        stream.getLength(),
+                    codec, bufferSize));
+              }
+              break;
+            case BLOOM_FILTER:
+            case BLOOM_FILTER_UTF8:
+              if (sargColumns != null && sargColumns[column]) {
+                ByteBuffer bb = range.getData().duplicate();
+                bb.position((int) (offset - range.getOffset()));
+                bb.limit((int) (bb.position() + stream.getLength()));
+                bloomFilterIndices[column] = OrcProto.BloomFilterIndex.parseFrom
+                    (InStream.createCodedInputStream("bloom_filter",
+                        ReaderImpl.singleton(new BufferChunk(bb, 0)),
+                    stream.getLength(), codec, bufferSize));
+              }
+              break;
+            default:
+              break;
           }
         }
-        offset += len;
+        offset += stream.getLength();
       }
-
-      OrcIndex index = new OrcIndex(indexes, bloomFilterIndices);
-      return index;
+      return new OrcIndex(indexes, bloomFilterKinds, bloomFilterIndices);
     }
 
     @Override
@@ -234,14 +344,14 @@ public class RecordReaderUtils {
   }
 
   public static void addEntireStreamToRanges(
-      long offset, long length, CreateHelper list, boolean doMergeBuffers) {
+      long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) {
     list.addOrMerge(offset, offset + length, doMergeBuffers, false);
   }
 
   public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
       boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index,
       OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, boolean hasNull,
-      long offset, long length, CreateHelper list, boolean doMergeBuffers) {
+      long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) {
     for (int group = 0; group < includedRowGroups.length; ++group) {
       if (!includedRowGroups[group]) continue;
       int posn = getIndexPosition(
@@ -399,7 +509,7 @@ public class RecordReaderUtils {
     if (range == null) return null;
     DiskRangeList prev = range.prev;
     if (prev == null) {
-      prev = new MutateHelper(range);
+      prev = new DiskRangeList.MutateHelper(range);
     }
     while (range != null) {
       if (range.hasData()) {

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
index 1e11728..20adfd8 100644
--- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
+++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
@@ -153,6 +153,10 @@ public class SchemaEvolution {
     return hasConversion;
   }
 
+  public TypeDescription getFileSchema() {
+    return fileSchema;
+  }
+
   public TypeDescription getFileType(TypeDescription readerType) {
     return getFileType(readerType.getId());
   }

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/StreamName.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/StreamName.java b/java/core/src/java/org/apache/orc/impl/StreamName.java
index b3fd145..e3561bf 100644
--- a/java/core/src/java/org/apache/orc/impl/StreamName.java
+++ b/java/core/src/java/org/apache/orc/impl/StreamName.java
@@ -78,6 +78,7 @@ public class StreamName implements Comparable<StreamName> {
       case ROW_INDEX:
       case DICTIONARY_COUNT:
       case BLOOM_FILTER:
+      case BLOOM_FILTER_UTF8:
         return Area.INDEX;
       default:
         return Area.DATA;

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/impl/WriterImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index 3df1b76..940ef59 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -21,6 +21,7 @@ package org.apache.orc.impl;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.sql.Timestamp;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -34,12 +35,10 @@ import io.airlift.compress.lz4.Lz4Compressor;
 import io.airlift.compress.lz4.Lz4Decompressor;
 import io.airlift.compress.lzo.LzoCompressor;
 import io.airlift.compress.lzo.LzoDecompressor;
-import io.airlift.compress.snappy.SnappyCompressor;
-import io.airlift.compress.snappy.SnappyDecompressor;
-import org.apache.commons.lang.ArrayUtils;
 import org.apache.hadoop.hive.ql.util.JavaDataModel;
 import org.apache.orc.BinaryColumnStatistics;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
 import org.apache.orc.CompressionCodec;
 import org.apache.orc.CompressionKind;
 import org.apache.orc.OrcConf;
@@ -50,6 +49,7 @@ import org.apache.orc.StringColumnStatistics;
 import org.apache.orc.StripeInformation;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.Writer;
+import org.apache.orc.util.BloomFilterUtf8;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -147,6 +147,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
   private final OrcFile.CompressionStrategy compressionStrategy;
   private final boolean[] bloomFilterColumns;
   private final double bloomFilterFpp;
+  private final OrcFile.BloomFilterVersion bloomFilterVersion;
   private boolean writeTimeZone;
 
   public WriterImpl(FileSystem fs,
@@ -157,6 +158,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
     this.conf = opts.getConfiguration();
     this.callback = opts.getCallback();
     this.schema = opts.getSchema();
+    bloomFilterVersion = opts.getBloomFilterVersion();
     if (callback != null) {
       callbackContext = new OrcFile.WriterContext(){
 
@@ -426,6 +428,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
         case BLOOM_FILTER:
         case DATA:
         case DICTIONARY_DATA:
+        case BLOOM_FILTER_UTF8:
           if (getCompressionStrategy() == OrcFile.CompressionStrategy.SPEED) {
             modifiers = EnumSet.of(CompressionCodec.Modifier.FAST,
                 CompressionCodec.Modifier.TEXT);
@@ -543,6 +546,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
     public boolean hasWriterTimeZone() {
       return writeTimeZone;
     }
+
+    public OrcFile.BloomFilterVersion getBloomFilterVersion() {
+      return bloomFilterVersion;
+    }
   }
 
   /**
@@ -564,9 +571,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
     private final OrcProto.RowIndexEntry.Builder rowIndexEntry;
     private final PositionedOutputStream rowIndexStream;
     private final PositionedOutputStream bloomFilterStream;
-    protected final BloomFilterIO bloomFilter;
+    private final PositionedOutputStream bloomFilterStreamUtf8;
+    protected final BloomFilter bloomFilter;
+    protected final BloomFilterUtf8 bloomFilterUtf8;
     protected final boolean createBloomFilter;
     private final OrcProto.BloomFilterIndex.Builder bloomFilterIndex;
+    private final OrcProto.BloomFilterIndex.Builder bloomFilterIndexUtf8;
     private final OrcProto.BloomFilter.Builder bloomFilterEntry;
     private boolean foundNulls;
     private OutStream isPresentOutStream;
@@ -612,15 +622,30 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
       }
       if (createBloomFilter) {
         bloomFilterEntry = OrcProto.BloomFilter.newBuilder();
-        bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder();
-        bloomFilterStream = streamFactory.createStream(id, OrcProto.Stream.Kind.BLOOM_FILTER);
-        bloomFilter = new BloomFilterIO(streamFactory.getRowIndexStride(),
+        if (streamFactory.getBloomFilterVersion() == OrcFile.BloomFilterVersion.ORIGINAL) {
+          bloomFilter = new BloomFilter(streamFactory.getRowIndexStride(),
+              streamFactory.getBloomFilterFPP());
+          bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder();
+          bloomFilterStream = streamFactory.createStream(id,
+              OrcProto.Stream.Kind.BLOOM_FILTER);;
+        } else {
+          bloomFilter = null;
+          bloomFilterIndex = null;
+          bloomFilterStream = null;
+        }
+        bloomFilterUtf8 = new BloomFilterUtf8(streamFactory.getRowIndexStride(),
             streamFactory.getBloomFilterFPP());
+        bloomFilterIndexUtf8 = OrcProto.BloomFilterIndex.newBuilder();
+        bloomFilterStreamUtf8 = streamFactory.createStream(id,
+              OrcProto.Stream.Kind.BLOOM_FILTER_UTF8);;
       } else {
         bloomFilterEntry = null;
         bloomFilterIndex = null;
+        bloomFilterIndexUtf8 = null;
+        bloomFilterStreamUtf8 = null;
         bloomFilterStream = null;
         bloomFilter = null;
+        bloomFilterUtf8 = null;
       }
     }
 
@@ -788,7 +813,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
         bloomFilterIndex.build().writeTo(bloomFilterStream);
         bloomFilterStream.flush();
         bloomFilterIndex.clear();
-        bloomFilterEntry.clear();
+      }
+      // write the UTF-8 bloom filter index to its output stream
+      if (bloomFilterStreamUtf8 != null) {
+        bloomFilterIndexUtf8.build().writeTo(bloomFilterStreamUtf8);
+        bloomFilterStreamUtf8.flush();
+        bloomFilterIndexUtf8.clear();
       }
     }
 
@@ -837,12 +867,16 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
 
     void addBloomFilterEntry() {
       if (createBloomFilter) {
-        bloomFilterEntry.setNumHashFunctions(bloomFilter.getNumHashFunctions());
-        bloomFilterEntry.addAllBitset(Arrays.asList(ArrayUtils.toObject(
-            bloomFilter.getBitSet())));
-        bloomFilterIndex.addBloomFilter(bloomFilterEntry.build());
-        bloomFilter.reset();
-        bloomFilterEntry.clear();
+        if (bloomFilter != null) {
+          BloomFilterIO.serialize(bloomFilterEntry, bloomFilter);
+          bloomFilterIndex.addBloomFilter(bloomFilterEntry.build());
+          bloomFilter.reset();
+        }
+        if (bloomFilterUtf8 != null) {
+          BloomFilterIO.serialize(bloomFilterEntry, bloomFilterUtf8);
+          bloomFilterIndexUtf8.addBloomFilter(bloomFilterEntry.build());
+          bloomFilterUtf8.reset();
+        }
       }
     }
 
@@ -946,7 +980,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           byte value = (byte) vec.vector[0];
           indexStatistics.updateInteger(value, length);
           if (createBloomFilter) {
-            bloomFilter.addLong(value);
+            if (bloomFilter != null) {
+              bloomFilter.addLong(value);
+            }
+            bloomFilterUtf8.addLong(value);
           }
           for(int i=0; i < length; ++i) {
             writer.write(value);
@@ -959,7 +996,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             writer.write(value);
             indexStatistics.updateInteger(value, 1);
             if (createBloomFilter) {
-              bloomFilter.addLong(value);
+              if (bloomFilter != null) {
+                bloomFilter.addLong(value);
+              }
+              bloomFilterUtf8.addLong(value);
             }
           }
         }
@@ -1017,7 +1057,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           long value = vec.vector[0];
           indexStatistics.updateInteger(value, length);
           if (createBloomFilter) {
-            bloomFilter.addLong(value);
+            if (bloomFilter != null) {
+              bloomFilter.addLong(value);
+            }
+            bloomFilterUtf8.addLong(value);
           }
           for(int i=0; i < length; ++i) {
             writer.write(value);
@@ -1030,7 +1073,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             writer.write(value);
             indexStatistics.updateInteger(value, 1);
             if (createBloomFilter) {
-              bloomFilter.addLong(value);
+              if (bloomFilter != null) {
+                bloomFilter.addLong(value);
+              }
+              bloomFilterUtf8.addLong(value);
             }
           }
         }
@@ -1077,7 +1123,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           float value = (float) vec.vector[0];
           indexStatistics.updateDouble(value);
           if (createBloomFilter) {
-            bloomFilter.addDouble(value);
+            if (bloomFilter != null) {
+              bloomFilter.addDouble(value);
+            }
+            bloomFilterUtf8.addDouble(value);
           }
           for(int i=0; i < length; ++i) {
             utils.writeFloat(stream, value);
@@ -1090,7 +1139,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             utils.writeFloat(stream, value);
             indexStatistics.updateDouble(value);
             if (createBloomFilter) {
-              bloomFilter.addDouble(value);
+              if (bloomFilter != null) {
+                bloomFilter.addDouble(value);
+              }
+              bloomFilterUtf8.addDouble(value);
             }
           }
         }
@@ -1138,7 +1190,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           double value = vec.vector[0];
           indexStatistics.updateDouble(value);
           if (createBloomFilter) {
-            bloomFilter.addDouble(value);
+            if (bloomFilter != null) {
+              bloomFilter.addDouble(value);
+            }
+            bloomFilterUtf8.addDouble(value);
           }
           for(int i=0; i < length; ++i) {
             utils.writeDouble(stream, value);
@@ -1151,7 +1206,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             utils.writeDouble(stream, value);
             indexStatistics.updateDouble(value);
             if (createBloomFilter) {
-              bloomFilter.addDouble(value);
+              if (bloomFilter != null) {
+                bloomFilter.addDouble(value);
+              }
+              bloomFilterUtf8.addDouble(value);
             }
           }
         }
@@ -1430,7 +1488,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           indexStatistics.updateString(vec.vector[0], vec.start[0],
               vec.length[0], length);
           if (createBloomFilter) {
-            bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+            if (bloomFilter != null) {
+              // translate from UTF-8 to the default charset
+              bloomFilter.addString(new String(vec.vector[0], vec.start[0],
+                  vec.length[0], StandardCharsets.UTF_8));
+            }
+            bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
           }
         }
       } else {
@@ -1447,7 +1510,13 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             indexStatistics.updateString(vec.vector[offset + i],
                 vec.start[offset + i], vec.length[offset + i], 1);
             if (createBloomFilter) {
-              bloomFilter.addBytes(vec.vector[offset + i],
+              if (bloomFilter != null) {
+                // translate from UTF-8 to the default charset
+                bloomFilter.addString(new String(vec.vector[offset + i],
+                    vec.start[offset + i], vec.length[offset + i],
+                    StandardCharsets.UTF_8));
+              }
+              bloomFilterUtf8.addBytes(vec.vector[offset + i],
                   vec.start[offset + i], vec.length[offset + i]);
             }
           }
@@ -1504,7 +1573,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           }
           indexStatistics.updateString(ptr, ptrOffset, itemLength, length);
           if (createBloomFilter) {
-            bloomFilter.addBytes(ptr, ptrOffset, itemLength);
+            if (bloomFilter != null) {
+              // same bytes as indexStatistics; translate UTF-8 to default charset
+              bloomFilter.addString(new String(ptr, ptrOffset, itemLength,
+                  StandardCharsets.UTF_8));
+            }
+            bloomFilterUtf8.addBytes(ptr, ptrOffset, itemLength);
           }
         }
       } else {
@@ -1531,7 +1605,14 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             }
             indexStatistics.updateString(ptr, ptrOffset, itemLength, 1);
             if (createBloomFilter) {
-              bloomFilter.addBytes(ptr, ptrOffset, itemLength);
+              if (bloomFilter != null) {
+                // hash the same bytes that were given to indexStatistics
+                // above, translated from UTF-8 to the default charset
+                bloomFilter.addString(new String(ptr, ptrOffset, itemLength,
+                    StandardCharsets.UTF_8));
+              }
+              // keep the UTF-8 filter consistent with the removed addBytes call
+              bloomFilterUtf8.addBytes(ptr, ptrOffset, itemLength);
             }
           }
         }
@@ -1576,7 +1657,14 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           indexStatistics.updateString(vec.vector[0], vec.start[0],
               itemLength, length);
           if (createBloomFilter) {
-            bloomFilter.addBytes(vec.vector[0], vec.start[0], itemLength);
+            if (bloomFilter != null) {
+              // translate from UTF-8 to the default charset
+              bloomFilter.addString(new String(vec.vector[0],
+                  vec.start[0], itemLength,
+                  StandardCharsets.UTF_8));
+            }
+            bloomFilterUtf8.addBytes(vec.vector[0],
+                vec.start[0], itemLength);
           }
         }
       } else {
@@ -1594,7 +1682,13 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             indexStatistics.updateString(vec.vector[offset + i],
                 vec.start[offset + i], itemLength, 1);
             if (createBloomFilter) {
-              bloomFilter.addBytes(vec.vector[offset + i],
+              if (bloomFilter != null) {
+                // translate from UTF-8 to the default charset
+                bloomFilter.addString(new String(vec.vector[offset + i],
+                    vec.start[offset + i], itemLength,
+                    StandardCharsets.UTF_8));
+              }
+              bloomFilterUtf8.addBytes(vec.vector[offset + i],
                   vec.start[offset + i], itemLength);
             }
           }
@@ -1646,7 +1740,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           indexStatistics.updateBinary(vec.vector[0], vec.start[0],
               vec.length[0], length);
           if (createBloomFilter) {
-            bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+            if (bloomFilter != null) {
+              bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+            }
+            bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
           }
         }
       } else {
@@ -1658,7 +1755,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             indexStatistics.updateBinary(vec.vector[offset + i],
                 vec.start[offset + i], vec.length[offset + i], 1);
             if (createBloomFilter) {
-              bloomFilter.addBytes(vec.vector[offset + i],
+              if (bloomFilter != null) {
+                bloomFilter.addBytes(vec.vector[offset + i],
+                    vec.start[offset + i], vec.length[offset + i]);
+              }
+              bloomFilterUtf8.addBytes(vec.vector[offset + i],
                   vec.start[offset + i], vec.length[offset + i]);
             }
           }
@@ -1734,7 +1835,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           long millis = val.getTime();
           indexStatistics.updateTimestamp(millis);
           if (createBloomFilter) {
-            bloomFilter.addLong(millis);
+            if (bloomFilter != null) {
+              bloomFilter.addLong(millis);
+            }
+            bloomFilterUtf8.addLong(millis);
           }
           final long secs = millis / MILLIS_PER_SECOND - base_timestamp;
           final long nano = formatNanos(val.getNanos());
@@ -1753,7 +1857,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             nanos.write(formatNanos(val.getNanos()));
             indexStatistics.updateTimestamp(millis);
             if (createBloomFilter) {
-              bloomFilter.addLong(millis);
+              if (bloomFilter != null) {
+                bloomFilter.addLong(millis);
+              }
+              bloomFilterUtf8.addLong(millis);
             }
           }
         }
@@ -1819,7 +1926,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           int value = (int) vec.vector[0];
           indexStatistics.updateDate(value);
           if (createBloomFilter) {
-            bloomFilter.addLong(value);
+            if (bloomFilter != null) {
+              bloomFilter.addLong(value);
+            }
+            bloomFilterUtf8.addLong(value);
           }
           for(int i=0; i < length; ++i) {
             writer.write(value);
@@ -1832,7 +1942,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             writer.write(value);
             indexStatistics.updateDate(value);
             if (createBloomFilter) {
-              bloomFilter.addLong(value);
+              if (bloomFilter != null) {
+                bloomFilter.addLong(value);
+              }
+              bloomFilterUtf8.addLong(value);
             }
           }
         }
@@ -1901,7 +2014,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
           HiveDecimal value = vec.vector[0].getHiveDecimal();
           indexStatistics.updateDecimal(value);
           if (createBloomFilter) {
-            bloomFilter.addString(value.toString());
+            String str = value.toString();
+            if (bloomFilter != null) {
+              bloomFilter.addString(str);
+            }
+            bloomFilterUtf8.addString(str);
           }
           for(int i=0; i < length; ++i) {
             SerializationUtils.writeBigInteger(valueStream,
@@ -1918,7 +2035,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             scaleStream.write(value.scale());
             indexStatistics.updateDecimal(value);
             if (createBloomFilter) {
-              bloomFilter.addString(value.toString());
+              String str = value.toString();
+              if (bloomFilter != null) {
+                bloomFilter.addString(str);
+              }
+              bloomFilterUtf8.addString(str);
             }
           }
         }
@@ -2065,7 +2186,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             childrenWriters[0].writeBatch(vec.child, childOffset, childLength);
           }
           if (createBloomFilter) {
-            bloomFilter.addLong(childLength);
+            if (bloomFilter != null) {
+              bloomFilter.addLong(childLength);
+            }
+            bloomFilterUtf8.addLong(childLength);
           }
         }
       } else {
@@ -2088,6 +2212,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             } else {
               currentLength += nextLength;
             }
+            if (createBloomFilter) {
+              if (bloomFilter != null) {
+                bloomFilter.addLong(nextLength);
+              }
+              bloomFilterUtf8.addLong(nextLength);
+            }
           }
         }
         if (currentLength != 0) {
@@ -2161,7 +2291,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             childrenWriters[1].writeBatch(vec.values, childOffset, childLength);
           }
           if (createBloomFilter) {
-            bloomFilter.addLong(childLength);
+            if (bloomFilter != null) {
+              bloomFilter.addLong(childLength);
+            }
+            bloomFilterUtf8.addLong(childLength);
           }
         }
       } else {
@@ -2186,6 +2319,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             } else {
               currentLength += nextLength;
             }
+            if (createBloomFilter) {
+              if (bloomFilter != null) {
+                bloomFilter.addLong(nextLength);
+              }
+              bloomFilterUtf8.addLong(nextLength);
+            }
           }
         }
         if (currentLength != 0) {
@@ -2247,7 +2386,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
             tags.write(tag);
           }
           if (createBloomFilter) {
-            bloomFilter.addLong(tag);
+            if (bloomFilter != null) {
+              bloomFilter.addLong(tag);
+            }
+            bloomFilterUtf8.addLong(tag);
           }
           childrenWriters[tag].writeBatch(vec.fields[tag], offset, length);
         }
@@ -2275,6 +2417,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
               currentStart[tag] = i + offset;
               currentLength[tag] = 1;
             }
+            if (createBloomFilter) {
+              if (bloomFilter != null) {
+                bloomFilter.addLong(tag);
+              }
+              bloomFilterUtf8.addLong(tag);
+            }
           }
         }
         // write out any left over sequences

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/util/BloomFilter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilter.java b/java/core/src/java/org/apache/orc/util/BloomFilter.java
new file mode 100644
index 0000000..a6ff741
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/util/BloomFilter.java
@@ -0,0 +1,312 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are
+ * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of
+ * bloom filter false positive (element not present in bloom filter but test() says true) are
+ * possible but false negatives are not possible (if element is present then test() will never
+ * say false). The false positive probability is configurable (default: 5%); the storage
+ * requirement grows or shrinks with it. The lower the false positive probability, the
+ * greater the space requirement.
+ * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter.
+ * When creating a bloom filter, the expected number of entries must be specified. If the number
+ * of insertions exceeds that expected number of entries, the false positive probability will
+ * increase accordingly.
+ *
+ * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash
+ * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash
+ * collisions for specific sequence of repeating bytes. Check the following link for more info
+ * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
+ *
+ * Note that this class is here for backwards compatibility, because it uses
+ * the JVM default character set for strings. All new users should use
+ * BloomFilterUtf8, which always uses UTF8 for the encoding.
+ */
+public class BloomFilter {
+  public static final double DEFAULT_FPP = 0.05;
+  private final BitSet bitSet;
+  private final int numBits;
+  private final int numHashFunctions;
+
+  static void checkArgument(boolean expression, String message) {
+    if (!expression) {
+      throw new IllegalArgumentException(message);
+    }
+  }
+
+  public BloomFilter(long expectedEntries) {
+    this(expectedEntries, DEFAULT_FPP);
+  }
+
+  public BloomFilter(long expectedEntries, double fpp) {
+    checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
+    checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
+    int nb = optimalNumOfBits(expectedEntries, fpp);
+    // make 'm' multiple of 64
+    this.numBits = nb + (Long.SIZE - (nb % Long.SIZE));
+    this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits);
+    this.bitSet = new BitSet(numBits);
+  }
+
+  /**
+   * A constructor to support rebuilding the BloomFilter from a serialized representation.
+   * @param bits the serialized bits
+   * @param numFuncs the number of functions used
+   */
+  public BloomFilter(long[] bits, int numFuncs) {
+    super();
+    bitSet = new BitSet(bits);
+    this.numBits = (int) bitSet.bitSize();
+    numHashFunctions = numFuncs;
+  }
+
+  static int optimalNumOfHashFunctions(long n, long m) {
+    return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
+  }
+
+  static int optimalNumOfBits(long n, double p) {
+    return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
+  }
+
+  public void add(byte[] val) {
+    if (val == null) {
+      addBytes(val, -1, -1);
+    } else {
+      addBytes(val, 0, val.length);
+    }
+  }
+
+  public void addBytes(byte[] val, int offset, int length) {
+    // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter"
+    // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively
+    // implement a Bloom filter without any loss in the asymptotic false positive probability'
+
+    // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned
+    // in the above paper
+    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
+        Murmur3.hash64(val, offset, length);
+    addHash(hash64);
+  }
+
+  private void addHash(long hash64) {
+    int hash1 = (int) hash64;
+    int hash2 = (int) (hash64 >>> 32);
+
+    for (int i = 1; i <= numHashFunctions; i++) {
+      int combinedHash = hash1 + (i * hash2);
+      // hashcode should be positive, flip all the bits if it's negative
+      if (combinedHash < 0) {
+        combinedHash = ~combinedHash;
+      }
+      int pos = combinedHash % numBits;
+      bitSet.set(pos);
+    }
+  }
+
+  public void addString(String val) {
+    if (val == null) {
+      add(null);
+    } else {
+      add(val.getBytes(Charset.defaultCharset()));
+    }
+  }
+
+  public void addLong(long val) {
+    addHash(getLongHash(val));
+  }
+
+  public void addDouble(double val) {
+    addLong(Double.doubleToLongBits(val));
+  }
+
+  public boolean test(byte[] val) {
+    if (val == null) {
+      return testBytes(val, -1, -1);
+    }
+    return testBytes(val, 0, val.length);
+  }
+
+  public boolean testBytes(byte[] val, int offset, int length) {
+    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
+        Murmur3.hash64(val, offset, length);
+    return testHash(hash64);
+  }
+
+  private boolean testHash(long hash64) {
+    int hash1 = (int) hash64;
+    int hash2 = (int) (hash64 >>> 32);
+
+    for (int i = 1; i <= numHashFunctions; i++) {
+      int combinedHash = hash1 + (i * hash2);
+      // hashcode should be positive, flip all the bits if it's negative
+      if (combinedHash < 0) {
+        combinedHash = ~combinedHash;
+      }
+      int pos = combinedHash % numBits;
+      if (!bitSet.get(pos)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  public boolean testString(String val) {
+    if (val == null) {
+      return test(null);
+    } else {
+      return test(val.getBytes(Charset.defaultCharset()));
+    }
+  }
+
+  public boolean testLong(long val) {
+    return testHash(getLongHash(val));
+  }
+
+  // Thomas Wang's integer hash function
+  // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
+  private long getLongHash(long key) {
+    key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+    key = key ^ (key >> 24);
+    key = (key + (key << 3)) + (key << 8); // key * 265
+    key = key ^ (key >> 14);
+    key = (key + (key << 2)) + (key << 4); // key * 21
+    key = key ^ (key >> 28);
+    key = key + (key << 31);
+    return key;
+  }
+
+  public boolean testDouble(double val) {
+    return testLong(Double.doubleToLongBits(val));
+  }
+
+  public long sizeInBytes() {
+    return getBitSize() / 8;
+  }
+
+  public int getBitSize() {
+    return bitSet.getData().length * Long.SIZE;
+  }
+
+  public int getNumHashFunctions() {
+    return numHashFunctions;
+  }
+
+  public long[] getBitSet() {
+    return bitSet.getData();
+  }
+
+  @Override
+  public String toString() {
+    return "m: " + numBits + " k: " + numHashFunctions;
+  }
+
+  /**
+   * Merge the specified bloom filter with current bloom filter.
+   *
+   * @param that - bloom filter to merge
+   */
+  public void merge(BloomFilter that) {
+    if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) {
+      this.bitSet.putAll(that.bitSet);
+    } else {
+      throw new IllegalArgumentException("BloomFilters are not compatible for merging." +
+          " this - " + this.toString() + " that - " + that.toString());
+    }
+  }
+
+  public void reset() {
+    this.bitSet.clear();
+  }
+
+  /**
+   * Bare metal bit set implementation. For performance reasons, this implementation does not check
+   * for index bounds nor expand the bit set size if the specified index is greater than the size.
+   */
+  public static class BitSet {
+    private final long[] data;
+
+    public BitSet(long bits) {
+      this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
+    }
+
+    /**
+     * Deserialize long array as bit set.
+     *
+     * @param data - bit array
+     */
+    public BitSet(long[] data) {
+      assert data.length > 0 : "data length is zero!";
+      this.data = data;
+    }
+
+    /**
+     * Sets the bit at specified index.
+     *
+     * @param index - position
+     */
+    public void set(int index) {
+      data[index >>> 6] |= (1L << index);
+    }
+
+    /**
+     * Returns true if the bit is set in the specified index.
+     *
+     * @param index - position
+     * @return - value at the bit position
+     */
+    public boolean get(int index) {
+      return (data[index >>> 6] & (1L << index)) != 0;
+    }
+
+    /**
+     * Number of bits
+     */
+    public long bitSize() {
+      return (long) data.length * Long.SIZE;
+    }
+
+    public long[] getData() {
+      return data;
+    }
+
+    /**
+     * Combines the two BitArrays using bitwise OR.
+     */
+    public void putAll(BitSet array) {
+      assert data.length == array.data.length :
+          "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")";
+      for (int i = 0; i < data.length; i++) {
+        data[i] |= array.data[i];
+      }
+    }
+
+    /**
+     * Clear the bit set.
+     */
+    public void clear() {
+      Arrays.fill(data, 0);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
new file mode 100644
index 0000000..ebd8c49
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+import com.google.protobuf.ByteString;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcProto;
+import org.apache.orc.TypeDescription;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+public class BloomFilterIO  {
+
+  private BloomFilterIO() {
+    // never called
+  }
+
+  /**
+   * Deserialize a bloom filter from the ORC file.
+   */
+  public static BloomFilter deserialize(OrcProto.Stream.Kind kind,
+                                        OrcFile.WriterVersion fileVersion,
+                                        TypeDescription.Category type,
+                                        OrcProto.BloomFilter bloomFilter) {
+    if (bloomFilter == null) {
+      return null;
+    }
+    int numFuncs = bloomFilter.getNumHashFunctions();
+    switch (kind) {
+      case BLOOM_FILTER: {
+        long values[] = new long[bloomFilter.getBitsetCount()];
+        for (int i = 0; i < values.length; ++i) {
+          values[i] = bloomFilter.getBitset(i);
+        }
+        // After HIVE-12055 the bloom filters for strings correctly use
+        // UTF8.
+        if (fileVersion.includes(OrcFile.WriterVersion.HIVE_12055) &&
+            (type == TypeDescription.Category.STRING ||
+             type == TypeDescription.Category.CHAR ||
+             type == TypeDescription.Category.VARCHAR)) {
+          return new BloomFilterUtf8(values, numFuncs);
+        }
+        return new BloomFilter(values, numFuncs);
+      }
+      case BLOOM_FILTER_UTF8: {
+        ByteString bits = bloomFilter.getUtf8Bitset();
+        long[] values = new long[bits.size() / 8];
+        bits.asReadOnlyByteBuffer().asLongBuffer().get(values);
+        return new BloomFilterUtf8(values, numFuncs);
+      }
+      default:
+        throw new IllegalArgumentException("Unknown bloom filter kind " + kind);
+    }
+  }
+
+  /**
+   * Serialize the BloomFilter to the ORC file.
+   * @param builder the builder to write to
+   * @param bloomFilter the bloom filter to serialize
+   */
+  public static void serialize(OrcProto.BloomFilter.Builder builder,
+                               BloomFilter bloomFilter) {
+    builder.clear();
+    builder.setNumHashFunctions(bloomFilter.getNumHashFunctions());
+    long[] bitset = bloomFilter.getBitSet();
+    if (bloomFilter instanceof BloomFilterUtf8) {
+      ByteBuffer buffer = ByteBuffer.allocate(bitset.length * 8);
+      buffer.asLongBuffer().put(bitset);
+      builder.setUtf8Bitset(ByteString.copyFrom(buffer));
+    } else {
+      for(int i=0; i < bitset.length; ++i) {
+        builder.addBitset(bitset[i]);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java b/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java
new file mode 100644
index 0000000..aad4fab
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * This class represents the fix from ORC-101 where we fixed the bloom filter
+ * from using the JVM's default character set to always using UTF-8.
+ */
+public class BloomFilterUtf8 extends BloomFilter {
+  public BloomFilterUtf8(long expectedEntries, double fpp) {
+    super(expectedEntries, fpp);
+  }
+
+  public BloomFilterUtf8(long[] bits, int numFuncs) {
+    super(bits, numFuncs);
+  }
+
+  /**
+   * Adds a string to the filter using its UTF-8 encoding.
+   * @param val the string to add; null is passed on as a null byte array
+   */
+  @Override
+  public void addString(String val) {
+    add(val == null ? null : val.getBytes(StandardCharsets.UTF_8));
+  }
+
+  /**
+   * Tests a string against the filter using its UTF-8 encoding.
+   * @param val the string to test; null is passed on as a null byte array
+   * @return the result of testing the encoded bytes
+   */
+  @Override
+  public boolean testString(String val) {
+    return test(val == null ? null : val.getBytes(StandardCharsets.UTF_8));
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index af20d1f..5ef0ced 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -1904,8 +1904,8 @@ public class TestVectorOrcFile {
             .withZeroCopy(false)
             .build());
     OrcIndex index =
-        meta.readRowIndex(reader.getStripes().get(0), null, null, null, null,
-            null);
+        meta.readRowIndex(reader.getStripes().get(0), null, null, false, null, null,
+            null, OrcFile.WriterVersion.ORC_101, null, null);
     // check the primitive columns to make sure they have the right number of
     // items in the first row group
     for(int c=1; c < 9; ++c) {