Posted to commits@hive.apache.org by om...@apache.org on 2016/05/20 21:22:39 UTC
[01/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Repository: hive
Updated Branches:
refs/heads/master fd06601eb -> ffb79509b
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java
index 932ae0b..6415bf8 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java
@@ -27,6 +27,7 @@ import java.sql.Timestamp;
import java.util.HashMap;
import java.util.Map;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.common.type.HiveChar;
@@ -1088,13 +1089,13 @@ public final class PrimitiveObjectInspectorUtils {
result = TimestampWritable.longToTimestamp(longValue, intToTimestampInSeconds);
break;
case FLOAT:
- result = TimestampWritable.doubleToTimestamp(((FloatObjectInspector) inputOI).get(o));
+ result = TimestampUtils.doubleToTimestamp(((FloatObjectInspector) inputOI).get(o));
break;
case DOUBLE:
- result = TimestampWritable.doubleToTimestamp(((DoubleObjectInspector) inputOI).get(o));
+ result = TimestampUtils.doubleToTimestamp(((DoubleObjectInspector) inputOI).get(o));
break;
case DECIMAL:
- result = TimestampWritable.decimalToTimestamp(((HiveDecimalObjectInspector) inputOI)
+ result = TimestampUtils.decimalToTimestamp(((HiveDecimalObjectInspector) inputOI)
.getPrimitiveJavaObject(o));
break;
case STRING:
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/serde/src/test/org/apache/hadoop/hive/serde2/io/TestTimestampWritable.java
----------------------------------------------------------------------
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/io/TestTimestampWritable.java b/serde/src/test/org/apache/hadoop/hive/serde2/io/TestTimestampWritable.java
index 6c763bc..7619efa 100644
--- a/serde/src/test/org/apache/hadoop/hive/serde2/io/TestTimestampWritable.java
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/io/TestTimestampWritable.java
@@ -35,6 +35,7 @@ import java.util.List;
import java.util.Random;
import java.util.TimeZone;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
import org.junit.*;
import static org.junit.Assert.*;
@@ -70,7 +71,7 @@ public class TestTimestampWritable {
long seconds = (ts.getTime() - ts.getNanos() / 1000000) / 1000;
// It should also be possible to calculate this based on ts.getTime() only.
- assertEquals(seconds, TimestampWritable.millisToSeconds(ts.getTime()));
+ assertEquals(seconds, TimestampUtils.millisToSeconds(ts.getTime()));
return seconds;
}
@@ -335,10 +336,10 @@ public class TestTimestampWritable {
Math.pow(10, 9 - nanosPrecision));
assertEquals(String.format("Invalid nanosecond part recovered from %f", asDouble),
nanos, recoveredNanos);
- assertEquals(ts, TimestampWritable.doubleToTimestamp(asDouble));
+ assertEquals(ts, TimestampUtils.doubleToTimestamp(asDouble));
// decimalToTimestamp should be consistent with doubleToTimestamp for this level of
// precision.
- assertEquals(ts, TimestampWritable.decimalToTimestamp(
+ assertEquals(ts, TimestampUtils.decimalToTimestamp(
HiveDecimal.create(BigDecimal.valueOf(asDouble))));
}
}
@@ -358,7 +359,7 @@ public class TestTimestampWritable {
Timestamp ts = new Timestamp(
randomMillis(MIN_FOUR_DIGIT_YEAR_MILLIS, MAX_FOUR_DIGIT_YEAR_MILLIS, rand));
ts.setNanos(randomNanos(rand, 9)); // full precision
- assertEquals(ts, TimestampWritable.decimalToTimestamp(timestampToDecimal(ts)));
+ assertEquals(ts, TimestampUtils.decimalToTimestamp(timestampToDecimal(ts)));
}
}
@@ -371,8 +372,8 @@ public class TestTimestampWritable {
for (int nanos : new int[] { 100000, 900000, 999100000, 999900000 }) {
ts.setNanos(nanos);
HiveDecimal d = timestampToDecimal(ts);
- assertEquals(ts, TimestampWritable.decimalToTimestamp(d));
- assertEquals(ts, TimestampWritable.doubleToTimestamp(d.bigDecimalValue().doubleValue()));
+ assertEquals(ts, TimestampUtils.decimalToTimestamp(d));
+ assertEquals(ts, TimestampUtils.doubleToTimestamp(d.bigDecimalValue().doubleValue()));
}
}
@@ -435,20 +436,20 @@ public class TestTimestampWritable {
@Concurrent(count=4)
@Repeating(repetition=100)
public void testMillisToSeconds() {
- assertEquals(0, TimestampWritable.millisToSeconds(0));
- assertEquals(-1, TimestampWritable.millisToSeconds(-1));
- assertEquals(-1, TimestampWritable.millisToSeconds(-999));
- assertEquals(-1, TimestampWritable.millisToSeconds(-1000));
- assertEquals(-2, TimestampWritable.millisToSeconds(-1001));
- assertEquals(-2, TimestampWritable.millisToSeconds(-1999));
- assertEquals(-2, TimestampWritable.millisToSeconds(-2000));
- assertEquals(-3, TimestampWritable.millisToSeconds(-2001));
- assertEquals(-99, TimestampWritable.millisToSeconds(-99000));
- assertEquals(-100, TimestampWritable.millisToSeconds(-99001));
- assertEquals(-100, TimestampWritable.millisToSeconds(-100000));
- assertEquals(1, TimestampWritable.millisToSeconds(1500));
- assertEquals(19, TimestampWritable.millisToSeconds(19999));
- assertEquals(20, TimestampWritable.millisToSeconds(20000));
+ assertEquals(0, TimestampUtils.millisToSeconds(0));
+ assertEquals(-1, TimestampUtils.millisToSeconds(-1));
+ assertEquals(-1, TimestampUtils.millisToSeconds(-999));
+ assertEquals(-1, TimestampUtils.millisToSeconds(-1000));
+ assertEquals(-2, TimestampUtils.millisToSeconds(-1001));
+ assertEquals(-2, TimestampUtils.millisToSeconds(-1999));
+ assertEquals(-2, TimestampUtils.millisToSeconds(-2000));
+ assertEquals(-3, TimestampUtils.millisToSeconds(-2001));
+ assertEquals(-99, TimestampUtils.millisToSeconds(-99000));
+ assertEquals(-100, TimestampUtils.millisToSeconds(-99001));
+ assertEquals(-100, TimestampUtils.millisToSeconds(-100000));
+ assertEquals(1, TimestampUtils.millisToSeconds(1500));
+ assertEquals(19, TimestampUtils.millisToSeconds(19999));
+ assertEquals(20, TimestampUtils.millisToSeconds(20000));
}
private static int compareEqualLengthByteArrays(byte[] a, byte[] b) {
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
----------------------------------------------------------------------
diff --git a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
index 2b7c747..ef2b7f7 100644
--- a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
+++ b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/Hadoop23Shims.java
@@ -102,42 +102,18 @@ import org.apache.tez.test.MiniTezCluster;
public class Hadoop23Shims extends HadoopShimsSecure {
HadoopShims.MiniDFSShim cluster = null;
- final boolean zeroCopy;
final boolean storagePolicy;
- final boolean fastread;
public Hadoop23Shims() {
- boolean zcr = false;
+ // in-memory HDFS
boolean storage = false;
- boolean fastread = false;
try {
- Class.forName("org.apache.hadoop.fs.CacheFlag", false,
- ShimLoader.class.getClassLoader());
- zcr = true;
- } catch (ClassNotFoundException ce) {
- }
-
- if (zcr) {
- // in-memory HDFS is only available after zcr
- try {
- Class.forName("org.apache.hadoop.hdfs.protocol.BlockStoragePolicy",
+ Class.forName("org.apache.hadoop.hdfs.protocol.BlockStoragePolicy",
false, ShimLoader.class.getClassLoader());
- storage = true;
- } catch (ClassNotFoundException ce) {
- }
- }
-
- if (storage) {
- for (Method m : Text.class.getMethods()) {
- if ("readWithKnownLength".equals(m.getName())) {
- fastread = true;
- }
- }
+ storage = true;
+ } catch (ClassNotFoundException ce) {
}
-
this.storagePolicy = storage;
- this.zeroCopy = zcr;
- this.fastread = fastread;
}
@Override
@@ -854,15 +830,6 @@ public class Hadoop23Shims extends HadoopShimsSecure {
}
@Override
- public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, ByteBufferPoolShim pool) throws IOException {
- if(zeroCopy) {
- return ZeroCopyShims.getZeroCopyReader(in, pool);
- }
- /* not supported */
- return null;
- }
-
- @Override
public Configuration getConfiguration(org.apache.hadoop.mapreduce.JobContext context) {
return context.getConfiguration();
}
@@ -1302,26 +1269,4 @@ public class Hadoop23Shims extends HadoopShimsSecure {
public long getFileId(FileSystem fs, String path) throws IOException {
return ensureDfs(fs).getClient().getFileInfo(path).getFileId();
}
-
- private final class FastTextReaderShim implements TextReaderShim {
- private final DataInputStream din;
-
- public FastTextReaderShim(InputStream in) {
- this.din = new DataInputStream(in);
- }
-
- @Override
- public void read(Text txt, int len) throws IOException {
- txt.readWithKnownLength(din, len);
- }
- }
-
- @Override
- public TextReaderShim getTextReaderShim(InputStream in) throws IOException {
- if (!fastread) {
- return super.getTextReaderShim(in);
- }
- return new FastTextReaderShim(in);
- }
-
}
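[Editor's note: the simplified constructor above keeps only a single Class.forName capability probe for BlockStoragePolicy, dropping the chained zero-copy/fastread checks. A minimal sketch of that probe pattern follows; the class and field names are illustrative, not from the patch.]

    // Sketch of the Class.forName capability-probe pattern used by the
    // simplified Hadoop23Shims constructor: try to load an optional class
    // without initializing it, and record whether the feature exists.
    public class CapabilityProbe {
      static final boolean STORAGE_POLICY_AVAILABLE =
          probeClass("org.apache.hadoop.hdfs.protocol.BlockStoragePolicy");

      static boolean probeClass(String className) {
        try {
          // Load without initializing, mirroring Class.forName(name, false, loader).
          Class.forName(className, false, CapabilityProbe.class.getClassLoader());
          return true;
        } catch (ClassNotFoundException e) {
          return false; // optional dependency missing: feature disabled
        }
      }

      public static void main(String[] args) {
        System.out.println("storage policy support: " + STORAGE_POLICY_AVAILABLE);
      }
    }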
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/ZeroCopyShims.java
----------------------------------------------------------------------
diff --git a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/ZeroCopyShims.java b/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/ZeroCopyShims.java
deleted file mode 100644
index 6ef0467..0000000
--- a/shims/0.23/src/main/java/org/apache/hadoop/hive/shims/ZeroCopyShims.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.shims;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.EnumSet;
-
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.ReadOption;
-import org.apache.hadoop.io.ByteBufferPool;
-
-import org.apache.hadoop.hive.shims.HadoopShims.ByteBufferPoolShim;
-import org.apache.hadoop.hive.shims.HadoopShims.ZeroCopyReaderShim;
-
-class ZeroCopyShims {
- private static final class ByteBufferPoolAdapter implements ByteBufferPool {
- private ByteBufferPoolShim pool;
-
- public ByteBufferPoolAdapter(ByteBufferPoolShim pool) {
- this.pool = pool;
- }
-
- @Override
- public final ByteBuffer getBuffer(boolean direct, int length) {
- return this.pool.getBuffer(direct, length);
- }
-
- @Override
- public final void putBuffer(ByteBuffer buffer) {
- this.pool.putBuffer(buffer);
- }
- }
-
- private static final class ZeroCopyAdapter implements ZeroCopyReaderShim {
- private final FSDataInputStream in;
- private final ByteBufferPoolAdapter pool;
- private final static EnumSet<ReadOption> CHECK_SUM = EnumSet
- .noneOf(ReadOption.class);
- private final static EnumSet<ReadOption> NO_CHECK_SUM = EnumSet
- .of(ReadOption.SKIP_CHECKSUMS);
-
- public ZeroCopyAdapter(FSDataInputStream in, ByteBufferPoolShim poolshim) {
- this.in = in;
- if (poolshim != null) {
- pool = new ByteBufferPoolAdapter(poolshim);
- } else {
- pool = null;
- }
- }
-
- public final ByteBuffer readBuffer(int maxLength, boolean verifyChecksums)
- throws IOException {
- EnumSet<ReadOption> options = NO_CHECK_SUM;
- if (verifyChecksums) {
- options = CHECK_SUM;
- }
- return this.in.read(this.pool, maxLength, options);
- }
-
- public final void releaseBuffer(ByteBuffer buffer) {
- this.in.releaseBuffer(buffer);
- }
- }
-
- public static ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
- ByteBufferPoolShim pool) throws IOException {
- return new ZeroCopyAdapter(in, pool);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java
----------------------------------------------------------------------
diff --git a/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java b/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java
index 37eb8f6..4a96355 100644
--- a/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java
+++ b/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShims.java
@@ -403,57 +403,6 @@ public interface HadoopShims {
public StoragePolicyShim getStoragePolicyShim(FileSystem fs);
/**
- * a hadoop.io ByteBufferPool shim.
- */
- public interface ByteBufferPoolShim {
- /**
- * Get a new ByteBuffer from the pool. The pool can provide this from
- * removing a buffer from its internal cache, or by allocating a
- * new buffer.
- *
- * @param direct Whether the buffer should be direct.
- * @param length The minimum length the buffer will have.
- * @return A new ByteBuffer. Its capacity can be less
- * than what was requested, but must be at
- * least 1 byte.
- */
- ByteBuffer getBuffer(boolean direct, int length);
-
- /**
- * Release a buffer back to the pool.
- * The pool may choose to put this buffer into its cache/free it.
- *
- * @param buffer a direct bytebuffer
- */
- void putBuffer(ByteBuffer buffer);
- }
-
- /**
- * Provides an HDFS ZeroCopyReader shim.
- * @param in FSDataInputStream to read from (where the cached/mmap buffers are tied to)
- * @param pool ByteBufferPoolShim to allocate fallback buffers with
- *
- * @return returns null if not supported
- */
- public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, ByteBufferPoolShim pool) throws IOException;
-
- public interface ZeroCopyReaderShim {
- /**
- * Get a ByteBuffer from the FSDataInputStream - this can be either a HeapByteBuffer or a MappedByteBuffer.
- * Also advances the input stream by the number of bytes read. The data read can be smaller than maxLength.
- *
- * @return ByteBuffer read from the stream
- */
- public ByteBuffer readBuffer(int maxLength, boolean verifyChecksums) throws IOException;
- /**
- * Release a ByteBuffer previously obtained from a readBuffer() call on this stream.
- *
- */
- public void releaseBuffer(ByteBuffer buffer);
- }
-
- /**
* Get configuration from JobContext
*/
public Configuration getConfiguration(JobContext context);
@@ -692,23 +641,4 @@ public interface HadoopShims {
*/
long getFileId(FileSystem fs, String path) throws IOException;
- /**
- * Read data into a Text object in the fastest way possible
- */
- public interface TextReaderShim {
- /**
- * @param txt the Text object to fill
- * @param size the exact number of bytes to read
- * @throws IOException if the stream ends before size bytes are read
- */
- void read(Text txt, int size) throws IOException;
- }
-
- /**
- * Wrap a TextReaderShim around an input stream. The reader shim will not
- * buffer any reads from the underlying stream and will only consume bytes
- * which are required for TextReaderShim.read() input.
- */
- public TextReaderShim getTextReaderShim(InputStream input) throws IOException;
}
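[Editor's note: the deleted ByteBufferPoolShim javadoc above specifies a simple get/put contract - getBuffer() may serve a cached buffer or allocate a fresh one of at least the requested length, and putBuffer() hands it back for reuse. A minimal sketch satisfying that contract, using only java.nio; this is not part of the patch.]

    import java.nio.ByteBuffer;
    import java.util.ArrayDeque;
    import java.util.Deque;

    // Minimal sketch of the ByteBufferPoolShim contract: getBuffer() may
    // reuse a cached buffer or allocate, putBuffer() returns one to the cache.
    public class SimpleBufferPool {
      private final Deque<ByteBuffer> cache = new ArrayDeque<>();

      public synchronized ByteBuffer getBuffer(boolean direct, int length) {
        ByteBuffer cached = cache.peekFirst();
        if (cached != null && cached.isDirect() == direct
            && cached.capacity() >= length) {
          cache.pollFirst();
          cached.clear();
          return cached;
        }
        return direct ? ByteBuffer.allocateDirect(length)
                      : ByteBuffer.allocate(length);
      }

      public synchronized void putBuffer(ByteBuffer buffer) {
        cache.addFirst(buffer); // keep for reuse
      }
    }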
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java
----------------------------------------------------------------------
diff --git a/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java b/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java
index 87682e6..224ce3b 100644
--- a/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java
+++ b/shims/common/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java
@@ -392,33 +392,4 @@ public abstract class HadoopShimsSecure implements HadoopShims {
@Override
abstract public void addDelegationTokens(FileSystem fs, Credentials cred, String uname) throws IOException;
-
- private final class BasicTextReaderShim implements TextReaderShim {
- private final InputStream in;
-
- public BasicTextReaderShim(InputStream in) {
- this.in = in;
- }
-
- @Override
- public void read(Text txt, int len) throws IOException {
- int offset = 0;
- byte[] bytes = new byte[len];
- while (len > 0) {
- int written = in.read(bytes, offset, len);
- if (written < 0) {
- throw new EOFException("Can't finish read from " + in + " read "
- + (offset) + " bytes out of " + bytes.length);
- }
- len -= written;
- offset += written;
- }
- txt.set(bytes);
- }
- }
-
- @Override
- public TextReaderShim getTextReaderShim(InputStream in) throws IOException {
- return new BasicTextReaderShim(in);
- }
}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java
----------------------------------------------------------------------
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java
index d971339..228461a 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java
@@ -281,8 +281,13 @@ public class TimestampColumnVector extends ColumnVector {
* @param timestamp
*/
public void set(int elementNum, Timestamp timestamp) {
- this.time[elementNum] = timestamp.getTime();
- this.nanos[elementNum] = timestamp.getNanos();
+ if (timestamp == null) {
+ this.noNulls = false;
+ this.isNull[elementNum] = true;
+ } else {
+ this.time[elementNum] = timestamp.getTime();
+ this.nanos[elementNum] = timestamp.getNanos();
+ }
}
/**
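[Editor's note: with the null check added above, callers can pass a null Timestamp and have the vector's null bookkeeping updated instead of hitting a NullPointerException. A short usage sketch, assuming the default-size TimestampColumnVector constructor; the class name below is illustrative.]

    import java.sql.Timestamp;
    import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;

    // Sketch of the new null-handling behavior of set(): a null Timestamp
    // now marks the slot null rather than dereferencing the argument.
    public class TimestampVectorNullDemo {
      public static void main(String[] args) {
        TimestampColumnVector col = new TimestampColumnVector();
        col.set(0, new Timestamp(1416967764000L)); // normal value
        col.set(1, null);                          // sets isNull[1], noNulls = false
        System.out.println("noNulls=" + col.noNulls + " isNull[1]=" + col.isNull[1]);
      }
    }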
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
----------------------------------------------------------------------
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
new file mode 100644
index 0000000..90817a5
--- /dev/null
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
@@ -0,0 +1,354 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.util.Arrays;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+
+/**
+ * String expression evaluation helper functions.
+ */
+public class StringExpr {
+
+ /* Compare two strings from two byte arrays each
+ * with their own start position and length.
+ * Use lexicographic unsigned byte value order.
+ * This is what's used for UTF-8 sort order.
+ * Return negative value if arg1 < arg2, 0 if arg1 = arg2,
+ * positive if arg1 > arg2.
+ */
+ public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) {
+ for (int i = 0; i < len1 && i < len2; i++) {
+ // Note the "& 0xff" converts each signed byte to its unsigned integer value.
+ int b1 = arg1[i + start1] & 0xff;
+ int b2 = arg2[i + start2] & 0xff;
+ if (b1 != b2) {
+ return b1 - b2;
+ }
+ }
+ return len1 - len2;
+ }
+
+ /* Determine if two strings are equal from two byte arrays each
+ * with their own start position and length.
+ * Use lexicographic unsigned byte value order.
+ * This is what's used for UTF-8 sort order.
+ */
+ public static boolean equal(byte[] arg1, final int start1, final int len1,
+ byte[] arg2, final int start2, final int len2) {
+ if (len1 != len2) {
+ return false;
+ }
+ if (len1 == 0) {
+ return true;
+ }
+
+ // do bounds check for OOB exception
+ if (arg1[start1] != arg2[start2]
+ || arg1[start1 + len1 - 1] != arg2[start2 + len2 - 1]) {
+ return false;
+ }
+
+ if (len1 == len2) {
+ // prove invariant to the compiler: len1 = len2
+ // all array access between (start1, start1+len1)
+ // and (start2, start2+len2) are valid
+ // no more OOB exceptions are possible
+ final int step = 8;
+ final int remainder = len1 % step;
+ final int wlen = len1 - remainder;
+ // suffix first
+ for (int i = wlen; i < len1; i++) {
+ if (arg1[start1 + i] != arg2[start2 + i]) {
+ return false;
+ }
+ }
+ // SIMD loop
+ for (int i = 0; i < wlen; i += step) {
+ final int s1 = start1 + i;
+ final int s2 = start2 + i;
+ boolean neq = false;
+ for (int j = 0; j < step; j++) {
+ neq = (arg1[s1 + j] != arg2[s2 + j]) || neq;
+ }
+ if (neq) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ public static int characterCount(byte[] bytes) {
+ int end = bytes.length;
+
+ // count characters
+ int j = 0;
+ int charCount = 0;
+ while(j < end) {
+ // UTF-8 continuation bytes have 2 high bits equal to 0x80.
+ if ((bytes[j] & 0xc0) != 0x80) {
+ ++charCount;
+ }
+ j++;
+ }
+ return charCount;
+ }
+
+ public static int characterCount(byte[] bytes, int start, int length) {
+ int end = start + length;
+
+ // count characters
+ int j = start;
+ int charCount = 0;
+ while(j < end) {
+ // UTF-8 continuation bytes have 2 high bits equal to 0x80.
+ if ((bytes[j] & 0xc0) != 0x80) {
+ ++charCount;
+ }
+ j++;
+ }
+ return charCount;
+ }
+
+ // A setVal with the same function signature as rightTrim, leftTrim, truncate, etc, below.
+ // Useful for class generation via templates.
+ public static void assign(BytesColumnVector outV, int i, byte[] bytes, int start, int length) {
+ // set output vector
+ outV.setVal(i, bytes, start, length);
+ }
+
+ /*
+ * Right trim a slice of a byte array and return the new byte length.
+ */
+ public static int rightTrim(byte[] bytes, int start, int length) {
+ // skip trailing blank characters
+ int j = start + length - 1;
+ while(j >= start && bytes[j] == 0x20) {
+ j--;
+ }
+
+ return (j - start) + 1;
+ }
+
+ /*
+ * Right trim a slice of a byte array and place the result into element i of a vector.
+ */
+ public static void rightTrim(BytesColumnVector outV, int i, byte[] bytes, int start, int length) {
+ // skip trailing blank characters
+ int j = start + length - 1;
+ while(j >= start && bytes[j] == 0x20) {
+ j--;
+ }
+
+ // set output vector
+ outV.setVal(i, bytes, start, (j - start) + 1);
+ }
+
+ /*
+ * Truncate a slice of a byte array to a maximum number of characters and
+ * return the new byte length.
+ */
+ public static int truncate(byte[] bytes, int start, int length, int maxLength) {
+ int end = start + length;
+
+ // count characters forward
+ int j = start;
+ int charCount = 0;
+ while(j < end) {
+ // UTF-8 continuation bytes have 2 high bits equal to 0x80.
+ if ((bytes[j] & 0xc0) != 0x80) {
+ if (charCount == maxLength) {
+ break;
+ }
+ ++charCount;
+ }
+ j++;
+ }
+ return (j - start);
+ }
+
+ /*
+ * Truncate a slice of a byte array to a maximum number of characters and
+ * place the result into element i of a vector.
+ */
+ public static void truncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) {
+ int end = start + length;
+
+ // count characters forward
+ int j = start;
+ int charCount = 0;
+ while(j < end) {
+ // UTF-8 continuation bytes have 2 high bits equal to 0x80.
+ if ((bytes[j] & 0xc0) != 0x80) {
+ if (charCount == maxLength) {
+ break;
+ }
+ ++charCount;
+ }
+ j++;
+ }
+
+ // set output vector
+ outV.setVal(i, bytes, start, (j - start));
+ }
+
+ /*
+ * Truncate a byte array to a maximum number of characters and
+ * return a byte array with only truncated bytes.
+ */
+ public static byte[] truncateScalar(byte[] bytes, int maxLength) {
+ int end = bytes.length;
+
+ // count characters forward
+ int j = 0;
+ int charCount = 0;
+ while(j < end) {
+ // UTF-8 continuation bytes have 2 high bits equal to 0x80.
+ if ((bytes[j] & 0xc0) != 0x80) {
+ if (charCount == maxLength) {
+ break;
+ }
+ ++charCount;
+ }
+ j++;
+ }
+ if (j == end) {
+ return bytes;
+ } else {
+ return Arrays.copyOf(bytes, j);
+ }
+ }
+
+ /*
+ * Right trim and truncate a slice of a byte array to a maximum number of characters and
+ * return the new byte length.
+ */
+ public static int rightTrimAndTruncate(byte[] bytes, int start, int length, int maxLength) {
+ int end = start + length;
+
+ // count characters forward and watch for final run of pads
+ int j = start;
+ int charCount = 0;
+ int padRunStart = -1;
+ while(j < end) {
+ // UTF-8 continuation bytes have 2 high bits equal to 0x80.
+ if ((bytes[j] & 0xc0) != 0x80) {
+ if (charCount == maxLength) {
+ break;
+ }
+ if (bytes[j] == 0x20) {
+ if (padRunStart == -1) {
+ padRunStart = j;
+ }
+ } else {
+ padRunStart = -1;
+ }
+ ++charCount;
+ } else {
+ padRunStart = -1;
+ }
+ j++;
+ }
+ if (padRunStart != -1) {
+ return (padRunStart - start);
+ } else {
+ return (j - start);
+ }
+ }
+
+ /*
+ * Right trim and truncate a slice of a byte array to a maximum number of characters and
+ * place the result into element i of a vector.
+ */
+ public static void rightTrimAndTruncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) {
+ int end = start + length;
+
+ // count characters forward and watch for final run of pads
+ int j = start;
+ int charCount = 0;
+ int padRunStart = -1;
+ while(j < end) {
+ // UTF-8 continuation bytes have 2 high bits equal to 0x80.
+ if ((bytes[j] & 0xc0) != 0x80) {
+ if (charCount == maxLength) {
+ break;
+ }
+ if (bytes[j] == 0x20) {
+ if (padRunStart == -1) {
+ padRunStart = j;
+ }
+ } else {
+ padRunStart = -1;
+ }
+ ++charCount;
+ } else {
+ padRunStart = -1;
+ }
+ j++;
+ }
+ // set output vector
+ if (padRunStart != -1) {
+ outV.setVal(i, bytes, start, (padRunStart - start));
+ } else {
+ outV.setVal(i, bytes, start, (j - start));
+ }
+ }
+
+ /*
+ * Right trim and truncate a byte array to a maximum number of characters and
+ * return a byte array with only the trimmed and truncated bytes.
+ */
+ public static byte[] rightTrimAndTruncateScalar(byte[] bytes, int maxLength) {
+ int end = bytes.length;
+
+ // count characters forward and watch for final run of pads
+ int j = 0;
+ int charCount = 0;
+ int padRunStart = -1;
+ while(j < end) {
+ // UTF-8 continuation bytes have 2 high bits equal to 0x80.
+ if ((bytes[j] & 0xc0) != 0x80) {
+ if (charCount == maxLength) {
+ break;
+ }
+ if (bytes[j] == 0x20) {
+ if (padRunStart == -1) {
+ padRunStart = j;
+ }
+ } else {
+ padRunStart = -1;
+ }
+ ++charCount;
+ } else {
+ padRunStart = -1;
+ }
+ j++;
+ }
+ if (padRunStart != -1) {
+ return Arrays.copyOf(bytes, padRunStart);
+ } else if (j == end) {
+ return bytes;
+ } else {
+ return Arrays.copyOf(bytes, j);
+ }
+ }
+}
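[Editor's note: all of the new StringExpr helpers operate on UTF-8 byte slices, counting characters by skipping continuation bytes. A short usage sketch exercising compare, characterCount, rightTrim and truncate; the demo class name and values are illustrative.]

    import java.nio.charset.StandardCharsets;
    import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;

    // Illustrative use of the StringExpr helpers on UTF-8 byte slices.
    public class StringExprDemo {
      public static void main(String[] args) {
        byte[] a = "apple".getBytes(StandardCharsets.UTF_8);
        byte[] b = "apricot".getBytes(StandardCharsets.UTF_8);
        // negative: "apple" < "apricot" in unsigned byte order
        System.out.println(StringExpr.compare(a, 0, a.length, b, 0, b.length));

        byte[] multi = "héllo".getBytes(StandardCharsets.UTF_8); // 6 bytes, 5 chars
        System.out.println(StringExpr.characterCount(multi));    // 5

        byte[] padded = "abc   ".getBytes(StandardCharsets.UTF_8);
        // rightTrim returns the new byte length with trailing spaces dropped: 3
        System.out.println(StringExpr.rightTrim(padded, 0, padded.length));

        // truncating "héllo" to 2 characters keeps 3 bytes ("h" + 2-byte "é")
        System.out.println(StringExpr.truncate(multi, 0, multi.length, 2));
      }
    }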
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java
----------------------------------------------------------------------
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java
index 8c5bab2..10d8c51 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java
@@ -32,11 +32,11 @@ import java.util.Queue;
import java.util.Set;
/**
- * The implementation of SearchArguments.
+ * The implementation of SearchArguments. Visible for testing only.
*/
-final class SearchArgumentImpl implements SearchArgument {
+public final class SearchArgumentImpl implements SearchArgument {
- static final class PredicateLeafImpl implements PredicateLeaf {
+ public static final class PredicateLeafImpl implements PredicateLeaf {
private final Operator operator;
private final Type type;
private String columnName;
@@ -53,11 +53,11 @@ final class SearchArgumentImpl implements SearchArgument {
literalList = null;
}
- PredicateLeafImpl(Operator operator,
- Type type,
- String columnName,
- Object literal,
- List<Object> literalList) {
+ public PredicateLeafImpl(Operator operator,
+ Type type,
+ String columnName,
+ Object literal,
+ List<Object> literalList) {
this.operator = operator;
this.type = type;
this.columnName = columnName;
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java
----------------------------------------------------------------------
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java b/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java
new file mode 100644
index 0000000..189ead5
--- /dev/null
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.util;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+
+import java.math.BigDecimal;
+import java.sql.Timestamp;
+
+/**
+ * Utilities for Timestamps and the relevant conversions.
+ */
+public class TimestampUtils {
+ public static final BigDecimal BILLION_BIG_DECIMAL = BigDecimal.valueOf(1000000000);
+
+ /**
+ * Convert the timestamp to a double measured in seconds.
+ * @return double representation of the timestamp, accurate to nanoseconds
+ */
+ public static double getDouble(Timestamp ts) {
+ long seconds = millisToSeconds(ts.getTime());
+ return seconds + ((double) ts.getNanos()) / 1000000000;
+ }
+
+ public static Timestamp doubleToTimestamp(double f) {
+ long seconds = (long) f;
+
+ // We must ensure the exactness of the double's fractional portion.
+ // 0.6 as the fraction part will be converted to 0.59999... and
+ // significantly reduce the savings from binary serialization
+ BigDecimal bd;
+ try {
+ bd = new BigDecimal(String.valueOf(f));
+ } catch (NumberFormatException nfe) {
+ return null;
+ }
+ bd = bd.subtract(new BigDecimal(seconds)).multiply(new BigDecimal(1000000000));
+ int nanos = bd.intValue();
+
+ // Convert to millis
+ long millis = seconds * 1000;
+ if (nanos < 0) {
+ millis -= 1000;
+ nanos += 1000000000;
+ }
+ Timestamp t = new Timestamp(millis);
+
+ // Set remaining fractional portion to nanos
+ t.setNanos(nanos);
+ return t;
+ }
+
+ public static Timestamp decimalToTimestamp(HiveDecimal d) {
+ BigDecimal nanoInstant = d.bigDecimalValue().multiply(BILLION_BIG_DECIMAL);
+ int nanos = nanoInstant.remainder(BILLION_BIG_DECIMAL).intValue();
+ if (nanos < 0) {
+ nanos += 1000000000;
+ }
+ long seconds =
+ nanoInstant.subtract(new BigDecimal(nanos)).divide(BILLION_BIG_DECIMAL).longValue();
+ Timestamp t = new Timestamp(seconds * 1000);
+ t.setNanos(nanos);
+
+ return t;
+ }
+
+ /**
+ * Rounds the number of milliseconds relative to the epoch down to the nearest whole number of
+ * seconds. 500 would round to 0, -500 would round to -1.
+ */
+ public static long millisToSeconds(long millis) {
+ if (millis >= 0) {
+ return millis / 1000;
+ } else {
+ return (millis - 999) / 1000;
+ }
+ }
+}
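[Editor's note: the doubleToTimestamp/millisToSeconds pair above is the delicate part of this move. A short sketch demonstrating the floor-style rounding documented on millisToSeconds (500 ms rounds to 0 seconds, -500 ms to -1) and a fractional-second round trip; the demo class name is illustrative.]

    import java.sql.Timestamp;
    import org.apache.hadoop.hive.ql.util.TimestampUtils;

    // Demonstrates millisToSeconds' rounding toward negative infinity and
    // a fractional-seconds round trip through doubleToTimestamp/getDouble.
    public class TimestampUtilsDemo {
      public static void main(String[] args) {
        System.out.println(TimestampUtils.millisToSeconds(500));   // 0
        System.out.println(TimestampUtils.millisToSeconds(-500));  // -1
        System.out.println(TimestampUtils.millisToSeconds(1500));  // 1

        // 1.5 seconds past the epoch: nanos land exactly on 500000000
        Timestamp t = TimestampUtils.doubleToTimestamp(1.5);
        System.out.println(t.getTime() + " ms, " + t.getNanos() + " ns");

        // and back to a double measured in seconds
        System.out.println(TimestampUtils.getDouble(t));           // 1.5
      }
    }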
[16/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/tools/TestFileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/tools/TestFileDump.java b/orc/src/test/org/apache/orc/tools/TestFileDump.java
new file mode 100644
index 0000000..ce3381e
--- /dev/null
+++ b/orc/src/test/org/apache/orc/tools/TestFileDump.java
@@ -0,0 +1,486 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcConf;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestFileDump {
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Before
+ public void openFileSystem () throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ fs.setWorkingDirectory(workDir);
+ testFilePath = new Path("TestFileDump.testDump.orc");
+ fs.delete(testFilePath, false);
+ }
+
+ static TypeDescription getMyRecordType() {
+ return TypeDescription.createStruct()
+ .addField("i", TypeDescription.createInt())
+ .addField("l", TypeDescription.createLong())
+ .addField("s", TypeDescription.createString());
+ }
+
+ static void appendMyRecord(VectorizedRowBatch batch,
+ int i,
+ long l,
+ String str) {
+ ((LongColumnVector) batch.cols[0]).vector[batch.size] = i;
+ ((LongColumnVector) batch.cols[1]).vector[batch.size] = l;
+ if (str == null) {
+ batch.cols[2].noNulls = false;
+ batch.cols[2].isNull[batch.size] = true;
+ } else {
+ ((BytesColumnVector) batch.cols[2]).setVal(batch.size,
+ str.getBytes());
+ }
+ batch.size += 1;
+ }
+
+ static TypeDescription getAllTypesType() {
+ return TypeDescription.createStruct()
+ .addField("b", TypeDescription.createBoolean())
+ .addField("bt", TypeDescription.createByte())
+ .addField("s", TypeDescription.createShort())
+ .addField("i", TypeDescription.createInt())
+ .addField("l", TypeDescription.createLong())
+ .addField("f", TypeDescription.createFloat())
+ .addField("d", TypeDescription.createDouble())
+ .addField("de", TypeDescription.createDecimal())
+ .addField("t", TypeDescription.createTimestamp())
+ .addField("dt", TypeDescription.createDate())
+ .addField("str", TypeDescription.createString())
+ .addField("c", TypeDescription.createChar().withMaxLength(5))
+ .addField("vc", TypeDescription.createVarchar().withMaxLength(10))
+ .addField("m", TypeDescription.createMap(
+ TypeDescription.createString(),
+ TypeDescription.createString()))
+ .addField("a", TypeDescription.createList(TypeDescription.createInt()))
+ .addField("st", TypeDescription.createStruct()
+ .addField("i", TypeDescription.createInt())
+ .addField("s", TypeDescription.createString()));
+ }
+
+ static void appendAllTypes(VectorizedRowBatch batch,
+ boolean b,
+ byte bt,
+ short s,
+ int i,
+ long l,
+ float f,
+ double d,
+ HiveDecimalWritable de,
+ Timestamp t,
+ DateWritable dt,
+ String str,
+ String c,
+ String vc,
+ Map<String, String> m,
+ List<Integer> a,
+ int sti,
+ String sts) {
+ int row = batch.size++;
+ ((LongColumnVector) batch.cols[0]).vector[row] = b ? 1 : 0;
+ ((LongColumnVector) batch.cols[1]).vector[row] = bt;
+ ((LongColumnVector) batch.cols[2]).vector[row] = s;
+ ((LongColumnVector) batch.cols[3]).vector[row] = i;
+ ((LongColumnVector) batch.cols[4]).vector[row] = l;
+ ((DoubleColumnVector) batch.cols[5]).vector[row] = f;
+ ((DoubleColumnVector) batch.cols[6]).vector[row] = d;
+ ((DecimalColumnVector) batch.cols[7]).vector[row].set(de);
+ ((TimestampColumnVector) batch.cols[8]).set(row, t);
+ ((LongColumnVector) batch.cols[9]).vector[row] = dt.getDays();
+ ((BytesColumnVector) batch.cols[10]).setVal(row, str.getBytes());
+ ((BytesColumnVector) batch.cols[11]).setVal(row, c.getBytes());
+ ((BytesColumnVector) batch.cols[12]).setVal(row, vc.getBytes());
+ MapColumnVector map = (MapColumnVector) batch.cols[13];
+ int offset = map.childCount;
+ map.offsets[row] = offset;
+ map.lengths[row] = m.size();
+ map.childCount += map.lengths[row];
+ for(Map.Entry<String, String> entry: m.entrySet()) {
+ ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes());
+ ((BytesColumnVector) map.values).setVal(offset++,
+ entry.getValue().getBytes());
+ }
+ ListColumnVector list = (ListColumnVector) batch.cols[14];
+ offset = list.childCount;
+ list.offsets[row] = offset;
+ list.lengths[row] = a.size();
+ list.childCount += list.lengths[row];
+ for(int e=0; e < a.size(); ++e) {
+ ((LongColumnVector) list.child).vector[offset + e] = a.get(e);
+ }
+ StructColumnVector struct = (StructColumnVector) batch.cols[15];
+ ((LongColumnVector) struct.fields[0]).vector[row] = sti;
+ ((BytesColumnVector) struct.fields[1]).setVal(row, sts.getBytes());
+ }
+
+ public static void checkOutput(String expected,
+ String actual) throws Exception {
+ BufferedReader eStream =
+ new BufferedReader(new FileReader
+ (TestJsonFileDump.getFileFromClasspath(expected)));
+ BufferedReader aStream =
+ new BufferedReader(new FileReader(actual));
+ String expectedLine = eStream.readLine().trim();
+ while (expectedLine != null) {
+ String actualLine = aStream.readLine().trim();
+ System.out.println("actual: " + actualLine);
+ System.out.println("expected: " + expectedLine);
+ Assert.assertEquals(expectedLine, actualLine);
+ expectedLine = eStream.readLine();
+ expectedLine = expectedLine == null ? null : expectedLine.trim();
+ }
+ Assert.assertNull(eStream.readLine());
+ Assert.assertNull(aStream.readLine());
+ eStream.close();
+ aStream.close();
+ }
+
+ @Test
+ public void testDump() throws Exception {
+ TypeDescription schema = getMyRecordType();
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .compress(CompressionKind.ZLIB)
+ .stripeSize(100000)
+ .rowIndexStride(1000));
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ for(int i=0; i < 21000; ++i) {
+ appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
+ words[r1.nextInt(words.length)]);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size > 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=1,2,3"});
+ System.out.flush();
+ System.setOut(origOut);
+
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+
+ @Test
+ public void testDataDump() throws Exception {
+ TypeDescription schema = getAllTypesType();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .rowIndexStride(1000));
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ Map<String, String> m = new HashMap<String, String>(2);
+ m.put("k1", "v1");
+ appendAllTypes(batch,
+ true,
+ (byte) 10,
+ (short) 100,
+ 1000,
+ 10000L,
+ 4.0f,
+ 20.0,
+ new HiveDecimalWritable("4.2222"),
+ new Timestamp(1416967764000L),
+ new DateWritable(new Date(1416967764000L)),
+ "string",
+ "hello",
+ "hello",
+ m,
+ Arrays.asList(100, 200),
+ 10, "foo");
+ m.clear();
+ m.put("k3", "v3");
+ appendAllTypes(
+ batch,
+ false,
+ (byte)20,
+ (short)200,
+ 2000,
+ 20000L,
+ 8.0f,
+ 40.0,
+ new HiveDecimalWritable("2.2222"),
+ new Timestamp(1416967364000L),
+ new DateWritable(new Date(1411967764000L)),
+ "abcd",
+ "world",
+ "world",
+ m,
+ Arrays.asList(200, 300),
+ 20, "bar");
+ writer.addRowBatch(batch);
+
+ writer.close();
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "-d"});
+ System.out.flush();
+ System.setOut(origOut);
+ String[] lines = myOut.toString().split("\n");
+ Assert.assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24.0\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello\",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
+ Assert.assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44.0\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world\",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
+ }
+
+ // Test that if the fraction of rows that have distinct strings is greater than the configured
+ // threshold dictionary encoding is turned off. If dictionary encoding is turned off the length
+ // of the dictionary stream for the column will be 0 in the ORC file dump.
+ @Test
+ public void testDictionaryThreshold() throws Exception {
+ TypeDescription schema = getMyRecordType();
+ Configuration conf = new Configuration();
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ conf.setFloat(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.49f);
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.ZLIB)
+ .rowIndexStride(1000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ int nextInt = 0;
+ for(int i=0; i < 21000; ++i) {
+ // Write out the same string twice, this guarantees the fraction of rows with
+ // distinct strings is 0.5
+ if (i % 2 == 0) {
+ nextInt = r1.nextInt(words.length);
+ // Append the value of i to the word, this guarantees when an index or word is repeated
+ // the actual string is unique.
+ words[nextInt] += "-" + i;
+ }
+ appendMyRecord(batch, r1.nextInt(), r1.nextLong(), words[nextInt]);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size != 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump-dictionary-threshold.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=1,2,3"});
+ System.out.flush();
+ System.setOut(origOut);
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+
+ @Test
+ public void testBloomFilter() throws Exception {
+ TypeDescription schema = getMyRecordType();
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.ZLIB)
+ .bufferSize(10000)
+ .rowIndexStride(1000)
+ .bloomFilterColumns("S");
+ Writer writer = OrcFile.createWriter(testFilePath, options);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ for(int i=0; i < 21000; ++i) {
+ appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
+ words[r1.nextInt(words.length)]);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size > 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump-bloomfilter.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=3"});
+ System.out.flush();
+ System.setOut(origOut);
+
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+
+ @Test
+ public void testBloomFilter2() throws Exception {
+ TypeDescription schema = getMyRecordType();
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.ZLIB)
+ .bufferSize(10000)
+ .rowIndexStride(1000)
+ .bloomFilterColumns("l")
+ .bloomFilterFpp(0.01);
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ Writer writer = OrcFile.createWriter(testFilePath, options);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ for(int i=0; i < 21000; ++i) {
+ appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
+ words[r1.nextInt(words.length)]);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size > 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump-bloomfilter2.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=2"});
+ System.out.flush();
+ System.setOut(origOut);
+
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+}
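[Editor's note: testDictionaryThreshold above depends on lowering the dictionary key-size threshold so that half-distinct string data disables dictionary encoding. A condensed sketch of just that configuration step, using the OrcConf entry the test itself sets; the class name is illustrative.]

    import org.apache.hadoop.conf.Configuration;
    import org.apache.orc.OrcConf;

    // Condensed sketch of the knob exercised by testDictionaryThreshold:
    // if more than 49% of a string column's rows are distinct, the writer
    // falls back from dictionary to direct encoding.
    public class DictionaryThresholdConfig {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setFloat(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.49f);
        System.out.println(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute()
            + " = " + conf.get(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute()));
      }
    }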
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java b/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java
new file mode 100644
index 0000000..a514824
--- /dev/null
+++ b/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.net.URL;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcConf;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestJsonFileDump {
+ public static String getFileFromClasspath(String name) {
+ URL url = ClassLoader.getSystemResource(name);
+ if (url == null) {
+ throw new IllegalArgumentException("Could not find " + name);
+ }
+ return url.getPath();
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Before
+ public void openFileSystem () throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ fs.setWorkingDirectory(workDir);
+ testFilePath = new Path("TestFileDump.testDump.orc");
+ fs.delete(testFilePath, false);
+ }
+
+ static void checkOutput(String expected,
+ String actual) throws Exception {
+ BufferedReader eStream =
+ new BufferedReader(new FileReader(getFileFromClasspath(expected)));
+ BufferedReader aStream =
+ new BufferedReader(new FileReader(actual));
+ String expectedLine = eStream.readLine();
+ while (expectedLine != null) {
+ String actualLine = aStream.readLine();
+ System.out.println("actual: " + actualLine);
+ System.out.println("expected: " + expectedLine);
+ assertEquals(expectedLine, actualLine);
+ expectedLine = eStream.readLine();
+ }
+ assertNull(eStream.readLine());
+ assertNull(aStream.readLine());
+ }
+
+ @Test
+ public void testJsonDump() throws Exception {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("i", TypeDescription.createInt())
+ .addField("l", TypeDescription.createLong())
+ .addField("s", TypeDescription.createString());
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.ZLIB)
+ .bufferSize(10000)
+ .rowIndexStride(1000)
+ .bloomFilterColumns("s");
+ Writer writer = OrcFile.createWriter(testFilePath, options);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ for(int i=0; i < 21000; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[batch.size] = r1.nextInt();
+ ((LongColumnVector) batch.cols[1]).vector[batch.size] = r1.nextLong();
+ if (i % 100 == 0) {
+ batch.cols[2].noNulls = false;
+ batch.cols[2].isNull[batch.size] = true;
+ } else {
+ ((BytesColumnVector) batch.cols[2]).setVal(batch.size,
+ words[r1.nextInt(words.length)].getBytes());
+ }
+ batch.size += 1;
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size > 0) {
+ writer.addRowBatch(batch);
+ }
+
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump.json";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "-j", "-p", "--rowindex=3"});
+ System.out.flush();
+ System.setOut(origOut);
+
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+}
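The invocation above drives FileDump directly: -j selects JSON output (hence the expected file orc-file-dump.json), --rowindex=3 asks for row-index detail for column id 3, and -p appears to toggle pretty-printing. A minimal standalone sketch of the same call follows; the class name and file path are illustrative placeholders, not part of the patch:

    import org.apache.orc.tools.FileDump;

    public class DumpDriver {
      public static void main(String[] args) throws Exception {
        // Same flags as the test: JSON output, pretty-printed, with
        // row-index details for column id 3.
        FileDump.main(new String[]{"/path/to/file.orc", "-j", "-p", "--rowindex=3"});
      }
    }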
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/resources/orc-file-11-format.orc
----------------------------------------------------------------------
diff --git a/orc/src/test/resources/orc-file-11-format.orc b/orc/src/test/resources/orc-file-11-format.orc
new file mode 100644
index 0000000..41653c8
Binary files /dev/null and b/orc/src/test/resources/orc-file-11-format.orc differ
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/orc/src/test/resources/orc-file-dump-bloomfilter.out b/orc/src/test/resources/orc-file-dump-bloomfilter.out
new file mode 100644
index 0000000..18fd2fb
--- /dev/null
+++ b/orc/src/test/resources/orc-file-dump-bloomfilter.out
@@ -0,0 +1,179 @@
+Structure for TestFileDump.testDump.orc
+File Version: 0.12 with HIVE_13083
+Rows: 21000
+Compression: ZLIB
+Compression size: 4096
+Type: struct<i:int,l:bigint,s:string>
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146021688 max: 2147223299 sum: 515792826
+ Column 2: count: 5000 hasNull: false min: -9218592812243954469 max: 9221614132680747961
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19280
+ Stripe 2:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146733128 max: 2147001622 sum: 7673427
+ Column 2: count: 5000 hasNull: false min: -9220818777591257749 max: 9222259462014003839
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19504
+ Stripe 3:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146993718 max: 2147378179 sum: 132660742551
+ Column 2: count: 5000 hasNull: false min: -9218342074710552826 max: 9222303228623055266
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19641
+ Stripe 4:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146658006 max: 2145520931 sum: 8533549236
+ Column 2: count: 5000 hasNull: false min: -9222758097219661129 max: 9221043130193737406
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19470
+ Stripe 5:
+ Column 0: count: 1000 hasNull: false
+ Column 1: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363
+ Column 2: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476
+ Column 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866
+
+File Statistics:
+ Column 0: count: 21000 hasNull: false
+ Column 1: count: 21000 hasNull: false min: -2146993718 max: 2147378179 sum: 193017464403
+ Column 2: count: 21000 hasNull: false min: -9222758097219661129 max: 9222303228623055266
+ Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
+
+Stripes:
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 951
+ Stream: column 0 section ROW_INDEX start: 3 length 17
+ Stream: column 1 section ROW_INDEX start: 20 length 166
+ Stream: column 2 section ROW_INDEX start: 186 length 169
+ Stream: column 3 section ROW_INDEX start: 355 length 87
+ Stream: column 3 section BLOOM_FILTER start: 442 length 512
+ Stream: column 1 section DATA start: 954 length 20035
+ Stream: column 2 section DATA start: 20989 length 40050
+ Stream: column 3 section DATA start: 61039 length 3543
+ Stream: column 3 section LENGTH start: 64582 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 64607 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3862 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3884 positions: 0,659,149
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3893 positions: 0,1531,3
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3798 positions: 0,2281,32
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3843 positions: 0,3033,45
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe: offset: 64826 data: 63775 rows: 5000 tail: 86 index: 944
+ Stream: column 0 section ROW_INDEX start: 64826 length 17
+ Stream: column 1 section ROW_INDEX start: 64843 length 164
+ Stream: column 2 section ROW_INDEX start: 65007 length 168
+ Stream: column 3 section ROW_INDEX start: 65175 length 83
+ Stream: column 3 section BLOOM_FILTER start: 65258 length 512
+ Stream: column 1 section DATA start: 65770 length 20035
+ Stream: column 2 section DATA start: 85805 length 40050
+ Stream: column 3 section DATA start: 125855 length 3532
+ Stream: column 3 section LENGTH start: 129387 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 129412 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3923 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3869 positions: 0,761,12
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,1472,70
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3931 positions: 0,2250,43
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3964 positions: 0,2978,88
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe: offset: 129631 data: 63787 rows: 5000 tail: 86 index: 950
+ Stream: column 0 section ROW_INDEX start: 129631 length 17
+ Stream: column 1 section ROW_INDEX start: 129648 length 163
+ Stream: column 2 section ROW_INDEX start: 129811 length 168
+ Stream: column 3 section ROW_INDEX start: 129979 length 90
+ Stream: column 3 section BLOOM_FILTER start: 130069 length 512
+ Stream: column 1 section DATA start: 130581 length 20035
+ Stream: column 2 section DATA start: 150616 length 40050
+ Stream: column 3 section DATA start: 190666 length 3544
+ Stream: column 3 section LENGTH start: 194210 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 194235 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 4008 positions: 0,634,174
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3999 positions: 0,1469,69
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,2133,194
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 4000 positions: 0,3005,43
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe: offset: 194454 data: 63817 rows: 5000 tail: 86 index: 952
+ Stream: column 0 section ROW_INDEX start: 194454 length 17
+ Stream: column 1 section ROW_INDEX start: 194471 length 165
+ Stream: column 2 section ROW_INDEX start: 194636 length 167
+ Stream: column 3 section ROW_INDEX start: 194803 length 91
+ Stream: column 3 section BLOOM_FILTER start: 194894 length 512
+ Stream: column 1 section DATA start: 195406 length 20035
+ Stream: column 2 section DATA start: 215441 length 40050
+ Stream: column 3 section DATA start: 255491 length 3574
+ Stream: column 3 section LENGTH start: 259065 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 259090 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3901 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3900 positions: 0,431,431
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3909 positions: 0,1485,52
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3947 positions: 0,2196,104
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3813 positions: 0,2934,131
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe: offset: 259309 data: 12943 rows: 1000 tail: 78 index: 432
+ Stream: column 0 section ROW_INDEX start: 259309 length 12
+ Stream: column 1 section ROW_INDEX start: 259321 length 38
+ Stream: column 2 section ROW_INDEX start: 259359 length 41
+ Stream: column 3 section ROW_INDEX start: 259400 length 40
+ Stream: column 3 section BLOOM_FILTER start: 259440 length 301
+ Stream: column 1 section DATA start: 259741 length 4007
+ Stream: column 2 section DATA start: 263748 length 8010
+ Stream: column 3 section DATA start: 271758 length 768
+ Stream: column 3 section LENGTH start: 272526 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 272551 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866 positions: 0,0,0
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+
+File length: 273307 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+________________________________________________________________________________________________________________________
+
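The derived bloom filter figures in this dump can be recomputed from the raw counts: loadFactor = popCount / bitCount, and expectedFpp = loadFactor ^ numHashFunctions (the chance that an absent key finds all of its k probed bits set). A small sketch checking Entry 0 of column 3 above; the class name is illustrative and the constants are copied from the dump:

    public class BloomFilterStats {
      public static void main(String[] args) {
        int numHashFunctions = 4;   // k, from the dump line
        long bitCount = 6272;       // total bits in the filter
        long popCount = 138;        // bits actually set
        double loadFactor = (double) popCount / bitCount;
        double expectedFpp = Math.pow(loadFactor, numHashFunctions);
        // Prints roughly 0.022 and 2.34E-7, matching
        // "loadFactor: 0.022 expectedFpp: 2.343647E-7" above.
        System.out.println("loadFactor=" + loadFactor + " expectedFpp=" + expectedFpp);
      }
    }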
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/orc/src/test/resources/orc-file-dump-bloomfilter2.out b/orc/src/test/resources/orc-file-dump-bloomfilter2.out
new file mode 100644
index 0000000..fa5cc2d
--- /dev/null
+++ b/orc/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -0,0 +1,179 @@
+Structure for TestFileDump.testDump.orc
+File Version: 0.12 with HIVE_13083
+Rows: 21000
+Compression: ZLIB
+Compression size: 4096
+Type: struct<i:int,l:bigint,s:string>
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146021688 max: 2147223299 sum: 515792826
+ Column 2: count: 5000 hasNull: false min: -9218592812243954469 max: 9221614132680747961
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19280
+ Stripe 2:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146733128 max: 2147001622 sum: 7673427
+ Column 2: count: 5000 hasNull: false min: -9220818777591257749 max: 9222259462014003839
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19504
+ Stripe 3:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146993718 max: 2147378179 sum: 132660742551
+ Column 2: count: 5000 hasNull: false min: -9218342074710552826 max: 9222303228623055266
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19641
+ Stripe 4:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146658006 max: 2145520931 sum: 8533549236
+ Column 2: count: 5000 hasNull: false min: -9222758097219661129 max: 9221043130193737406
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19470
+ Stripe 5:
+ Column 0: count: 1000 hasNull: false
+ Column 1: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363
+ Column 2: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476
+ Column 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866
+
+File Statistics:
+ Column 0: count: 21000 hasNull: false
+ Column 1: count: 21000 hasNull: false min: -2146993718 max: 2147378179 sum: 193017464403
+ Column 2: count: 21000 hasNull: false min: -9222758097219661129 max: 9222303228623055266
+ Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
+
+Stripes:
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 85 index: 6974
+ Stream: column 0 section ROW_INDEX start: 3 length 17
+ Stream: column 1 section ROW_INDEX start: 20 length 166
+ Stream: column 2 section ROW_INDEX start: 186 length 169
+ Stream: column 2 section BLOOM_FILTER start: 355 length 6535
+ Stream: column 3 section ROW_INDEX start: 6890 length 87
+ Stream: column 1 section DATA start: 6977 length 20035
+ Stream: column 2 section DATA start: 27012 length 40050
+ Stream: column 3 section DATA start: 67062 length 3543
+ Stream: column 3 section LENGTH start: 70605 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 70630 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9200577545527640566 max: 9175500305011173751 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9203618157670445774 max: 9208123824411178101 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9218592812243954469 max: 9221351515892923972 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9206585617947511272 max: 9167703224425685487 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9206645795733282496 max: 9221614132680747961 positions: 28693,14,416
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4931 loadFactor: 0.5136 expectedFpp: 0.009432924
+ Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4956 loadFactor: 0.5163 expectedFpp: 0.009772834
+ Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
+ Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
+ Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482
+ Stripe: offset: 70848 data: 63775 rows: 5000 tail: 85 index: 6965
+ Stream: column 0 section ROW_INDEX start: 70848 length 17
+ Stream: column 1 section ROW_INDEX start: 70865 length 164
+ Stream: column 2 section ROW_INDEX start: 71029 length 168
+ Stream: column 2 section BLOOM_FILTER start: 71197 length 6533
+ Stream: column 3 section ROW_INDEX start: 77730 length 83
+ Stream: column 1 section DATA start: 77813 length 20035
+ Stream: column 2 section DATA start: 97848 length 40050
+ Stream: column 3 section DATA start: 137898 length 3532
+ Stream: column 3 section LENGTH start: 141430 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 141455 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9218450653857701562 max: 9189819526332228512 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9220818777591257749 max: 9178821722829648113 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9220031433030423388 max: 9210838931786956852 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9208195729739635607 max: 9222259462014003839 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9174271499932339698 max: 9212277876771676916 positions: 28693,14,416
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
+ Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4988 loadFactor: 0.5196 expectedFpp: 0.010223193
+ Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 5002 loadFactor: 0.521 expectedFpp: 0.01042575
+ Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959
+ Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205
+ Stripe: offset: 141673 data: 63787 rows: 5000 tail: 85 index: 6971
+ Stream: column 0 section ROW_INDEX start: 141673 length 17
+ Stream: column 1 section ROW_INDEX start: 141690 length 163
+ Stream: column 2 section ROW_INDEX start: 141853 length 168
+ Stream: column 2 section BLOOM_FILTER start: 142021 length 6533
+ Stream: column 3 section ROW_INDEX start: 148554 length 90
+ Stream: column 1 section DATA start: 148644 length 20035
+ Stream: column 2 section DATA start: 168679 length 40050
+ Stream: column 3 section DATA start: 208729 length 3544
+ Stream: column 3 section LENGTH start: 212273 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 212298 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9211978436552246208 max: 9179058898902097152 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9195645160817780503 max: 9189147759444307708 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9202888157616520823 max: 9193561362676960747 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9216318198067839390 max: 9221286760675829363 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9218342074710552826 max: 9222303228623055266 positions: 28693,14,416
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4967 loadFactor: 0.5174 expectedFpp: 0.009925688
+ Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 5002 loadFactor: 0.521 expectedFpp: 0.01042575
+ Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 4964 loadFactor: 0.5171 expectedFpp: 0.009883798
+ Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797
+ Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444
+ Stripe: offset: 212516 data: 63817 rows: 5000 tail: 85 index: 6964
+ Stream: column 0 section ROW_INDEX start: 212516 length 17
+ Stream: column 1 section ROW_INDEX start: 212533 length 165
+ Stream: column 2 section ROW_INDEX start: 212698 length 167
+ Stream: column 2 section BLOOM_FILTER start: 212865 length 6524
+ Stream: column 3 section ROW_INDEX start: 219389 length 91
+ Stream: column 1 section DATA start: 219480 length 20035
+ Stream: column 2 section DATA start: 239515 length 40050
+ Stream: column 3 section DATA start: 279565 length 3574
+ Stream: column 3 section LENGTH start: 283139 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 283164 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9222731174895935707 max: 9214167447015056056 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9222758097219661129 max: 9221043130193737406 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9174483776261243438 max: 9208134757538374043 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9174329712613510612 max: 9197412874152820822 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9221162005892422758 max: 9220625004936875965 positions: 28693,14,416
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4951 loadFactor: 0.5157 expectedFpp: 0.009704026
+ Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4969 loadFactor: 0.5176 expectedFpp: 0.009953696
+ Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 4994 loadFactor: 0.5202 expectedFpp: 0.010309587
+ Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649
+ Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165
+ Stripe: offset: 283382 data: 12943 rows: 1000 tail: 78 index: 1468
+ Stream: column 0 section ROW_INDEX start: 283382 length 12
+ Stream: column 1 section ROW_INDEX start: 283394 length 38
+ Stream: column 2 section ROW_INDEX start: 283432 length 41
+ Stream: column 2 section BLOOM_FILTER start: 283473 length 1337
+ Stream: column 3 section ROW_INDEX start: 284810 length 40
+ Stream: column 1 section DATA start: 284850 length 4007
+ Stream: column 2 section DATA start: 288857 length 8010
+ Stream: column 3 section DATA start: 296867 length 768
+ Stream: column 3 section LENGTH start: 297635 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 297660 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476 positions: 0,0,0
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
+
+File length: 298416 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+________________________________________________________________________________________________________________________
+
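Read against the previous dump, this output shows why bloom filter placement matters: here the filters sit on the high-cardinality bigint column 2 rather than the 35-word string column, so each row-group filter runs at roughly 52% load (0.52^7 ≈ 0.01, matching the per-entry expectedFpp values), while the stripe-level merge ORs five such filters together and saturates at ~97% load; 0.9736^7 ≈ 0.829 agrees with the printed expectedFpp of 0.829482, so the merged filter can rule out almost nothing at stripe granularity.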
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/resources/orc-file-dump-dictionary-threshold.out
----------------------------------------------------------------------
diff --git a/orc/src/test/resources/orc-file-dump-dictionary-threshold.out b/orc/src/test/resources/orc-file-dump-dictionary-threshold.out
new file mode 100644
index 0000000..17a964b
--- /dev/null
+++ b/orc/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -0,0 +1,190 @@
+Structure for TestFileDump.testDump.orc
+File Version: 0.12 with HIVE_13083
+Rows: 21000
+Compression: ZLIB
+Compression size: 4096
+Type: struct<i:int,l:bigint,s:string>
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2147115959 max: 2145911404 sum: 159677169195
+ Column 2: count: 5000 hasNull: false min: -9216505819108477308 max: 9217851628057711416
+ Column 3: count: 5000 hasNull: false min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744 sum: 381254
+ Stripe 2:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2147390285 max: 2147224606 sum: -14961457759
+ Column 2: count: 5000 hasNull: false min: -9222178666167296739 max: 9221301751385928177
+ Column 3: count: 5000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938 sum: 1117994
+ Stripe 3:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2145842720 max: 2146718321 sum: 141092475520
+ Column 2: count: 5000 hasNull: false min: -9221963099397084326 max: 9222722740629726770
+ Column 3: count: 5000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974 sum: 1925226
+ Stripe 4:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2145378214 max: 2147453086 sum: -153680004530
+ Column 2: count: 5000 hasNull: false min: -9222731174895935707 max: 9222919052987871506
+ Column 3: count: 5000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904 sum: 2815002
+ Stripe 5:
+ Column 0: count: 1000 hasNull: false
+ Column 1: count: 1000 hasNull: false min: -2143595397 max: 2136858458 sum: -22999664100
+ Column 2: count: 1000 hasNull: false min: -9212379634781416464 max: 9197412874152820822
+ Column 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210-17268-17786-17962-18214-18444-18446-18724-18912-18952-19164-19348-19400-19546-19776-19896-20084 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 sum: 670762
+
+File Statistics:
+ Column 0: count: 21000 hasNull: false
+ Column 1: count: 21000 hasNull: false min: -2147390285 max: 2147453086 sum: 109128518326
+ Column 2: count: 21000 hasNull: false min: -9222731174895935707 max: 9222919052987871506
+ Column 3: count: 21000 hasNull: false min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 sum: 6910238
+
+Stripes:
+ Stripe: offset: 3 data: 163602 rows: 5000 tail: 68 index: 720
+ Stream: column 0 section ROW_INDEX start: 3 length 17
+ Stream: column 1 section ROW_INDEX start: 20 length 166
+ Stream: column 2 section ROW_INDEX start: 186 length 171
+ Stream: column 3 section ROW_INDEX start: 357 length 366
+ Stream: column 1 section DATA start: 723 length 20035
+ Stream: column 2 section DATA start: 20758 length 40050
+ Stream: column 3 section DATA start: 60808 length 99226
+ Stream: column 3 section LENGTH start: 160034 length 4291
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DIRECT_V2
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2132329551 max: 2145911404 sum: 61941331718 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -2138433136 max: 2145210552 sum: 14574030042 positions: 0,2050,488
+ Entry 2: count: 1000 hasNull: false min: -2147115959 max: 2137805337 sum: -2032493169 positions: 4099,2054,464
+ Entry 3: count: 1000 hasNull: false min: -2137828953 max: 2145877119 sum: -3167202608 positions: 8198,2058,440
+ Entry 4: count: 1000 hasNull: false min: -2146452517 max: 2142394906 sum: 88361503212 positions: 12297,2062,416
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9206837518492372266 max: 9169230975203934579 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9188878639954124284 max: 9213664245516510068 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9211329013123260308 max: 9217851628057711416 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9185745718227889962 max: 9181722705210917931 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9216505819108477308 max: 9196474183833079923 positions: 28693,14,416
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness,-230 max: worst-54-290-346-648-908-996 sum: 18442 positions: 0,0,0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966 sum: 46338 positions: 4767,2058,0,695,18
+ Entry 2: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660 sum: 75448 positions: 16464,3340,0,1554,14
+ Entry 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788 sum: 104868 positions: 36532,964,0,2372,90
+ Entry 4: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744 sum: 136158 positions: 63067,3432,0,3354,108
+ Stripe: offset: 164393 data: 368335 rows: 5000 tail: 69 index: 956
+ Stream: column 0 section ROW_INDEX start: 164393 length 17
+ Stream: column 1 section ROW_INDEX start: 164410 length 157
+ Stream: column 2 section ROW_INDEX start: 164567 length 166
+ Stream: column 3 section ROW_INDEX start: 164733 length 616
+ Stream: column 1 section DATA start: 165349 length 20035
+ Stream: column 2 section DATA start: 185384 length 40050
+ Stream: column 3 section DATA start: 225434 length 302715
+ Stream: column 3 section LENGTH start: 528149 length 5535
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DIRECT_V2
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2146021688 max: 2146838901 sum: -50979197646 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -2143569489 max: 2141223179 sum: 22810066834 positions: 0,2050,488
+ Entry 2: count: 1000 hasNull: false min: -2140649392 max: 2146301701 sum: -31694882346 positions: 4099,2054,464
+ Entry 3: count: 1000 hasNull: false min: -2147390285 max: 2146299933 sum: 79371934221 positions: 8198,2058,440
+ Entry 4: count: 1000 hasNull: false min: -2145928262 max: 2147224606 sum: -34469378822 positions: 12297,2062,416
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9222178666167296739 max: 9191250610515369723 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9220148577547102875 max: 9213945522531717278 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9220818777591257749 max: 9221301751385928177 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9220031433030423388 max: 9207856144487414148 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9201438531577205959 max: 9212462124593119846 positions: 28693,14,416
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726 sum: 166320 positions: 0,0,0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994 sum: 193436 positions: 43833,2480,0,967,90
+ Entry 2: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988 sum: 224740 positions: 94117,3404,0,1945,222
+ Entry 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984 sum: 252094 positions: 155111,2864,0,3268,48
+ Entry 4: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938 sum: 281404 positions: 224570,1006,0,4064,342
+ Stripe: offset: 533753 data: 606074 rows: 5000 tail: 69 index: 1427
+ Stream: column 0 section ROW_INDEX start: 533753 length 17
+ Stream: column 1 section ROW_INDEX start: 533770 length 167
+ Stream: column 2 section ROW_INDEX start: 533937 length 168
+ Stream: column 3 section ROW_INDEX start: 534105 length 1075
+ Stream: column 1 section DATA start: 535180 length 20035
+ Stream: column 2 section DATA start: 555215 length 40050
+ Stream: column 3 section DATA start: 595265 length 540210
+ Stream: column 3 section LENGTH start: 1135475 length 5779
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DIRECT_V2
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2138229212 max: 2144818981 sum: -22823642812 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -2145842720 max: 2144179881 sum: -12562754334 positions: 0,2050,488
+ Entry 2: count: 1000 hasNull: false min: -2143045885 max: 2146718321 sum: 82993638644 positions: 4099,2054,464
+ Entry 3: count: 1000 hasNull: false min: -2144745617 max: 2146570474 sum: 25138722367 positions: 8198,2058,440
+ Entry 4: count: 1000 hasNull: false min: -2140127150 max: 2135081620 sum: 68346511655 positions: 12297,2062,416
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9204340807292138409 max: 9208698732685326961 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9221963099397084326 max: 9222722740629726770 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9210480084701091299 max: 9207767402467343058 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9195038026813631215 max: 9199201928563274421 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9215483580266514322 max: 9220102792864959501 positions: 28693,14,416
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876 sum: 313880 positions: 0,0,0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964 sum: 349542 positions: 87800,2584,0,1097,28
+ Entry 2: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976 sum: 386538 positions: 185635,3966,0,2077,162
+ Entry 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766 sum: 421660 positions: 295550,1384,0,3369,16
+ Entry 4: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974 sum: 453606 positions: 412768,1156,0,4041,470
+ Stripe: offset: 1141323 data: 864001 rows: 5000 tail: 69 index: 1975
+ Stream: column 0 section ROW_INDEX start: 1141323 length 17
+ Stream: column 1 section ROW_INDEX start: 1141340 length 156
+ Stream: column 2 section ROW_INDEX start: 1141496 length 168
+ Stream: column 3 section ROW_INDEX start: 1141664 length 1634
+ Stream: column 1 section DATA start: 1143298 length 20035
+ Stream: column 2 section DATA start: 1163333 length 40050
+ Stream: column 3 section DATA start: 1203383 length 798014
+ Stream: column 3 section LENGTH start: 2001397 length 5902
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DIRECT_V2
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2145319330 max: 2146998132 sum: -50856753363 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -2134288866 max: 2147453086 sum: -17911019023 positions: 0,2050,488
+ Entry 2: count: 1000 hasNull: false min: -2139010804 max: 2144727593 sum: -24993151857 positions: 4099,2054,464
+ Entry 3: count: 1000 hasNull: false min: -2145378214 max: 2144098933 sum: -18055164052 positions: 8198,2058,440
+ Entry 4: count: 1000 hasNull: false min: -2140494429 max: 2144595861 sum: -41863916235 positions: 12297,2062,416
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9172774601303513941 max: 9212917101275642143 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9218164880949195469 max: 9222919052987871506 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9222731174895935707 max: 9214167447015056056 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9196276654247395117 max: 9210639275226058005 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9197393848859294562 max: 9208134757538374043 positions: 28693,14,416
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610 sum: 492916 positions: 0,0,0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936 sum: 527290 positions: 139298,1396,0,1077,140
+ Entry 2: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878 sum: 568274 positions: 286457,302,0,1926,462
+ Entry 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210-17268-17786-17962-18214 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788 sum: 594578 positions: 447943,3328,0,3444,250
+ Entry 4: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210-17268-17786-17962-18214-18444-18446-18724-18912-18952-19164 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904 sum: 631944 positions: 616471,3986,3778,547,292
+ Stripe: offset: 2007368 data: 207295 rows: 1000 tail: 67 index: 841
+ Stream: column 0 section ROW_INDEX start: 2007368 length 12
+ Stream: column 1 section ROW_INDEX start: 2007380 length 38
+ Stream: column 2 section ROW_INDEX start: 2007418 length 41
+ Stream: column 3 section ROW_INDEX start: 2007459 length 750
+ Stream: column 1 section DATA start: 2008209 length 4007
+ Stream: column 2 section DATA start: 2012216 length 8010
+ Stream: column 3 section DATA start: 2020226 length 194018
+ Stream: column 3 section LENGTH start: 2214244 length 1260
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DIRECT_V2
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2143595397 max: 2136858458 sum: -22999664100 positions: 0,0,0
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9212379634781416464 max: 9197412874152820822 positions: 0,0,0
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210-17268-17786-17962-18214-18444-18446-18724-18912-18952-19164-19348-19400-19546-19776-19896-20084 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-
7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 sum: 670762 positions: 0,0,0,0,0
+
+File length: 2217685 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+________________________________________________________________________________________________________________________
+
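
For orientation, in the file dump above each "Stripe:" line reports the
stripe's byte layout (index, data and footer sections), each "Stream:" line
one per-column section inside it, and each row-group "Entry:" the column
statistics plus a "positions:" tuple recording where that row group starts
within the column's streams, so a reader can seek straight to it. A minimal
sketch of recovering the stripe-level numbers through the public reader API;
it assumes the org.apache.orc classes this patch series moves code into, and
the PrintStripes name and args[0] path are placeholders:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.OrcFile;
    import org.apache.orc.Reader;
    import org.apache.orc.StripeInformation;

    public class PrintStripes {
      public static void main(String[] args) throws Exception {
        // Open the file with the core ORC reader and walk its stripe list;
        // these are the same numbers the "Stripe:" lines above report.
        Reader reader = OrcFile.createReader(new Path(args[0]),
            OrcFile.readerOptions(new Configuration()));
        System.out.println("rows: " + reader.getNumberOfRows());
        for (StripeInformation stripe : reader.getStripes()) {
          System.out.println("offset: " + stripe.getOffset()
              + " index: " + stripe.getIndexLength()
              + " data: " + stripe.getDataLength()
              + " tail: " + stripe.getFooterLength()
              + " rows: " + stripe.getNumberOfRows());
        }
      }
    }
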
[10/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java
deleted file mode 100644
index 6d1c256..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java
+++ /dev/null
@@ -1,2525 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.math.BigInteger;
-import java.sql.Timestamp;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.TimeZone;
-
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
-import org.apache.hadoop.hive.serde2.io.ByteWritable;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
-import org.apache.hadoop.hive.serde2.io.ShortWritable;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
-import org.apache.hadoop.hive.shims.HadoopShims.TextReaderShim;
-import org.apache.hadoop.hive.shims.ShimLoader;
-import org.apache.hadoop.io.BooleanWritable;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.impl.BitFieldReader;
-import org.apache.orc.impl.DynamicByteArray;
-import org.apache.orc.impl.InStream;
-import org.apache.orc.impl.IntegerReader;
-import org.apache.orc.OrcProto;
-import org.apache.orc.impl.PositionProvider;
-import org.apache.orc.impl.RunLengthByteReader;
-import org.apache.orc.impl.RunLengthIntegerReader;
-import org.apache.orc.impl.RunLengthIntegerReaderV2;
-import org.apache.orc.impl.SerializationUtils;
-import org.apache.orc.impl.StreamName;
-
-/**
- * Factory for creating ORC tree readers.
- */
-public class TreeReaderFactory {
-
- public abstract static class TreeReader {
- protected final int columnId;
- protected BitFieldReader present = null;
- protected boolean valuePresent = false;
- protected int vectorColumnCount;
-
- TreeReader(int columnId) throws IOException {
- this(columnId, null);
- }
-
- protected TreeReader(int columnId, InStream in) throws IOException {
- this.columnId = columnId;
- if (in == null) {
- present = null;
- valuePresent = true;
- } else {
- present = new BitFieldReader(in, 1);
- }
- vectorColumnCount = -1;
- }
-
- void setVectorColumnCount(int vectorColumnCount) {
- this.vectorColumnCount = vectorColumnCount;
- }
-
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- static IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind,
- InStream in,
- boolean signed, boolean skipCorrupt) throws IOException {
- switch (kind) {
- case DIRECT_V2:
- case DICTIONARY_V2:
- return new RunLengthIntegerReaderV2(in, signed, skipCorrupt);
- case DIRECT:
- case DICTIONARY:
- return new RunLengthIntegerReader(in, signed);
- default:
- throw new IllegalArgumentException("Unknown encoding " + kind);
- }
- }
-
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- checkEncoding(stripeFooter.getColumnsList().get(columnId));
- InStream in = streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.PRESENT));
- if (in == null) {
- present = null;
- valuePresent = true;
- } else {
- present = new BitFieldReader(in, 1);
- }
- }
-
- /**
- * Seek to the given position.
- *
- * @param index the indexes loaded from the file
- * @throws IOException
- */
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- public void seek(PositionProvider index) throws IOException {
- if (present != null) {
- present.seek(index);
- }
- }
-
- protected long countNonNulls(long rows) throws IOException {
- if (present != null) {
- long result = 0;
- for (long c = 0; c < rows; ++c) {
- if (present.next() == 1) {
- result += 1;
- }
- }
- return result;
- } else {
- return rows;
- }
- }
-
- abstract void skipRows(long rows) throws IOException;
-
- void readValuePresent() throws IOException {
- if (present != null) {
- valuePresent = present.next() == 1;
- }
- }
-
- Object next(Object previous) throws IOException {
- if (present != null) {
- valuePresent = present.next() == 1;
- }
- return previous;
- }
-
- /**
- * Called at the top level to read into the given batch.
- * @param batch the batch to read into
- * @param batchSize the number of rows to read
- * @throws IOException
- */
- public void nextBatch(VectorizedRowBatch batch,
- int batchSize) throws IOException {
- batch.cols[0].reset();
- batch.cols[0].ensureSize(batchSize, false);
- nextVector(batch.cols[0], null, batchSize);
- }
-
- /**
- * Populates the isNull vector array in the previousVector object based on
- * the present stream values. This function is called from all the child
-     * readers, and they all set their values based on the isNull field.
- *
- * @param previous The columnVector object whose isNull value is populated
-     * @param isNull Whether each value was null at a higher level. If
- * isNull is null, all values are non-null.
- * @param batchSize Size of the column vector
- * @throws IOException
- */
- public void nextVector(ColumnVector previous,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (present != null || isNull != null) {
-        // Set noNulls and the isNull vector of the ColumnVector based on
-        // the present stream.
- previous.noNulls = true;
- boolean allNull = true;
- for (int i = 0; i < batchSize; i++) {
- if (isNull == null || !isNull[i]) {
- if (present != null && present.next() != 1) {
- previous.noNulls = false;
- previous.isNull[i] = true;
- } else {
- previous.isNull[i] = false;
- allNull = false;
- }
- } else {
- previous.noNulls = false;
- previous.isNull[i] = true;
- }
- }
- previous.isRepeating = !previous.noNulls && allNull;
- } else {
-        // There is no present stream, which means that all the values are
- // present.
- previous.noNulls = true;
- for (int i = 0; i < batchSize; i++) {
- previous.isNull[i] = false;
- }
- }
- }
-
- public BitFieldReader getPresent() {
- return present;
- }
- }
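
The null handling in nextVector above is the contract every subclass builds
on: the PRESENT stream supplies one bit per value (1 means present), and an
isNull mask handed down from an enclosing struct, list, map or union can mark
a row null before PRESENT is even consulted. A standalone sketch of that
merge, with the bit stream modeled as a boolean array; the mergeNulls name is
illustrative and not part of this factory:

    // Models TreeReader.nextVector's merge of a parent isNull mask with
    // PRESENT bits (true == value present). PRESENT bits are consumed only
    // for rows the parent has not already marked null, as in the loop above.
    static boolean[] mergeNulls(boolean[] parentIsNull, boolean[] presentBits,
                                int batchSize) {
      boolean[] isNull = new boolean[batchSize];
      int p = 0;
      for (int i = 0; i < batchSize; i++) {
        if (parentIsNull != null && parentIsNull[i]) {
          isNull[i] = true;                 // null at a higher level
        } else if (presentBits != null) {
          isNull[i] = !presentBits[p++];    // consult the PRESENT stream
        } else {
          isNull[i] = false;                // no PRESENT stream: all present
        }
      }
      return isNull;
    }
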
-
- public static class NullTreeReader extends TreeReader {
-
- public NullTreeReader(int columnId) throws IOException {
- super(columnId);
- }
-
- @Override
- public void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter footer) {
- // PASS
- }
-
- @Override
- void skipRows(long rows) {
- // PASS
- }
-
- @Override
- public void seek(PositionProvider position) {
- // PASS
- }
-
- @Override
- public void seek(PositionProvider[] position) {
- // PASS
- }
-
- @Override
- Object next(Object previous) {
- return null;
- }
-
- @Override
- public void nextVector(ColumnVector vector, boolean[] isNull, final int batchSize) {
- vector.noNulls = false;
- vector.isNull[0] = true;
- vector.isRepeating = true;
- }
- }
-
- public static class BooleanTreeReader extends TreeReader {
- protected BitFieldReader reader = null;
-
- BooleanTreeReader(int columnId) throws IOException {
- this(columnId, null, null);
- }
-
- protected BooleanTreeReader(int columnId, InStream present, InStream data) throws IOException {
- super(columnId, present);
- if (data != null) {
- reader = new BitFieldReader(data, 1);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- reader = new BitFieldReader(streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA)), 1);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- BooleanWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new BooleanWritable();
- } else {
- result = (BooleanWritable) previous;
- }
- result.set(reader.next() == 1);
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, batchSize);
- }
- }
-
- public static class ByteTreeReader extends TreeReader {
- protected RunLengthByteReader reader = null;
-
- ByteTreeReader(int columnId) throws IOException {
- this(columnId, null, null);
- }
-
- protected ByteTreeReader(int columnId, InStream present, InStream data) throws IOException {
- super(columnId, present);
- this.reader = new RunLengthByteReader(data);
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- reader = new RunLengthByteReader(streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA)));
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- ByteWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new ByteWritable();
- } else {
- result = (ByteWritable) previous;
- }
- result.set(reader.next());
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
-
- public static class ShortTreeReader extends TreeReader {
- protected IntegerReader reader = null;
-
- ShortTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null);
- }
-
- protected ShortTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding)
- throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- this.reader = createIntegerReader(encoding.getKind(), data, true, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), true, false);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- ShortWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new ShortWritable();
- } else {
- result = (ShortWritable) previous;
- }
- result.set((short) reader.next());
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
-
- public static class IntTreeReader extends TreeReader {
- protected IntegerReader reader = null;
-
- IntTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null);
- }
-
- protected IntTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding)
- throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- this.reader = createIntegerReader(encoding.getKind(), data, true, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), true, false);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- IntWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new IntWritable();
- } else {
- result = (IntWritable) previous;
- }
- result.set((int) reader.next());
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
-
- public static class LongTreeReader extends TreeReader {
- protected IntegerReader reader = null;
-
- LongTreeReader(int columnId, boolean skipCorrupt) throws IOException {
- this(columnId, null, null, null, skipCorrupt);
- }
-
- protected LongTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding,
- boolean skipCorrupt)
- throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- this.reader = createIntegerReader(encoding.getKind(), data, true, skipCorrupt);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), true, false);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- LongWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new LongWritable();
- } else {
- result = (LongWritable) previous;
- }
- result.set(reader.next());
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
-
- public static class FloatTreeReader extends TreeReader {
- protected InStream stream;
- private final SerializationUtils utils;
-
- FloatTreeReader(int columnId) throws IOException {
- this(columnId, null, null);
- }
-
- protected FloatTreeReader(int columnId, InStream present, InStream data) throws IOException {
- super(columnId, present);
- this.utils = new SerializationUtils();
- this.stream = data;
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- stream = streams.get(name);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- stream.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- FloatWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new FloatWritable();
- } else {
- result = (FloatWritable) previous;
- }
- result.set(utils.readFloat(stream));
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final DoubleColumnVector result = (DoubleColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- final boolean hasNulls = !result.noNulls;
- boolean allNulls = hasNulls;
-
- if (hasNulls) {
-        // loop bounds written so the JIT can elide array bounds checks
- for (int i = 0; batchSize <= result.isNull.length && i < batchSize; i++) {
- allNulls = allNulls & result.isNull[i];
- }
- if (allNulls) {
- result.vector[0] = Double.NaN;
- result.isRepeating = true;
- } else {
- // some nulls
- result.isRepeating = false;
-          // loop bounds written so the JIT can elide array bounds checks
- for (int i = 0; batchSize <= result.isNull.length
- && batchSize <= result.vector.length && i < batchSize; i++) {
- if (!result.isNull[i]) {
- result.vector[i] = utils.readFloat(stream);
- } else {
- // If the value is not present then set NaN
- result.vector[i] = Double.NaN;
- }
- }
- }
- } else {
- // no nulls & > 1 row (check repeating)
- boolean repeating = (batchSize > 1);
- final float f1 = utils.readFloat(stream);
- result.vector[0] = f1;
-        // loop bounds written so the JIT can elide array bounds checks
- for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
- final float f2 = utils.readFloat(stream);
- repeating = repeating && (f1 == f2);
- result.vector[i] = f2;
- }
- result.isRepeating = repeating;
- }
- }
-
- @Override
- protected void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- for (int i = 0; i < items; ++i) {
- utils.readFloat(stream);
- }
- }
- }
-
- public static class DoubleTreeReader extends TreeReader {
- protected InStream stream;
- private final SerializationUtils utils;
-
- DoubleTreeReader(int columnId) throws IOException {
- this(columnId, null, null);
- }
-
- protected DoubleTreeReader(int columnId, InStream present, InStream data) throws IOException {
- super(columnId, present);
- this.utils = new SerializationUtils();
- this.stream = data;
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name =
- new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- stream = streams.get(name);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- stream.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- DoubleWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new DoubleWritable();
- } else {
- result = (DoubleWritable) previous;
- }
- result.set(utils.readDouble(stream));
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final DoubleColumnVector result = (DoubleColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- final boolean hasNulls = !result.noNulls;
- boolean allNulls = hasNulls;
-
- if (hasNulls) {
-        // loop bounds written so the JIT can elide array bounds checks
- for (int i = 0; i < batchSize && batchSize <= result.isNull.length; i++) {
- allNulls = allNulls & result.isNull[i];
- }
- if (allNulls) {
- result.vector[0] = Double.NaN;
- result.isRepeating = true;
- } else {
- // some nulls
- result.isRepeating = false;
-          // loop bounds written so the JIT can elide array bounds checks
- for (int i = 0; batchSize <= result.isNull.length
- && batchSize <= result.vector.length && i < batchSize; i++) {
- if (!result.isNull[i]) {
- result.vector[i] = utils.readDouble(stream);
- } else {
- // If the value is not present then set NaN
- result.vector[i] = Double.NaN;
- }
- }
- }
- } else {
- // no nulls
- boolean repeating = (batchSize > 1);
- final double d1 = utils.readDouble(stream);
- result.vector[0] = d1;
-        // loop bounds written so the JIT can elide array bounds checks
- for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
- final double d2 = utils.readDouble(stream);
- repeating = repeating && (d1 == d2);
- result.vector[i] = d2;
- }
- result.isRepeating = repeating;
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long len = items * 8;
- while (len > 0) {
- len -= stream.skip(len);
- }
- }
- }
-
- public static class BinaryTreeReader extends TreeReader {
- protected InStream stream;
- protected IntegerReader lengths = null;
- protected final LongColumnVector scratchlcv;
-
- BinaryTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null, null);
- }
-
- protected BinaryTreeReader(int columnId, InStream present, InStream data, InStream length,
- OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present);
- scratchlcv = new LongColumnVector();
- this.stream = data;
- if (length != null && encoding != null) {
- checkEncoding(encoding);
- this.lengths = createIntegerReader(encoding.getKind(), length, false, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- stream = streams.get(name);
- lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), false, false);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- stream.seek(index);
- lengths.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- BytesWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new BytesWritable();
- } else {
- result = (BytesWritable) previous;
- }
- int len = (int) lengths.next();
- result.setSize(len);
- int offset = 0;
- while (len > 0) {
- int written = stream.read(result.getBytes(), offset, len);
- if (written < 0) {
- throw new EOFException("Can't finish byte read from " + stream);
- }
- len -= written;
- offset += written;
- }
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final BytesColumnVector result = (BytesColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- BytesColumnVectorUtil.readOrcByteArrays(stream, lengths, scratchlcv, result, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long lengthToSkip = 0;
- for (int i = 0; i < items; ++i) {
- lengthToSkip += lengths.next();
- }
- while (lengthToSkip > 0) {
- lengthToSkip -= stream.skip(lengthToSkip);
- }
- }
- }
-
- public static class TimestampTreeReader extends TreeReader {
- protected IntegerReader data = null;
- protected IntegerReader nanos = null;
- private final boolean skipCorrupt;
- private Map<String, Long> baseTimestampMap;
- private long base_timestamp;
- private final TimeZone readerTimeZone;
- private TimeZone writerTimeZone;
- private boolean hasSameTZRules;
-
- TimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
- this(columnId, null, null, null, null, skipCorrupt);
- }
-
- protected TimestampTreeReader(int columnId, InStream presentStream, InStream dataStream,
- InStream nanosStream, OrcProto.ColumnEncoding encoding, boolean skipCorrupt)
- throws IOException {
- super(columnId, presentStream);
- this.skipCorrupt = skipCorrupt;
- this.baseTimestampMap = new HashMap<>();
- this.readerTimeZone = TimeZone.getDefault();
- this.writerTimeZone = readerTimeZone;
- this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
- this.base_timestamp = getBaseTimestamp(readerTimeZone.getID());
- if (encoding != null) {
- checkEncoding(encoding);
-
- if (dataStream != null) {
- this.data = createIntegerReader(encoding.getKind(), dataStream, true, skipCorrupt);
- }
-
- if (nanosStream != null) {
- this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt);
- }
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- data = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA)), true, skipCorrupt);
- nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt);
- base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone());
- }
-
- private long getBaseTimestamp(String timeZoneId) throws IOException {
- // to make sure new readers read old files in the same way
- if (timeZoneId == null || timeZoneId.isEmpty()) {
- timeZoneId = readerTimeZone.getID();
- }
-
- if (!baseTimestampMap.containsKey(timeZoneId)) {
- writerTimeZone = TimeZone.getTimeZone(timeZoneId);
- hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- sdf.setTimeZone(writerTimeZone);
- try {
- long epoch =
- sdf.parse(WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND;
- baseTimestampMap.put(timeZoneId, epoch);
- return epoch;
- } catch (ParseException e) {
- throw new IOException("Unable to create base timestamp", e);
- } finally {
- sdf.setTimeZone(readerTimeZone);
- }
- }
-
- return baseTimestampMap.get(timeZoneId);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- data.seek(index);
- nanos.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- TimestampWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new TimestampWritable();
- } else {
- result = (TimestampWritable) previous;
- }
- long millis = (data.next() + base_timestamp) * WriterImpl.MILLIS_PER_SECOND;
- int newNanos = parseNanos(nanos.next());
-        // fix the rounding from the division by 1000.
- if (millis >= 0) {
- millis += newNanos / WriterImpl.NANOS_PER_MILLI;
- } else {
- millis -= newNanos / WriterImpl.NANOS_PER_MILLI;
- }
- long offset = 0;
-        // If reader and writer time zones have different rules, adjust for the
-        // offset difference between them, taking daylight saving time into account.
- if (!hasSameTZRules) {
- offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis);
- }
- long adjustedMillis = millis + offset;
- Timestamp ts = new Timestamp(adjustedMillis);
-        // The reader time zone's offset may be different at the adjusted instant
-        // (for example across a daylight saving transition). If the reader offset
-        // changed after applying adjustedMillis, recompute using the offset at that
-        // point in time.
- if (!hasSameTZRules &&
- (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) {
- long newOffset =
- writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis);
- adjustedMillis = millis + newOffset;
- ts.setTime(adjustedMillis);
- }
- ts.setNanos(newNanos);
- result.set(ts);
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- TimestampColumnVector result = (TimestampColumnVector) previousVector;
- super.nextVector(previousVector, isNull, batchSize);
-
- for (int i = 0; i < batchSize; i++) {
- if (result.noNulls || !result.isNull[i]) {
- long millis = data.next() + base_timestamp;
- int newNanos = parseNanos(nanos.next());
- if (millis < 0 && newNanos != 0) {
- millis -= 1;
- }
- millis *= WriterImpl.MILLIS_PER_SECOND;
- long offset = 0;
-          // If reader and writer time zones have different rules, adjust for the
-          // offset difference between them, taking daylight saving time into account.
- if (!hasSameTZRules) {
- offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis);
- }
- long adjustedMillis = millis + offset;
-          // The reader time zone's offset may be different at the adjusted instant
-          // (for example across a daylight saving transition). If the reader offset
-          // changed after applying adjustedMillis, recompute using the offset at
-          // that point in time.
- if (!hasSameTZRules &&
- (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) {
- long newOffset =
- writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis);
- adjustedMillis = millis + newOffset;
- }
- result.time[i] = adjustedMillis;
- result.nanos[i] = newNanos;
- if (result.isRepeating && i != 0 &&
- (result.time[0] != result.time[i] ||
- result.nanos[0] != result.nanos[i])) {
- result.isRepeating = false;
- }
- }
- }
- }
-
- private static int parseNanos(long serialized) {
- int zeros = 7 & (int) serialized;
- int result = (int) (serialized >>> 3);
- if (zeros != 0) {
- for (int i = 0; i <= zeros; ++i) {
- result *= 10;
- }
- }
- return result;
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- data.skip(items);
- nanos.skip(items);
- }
- }
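
parseNanos above undoes a small trailing-zero compression in the SECONDARY
stream: the low three bits hold a zero count, and a non-zero count means the
remaining bits are scaled up by 10^(count + 1). A worked decode that follows
only the parseNanos arithmetic shown here (the matching encode lives on the
writer side; NanosSketch is just an illustrative wrapper):

    public class NanosSketch {
      public static void main(String[] args) {
        // Decode 47 as parseNanos does: low 3 bits = 7, high bits = 5,
        // so the value expands to 5 * 10^(7+1) = 500,000,000 ns (0.5 s).
        long serialized = 47;                   // i.e. 5 << 3 | 7
        int zeros = 7 & (int) serialized;       // 7
        int result = (int) (serialized >>> 3);  // 5
        if (zeros != 0) {
          for (int i = 0; i <= zeros; ++i) {    // zeros + 1 multiplications
            result *= 10;
          }
        }
        System.out.println(result);             // 500000000
        // A zero count of 0 means no scaling: a value with no trailing
        // zeros to strip, such as 123456789, decodes from 123456789L << 3.
      }
    }
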
-
- public static class DateTreeReader extends TreeReader {
- protected IntegerReader reader = null;
-
- DateTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null);
- }
-
- protected DateTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- reader = createIntegerReader(encoding.getKind(), data, true, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), true, false);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- DateWritable result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new DateWritable();
- } else {
- result = (DateWritable) previous;
- }
- result.set((int) reader.next());
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
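
The DATA stream behind DateTreeReader is just one signed integer per value,
the day count since the Unix epoch, which result.set((int) reader.next())
feeds to DateWritable. A small sketch of that reconstruction; the day values
are made up and DateSketch is an illustrative wrapper:

    import org.apache.hadoop.hive.serde2.io.DateWritable;

    public class DateSketch {
      public static void main(String[] args) {
        // ORC stores dates as days since 1970-01-01.
        DateWritable d = new DateWritable();
        d.set(0);
        System.out.println(d.get());    // 1970-01-01
        d.set(16000);
        System.out.println(d.get());    // 2013-10-22
      }
    }
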
-
- public static class DecimalTreeReader extends TreeReader {
- protected InStream valueStream;
- protected IntegerReader scaleReader = null;
- private int[] scratchScaleVector;
-
- private final int precision;
- private final int scale;
-
- DecimalTreeReader(int columnId, int precision, int scale) throws IOException {
- this(columnId, precision, scale, null, null, null, null);
- }
-
- protected DecimalTreeReader(int columnId, int precision, int scale, InStream present,
- InStream valueStream, InStream scaleStream, OrcProto.ColumnEncoding encoding)
- throws IOException {
- super(columnId, present);
- this.precision = precision;
- this.scale = scale;
- this.scratchScaleVector = new int[VectorizedRowBatch.DEFAULT_SIZE];
- this.valueStream = valueStream;
- if (scaleStream != null && encoding != null) {
- checkEncoding(encoding);
- this.scaleReader = createIntegerReader(encoding.getKind(), scaleStream, true, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- valueStream = streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA));
- scaleReader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId, OrcProto.Stream.Kind.SECONDARY)), true, false);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- valueStream.seek(index);
- scaleReader.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- final HiveDecimalWritable result;
- if (valuePresent) {
- if (previous == null) {
- result = new HiveDecimalWritable();
- } else {
- result = (HiveDecimalWritable) previous;
- }
- result.set(HiveDecimal.create(SerializationUtils.readBigInteger
- (valueStream), (int) scaleReader.next()));
- return HiveDecimalWritable.enforcePrecisionScale(result, precision,
- scale);
- }
- return null;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final DecimalColumnVector result = (DecimalColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- if (batchSize > scratchScaleVector.length) {
- scratchScaleVector = new int[(int) batchSize];
- }
- scaleReader.nextVector(result, scratchScaleVector, batchSize);
- // Read value entries based on isNull entries
- if (result.noNulls) {
- for (int r=0; r < batchSize; ++r) {
- BigInteger bInt = SerializationUtils.readBigInteger(valueStream);
- HiveDecimal dec = HiveDecimal.create(bInt, scratchScaleVector[r]);
- result.set(r, dec);
- }
- } else if (!result.isRepeating || !result.isNull[0]) {
- for (int r=0; r < batchSize; ++r) {
- if (!result.isNull[r]) {
- BigInteger bInt = SerializationUtils.readBigInteger(valueStream);
- HiveDecimal dec = HiveDecimal.create(bInt, scratchScaleVector[r]);
- result.set(r, dec);
- }
- }
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- for (int i = 0; i < items; i++) {
- SerializationUtils.readBigInteger(valueStream);
- }
- scaleReader.skip(items);
- }
- }
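
A decimal is split across two streams: DATA carries the variable-length
unscaled integer and SECONDARY carries the per-value scale, and
HiveDecimal.create combines them exactly as next() does above. The final
combination in isolation, with made-up numbers; DecimalSketch is an
illustrative wrapper:

    import java.math.BigInteger;
    import org.apache.hadoop.hive.common.type.HiveDecimal;

    public class DecimalSketch {
      public static void main(String[] args) {
        // Unscaled value 12345 with scale 2 reconstitutes as 123.45.
        HiveDecimal dec = HiveDecimal.create(BigInteger.valueOf(12345), 2);
        System.out.println(dec);    // 123.45
      }
    }
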
-
- /**
- * A tree reader that will read string columns. At the start of the
- * stripe, it creates an internal reader based on whether a direct or
- * dictionary encoding was used.
- */
- public static class StringTreeReader extends TreeReader {
- protected TreeReader reader;
-
- StringTreeReader(int columnId) throws IOException {
- super(columnId);
- }
-
- protected StringTreeReader(int columnId, InStream present, InStream data, InStream length,
- InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present);
- if (encoding != null) {
- switch (encoding.getKind()) {
- case DIRECT:
- case DIRECT_V2:
- reader = new StringDirectTreeReader(columnId, present, data, length,
- encoding.getKind());
- break;
- case DICTIONARY:
- case DICTIONARY_V2:
- reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary,
- encoding);
- break;
- default:
- throw new IllegalArgumentException("Unsupported encoding " +
- encoding.getKind());
- }
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- reader.checkEncoding(encoding);
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- // For each stripe, checks the encoding and initializes the appropriate
- // reader
- switch (stripeFooter.getColumnsList().get(columnId).getKind()) {
- case DIRECT:
- case DIRECT_V2:
- reader = new StringDirectTreeReader(columnId);
- break;
- case DICTIONARY:
- case DICTIONARY_V2:
- reader = new StringDictionaryTreeReader(columnId);
- break;
- default:
- throw new IllegalArgumentException("Unsupported encoding " +
- stripeFooter.getColumnsList().get(columnId).getKind());
- }
- reader.startStripe(streams, stripeFooter);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- reader.seek(index);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- reader.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- return reader.next(previous);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- reader.nextVector(previousVector, isNull, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skipRows(items);
- }
- }
-
-  // This class collects the closely related methods for reading an ORC vector
-  // of byte arrays and creating the BytesColumnVector.
- //
- public static class BytesColumnVectorUtil {
-
- private static byte[] commonReadByteArrays(InStream stream, IntegerReader lengths,
- LongColumnVector scratchlcv,
- BytesColumnVector result, final int batchSize) throws IOException {
- // Read lengths
- scratchlcv.isNull = result.isNull; // Notice we are replacing the isNull vector here...
- lengths.nextVector(scratchlcv, scratchlcv.vector, batchSize);
- int totalLength = 0;
- if (!scratchlcv.isRepeating) {
- for (int i = 0; i < batchSize; i++) {
- if (!scratchlcv.isNull[i]) {
- totalLength += (int) scratchlcv.vector[i];
- }
- }
- } else {
- if (!scratchlcv.isNull[0]) {
- totalLength = (int) (batchSize * scratchlcv.vector[0]);
- }
- }
-
- // Read all the strings for this batch
- byte[] allBytes = new byte[totalLength];
- int offset = 0;
- int len = totalLength;
- while (len > 0) {
- int bytesRead = stream.read(allBytes, offset, len);
- if (bytesRead < 0) {
- throw new EOFException("Can't finish byte read from " + stream);
- }
- len -= bytesRead;
- offset += bytesRead;
- }
-
- return allBytes;
- }
-
-    // This method has the common code for reading byte arrays into a BytesColumnVector.
- public static void readOrcByteArrays(InStream stream,
- IntegerReader lengths,
- LongColumnVector scratchlcv,
- BytesColumnVector result,
- final int batchSize) throws IOException {
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- byte[] allBytes = commonReadByteArrays(stream, lengths, scratchlcv,
- result, (int) batchSize);
-
- // Too expensive to figure out 'repeating' by comparisons.
- result.isRepeating = false;
- int offset = 0;
- if (!scratchlcv.isRepeating) {
- for (int i = 0; i < batchSize; i++) {
- if (!scratchlcv.isNull[i]) {
- result.setRef(i, allBytes, offset, (int) scratchlcv.vector[i]);
- offset += scratchlcv.vector[i];
- } else {
- result.setRef(i, allBytes, 0, 0);
- }
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!scratchlcv.isNull[i]) {
- result.setRef(i, allBytes, offset, (int) scratchlcv.vector[0]);
- offset += scratchlcv.vector[0];
- } else {
- result.setRef(i, allBytes, 0, 0);
- }
- }
- }
- }
- }
- }
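
readOrcByteArrays trades many small allocations for one large read: it sums
the batch's lengths, pulls all the bytes into a single backing array, and
publishes each value as an (offset, length) view through setRef. The slicing
arithmetic on a made-up batch; SetRefSketch is an illustrative wrapper:

    import java.nio.charset.StandardCharsets;

    public class SetRefSketch {
      public static void main(String[] args) {
        // Values "hi", "" and "orc" read back-to-back into one buffer and
        // then referenced by (offset, length), as result.setRef does above.
        byte[] allBytes = "hiorc".getBytes(StandardCharsets.UTF_8);
        int[] lengths = {2, 0, 3};
        int offset = 0;
        for (int i = 0; i < lengths.length; i++) {
          System.out.println(new String(allBytes, offset, lengths[i],
              StandardCharsets.UTF_8));
          offset += lengths[i];
        }
      }
    }
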
-
- /**
- * A reader for string columns that are direct encoded in the current
- * stripe.
- */
- public static class StringDirectTreeReader extends TreeReader {
- protected InStream stream;
- protected TextReaderShim data;
- protected IntegerReader lengths;
- private final LongColumnVector scratchlcv;
-
- StringDirectTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null, null);
- }
-
- protected StringDirectTreeReader(int columnId, InStream present, InStream data,
- InStream length, OrcProto.ColumnEncoding.Kind encoding) throws IOException {
- super(columnId, present);
- this.scratchlcv = new LongColumnVector();
- this.stream = data;
- if (length != null && encoding != null) {
- this.lengths = createIntegerReader(encoding, length, false, false);
- this.data = ShimLoader.getHadoopShims().getTextReaderShim(this.stream);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT &&
- encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- stream = streams.get(name);
- data = ShimLoader.getHadoopShims().getTextReaderShim(this.stream);
- lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)),
- false, false);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- stream.seek(index);
- // don't seek data stream
- lengths.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- Text result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new Text();
- } else {
- result = (Text) previous;
- }
- int len = (int) lengths.next();
- data.read(result, len);
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final BytesColumnVector result = (BytesColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- BytesColumnVectorUtil.readOrcByteArrays(stream, lengths, scratchlcv,
- result, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long lengthToSkip = 0;
- for (int i = 0; i < items; ++i) {
- lengthToSkip += lengths.next();
- }
-
- while (lengthToSkip > 0) {
- lengthToSkip -= stream.skip(lengthToSkip);
- }
- }
-
- public IntegerReader getLengths() {
- return lengths;
- }
-
- public InStream getStream() {
- return stream;
- }
- }
-
- /**
- * A reader for string columns that are dictionary encoded in the current
- * stripe.
- */
- public static class StringDictionaryTreeReader extends TreeReader {
- private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];
- private DynamicByteArray dictionaryBuffer;
- private int[] dictionaryOffsets;
- protected IntegerReader reader;
-
- private byte[] dictionaryBufferInBytesCache = null;
- private final LongColumnVector scratchlcv;
-
- StringDictionaryTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null, null, null);
- }
-
- protected StringDictionaryTreeReader(int columnId, InStream present, InStream data,
- InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding)
- throws IOException {
- super(columnId, present);
- scratchlcv = new LongColumnVector();
- if (data != null && encoding != null) {
- this.reader = createIntegerReader(encoding.getKind(), data, false, false);
- }
-
- if (dictionary != null && encoding != null) {
- readDictionaryStream(dictionary);
- }
-
- if (length != null && encoding != null) {
- readDictionaryLengthStream(length, encoding);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY &&
- encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
-
- // read the dictionary blob
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DICTIONARY_DATA);
- InStream in = streams.get(name);
- readDictionaryStream(in);
-
- // read the lengths
- name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
- in = streams.get(name);
- readDictionaryLengthStream(in, stripeFooter.getColumnsList().get(columnId));
-
- // set up the row reader
- name = new StreamName(columnId, OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), false, false);
- }
-
- private void readDictionaryLengthStream(InStream in, OrcProto.ColumnEncoding encoding)
- throws IOException {
- int dictionarySize = encoding.getDictionarySize();
- if (in != null) { // Guard against empty LENGTH stream.
- IntegerReader lenReader = createIntegerReader(encoding.getKind(), in, false, false);
- int offset = 0;
- if (dictionaryOffsets == null ||
- dictionaryOffsets.length < dictionarySize + 1) {
- dictionaryOffsets = new int[dictionarySize + 1];
- }
- for (int i = 0; i < dictionarySize; ++i) {
- dictionaryOffsets[i] = offset;
- offset += (int) lenReader.next();
- }
- dictionaryOffsets[dictionarySize] = offset;
- in.close();
- }
-
- }
-
- private void readDictionaryStream(InStream in) throws IOException {
- if (in != null) { // Guard against empty dictionary stream.
- if (in.available() > 0) {
- dictionaryBuffer = new DynamicByteArray(64, in.available());
- dictionaryBuffer.readAll(in);
-        // Since it's the start of a stripe, invalidate the cache.
- dictionaryBufferInBytesCache = null;
- }
- in.close();
- } else {
- dictionaryBuffer = null;
- }
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- Text result = null;
- if (valuePresent) {
- int entry = (int) reader.next();
- if (previous == null) {
- result = new Text();
- } else {
- result = (Text) previous;
- }
- int offset = dictionaryOffsets[entry];
- int length = getDictionaryEntryLength(entry, offset);
-        // If the column contains only empty strings, the dictionary size will
-        // be zero and the buffer will be null; in that case just return the
-        // result, which defaults to empty.
- if (dictionaryBuffer != null) {
- dictionaryBuffer.setText(result, offset, length);
- } else {
- result.clear();
- }
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final BytesColumnVector result = (BytesColumnVector) previousVector;
- int offset;
- int length;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- if (dictionaryBuffer != null) {
-
- // Load dictionaryBuffer into cache.
- if (dictionaryBufferInBytesCache == null) {
- dictionaryBufferInBytesCache = dictionaryBuffer.get();
- }
-
- // Read string offsets
- scratchlcv.isNull = result.isNull;
- scratchlcv.ensureSize((int) batchSize, false);
- reader.nextVector(scratchlcv, scratchlcv.vector, batchSize);
- if (!scratchlcv.isRepeating) {
-
-          // The vector has non-repeating strings. Iterate through the batch
- // and set strings one by one
- for (int i = 0; i < batchSize; i++) {
- if (!scratchlcv.isNull[i]) {
- offset = dictionaryOffsets[(int) scratchlcv.vector[i]];
- length = getDictionaryEntryLength((int) scratchlcv.vector[i], offset);
- result.setRef(i, dictionaryBufferInBytesCache, offset, length);
- } else {
- // If the value is null then set offset and length to zero (null string)
- result.setRef(i, dictionaryBufferInBytesCache, 0, 0);
- }
- }
- } else {
- // If the value is repeating then just set the first value in the
-          // vector and set the isRepeating flag to true. No need to iterate through and
- // set all the elements to the same value
- offset = dictionaryOffsets[(int) scratchlcv.vector[0]];
- length = getDictionaryEntryLength((int) scratchlcv.vector[0], offset);
- result.setRef(0, dictionaryBufferInBytesCache, offset, length);
- }
- result.isRepeating = scratchlcv.isRepeating;
- } else {
- if (dictionaryOffsets == null) {
- // Entire stripe contains null strings.
- result.isRepeating = true;
- result.noNulls = false;
- result.isNull[0] = true;
- result.setRef(0, EMPTY_BYTE_ARRAY, 0, 0);
- } else {
- // stripe contains nulls and empty strings
- for (int i = 0; i < batchSize; i++) {
- if (!result.isNull[i]) {
- result.setRef(i, EMPTY_BYTE_ARRAY, 0, 0);
- }
- }
- }
- }
- }
-
- int getDictionaryEntryLength(int entry, int offset) {
- final int length;
- // If it isn't the last entry, subtract the offsets; otherwise use
- // the buffer length.
- if (entry < dictionaryOffsets.length - 1) {
- length = dictionaryOffsets[entry + 1] - offset;
- } else {
- length = dictionaryBuffer.size() - offset;
- }
- return length;
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
-
- public IntegerReader getReader() {
- return reader;
- }
- }
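
Reader's note: the dictionary path above reduces to two parallel structures, a single byte blob holding every distinct value back to back, and an offsets array with one extra slot so that entry i spans offsets[i] to offsets[i+1]. A minimal standalone sketch of the decode, with plain arrays standing in for DynamicByteArray and IntegerReader (the class name and data below are illustrative, not Hive code):

import java.nio.charset.StandardCharsets;

public class DictionaryDecodeSketch {
  public static void main(String[] args) {
    // Dictionary blob: "apple", "fig", "pear" concatenated back to back.
    byte[] blob = "applefigpear".getBytes(StandardCharsets.UTF_8);
    // dictionarySize + 1 offsets, so every length is a simple subtraction.
    int[] offsets = {0, 5, 8, 12};
    // The row data is just indexes into the dictionary.
    int[] rows = {2, 0, 0, 1};
    for (int r : rows) {
      int start = offsets[r];
      int length = offsets[r + 1] - start;
      System.out.println(new String(blob, start, length, StandardCharsets.UTF_8));
    }
  }
}

The sentinel slot written at dictionaryOffsets[dictionarySize] plays the same role for the last entry that getDictionaryEntryLength otherwise covers with dictionaryBuffer.size().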
-
- public static class CharTreeReader extends StringTreeReader {
- int maxLength;
-
- CharTreeReader(int columnId, int maxLength) throws IOException {
- this(columnId, maxLength, null, null, null, null, null);
- }
-
- protected CharTreeReader(int columnId, int maxLength, InStream present, InStream data,
- InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present, data, length, dictionary, encoding);
- this.maxLength = maxLength;
- }
-
- @Override
- Object next(Object previous) throws IOException {
- final HiveCharWritable result;
- if (previous == null) {
- result = new HiveCharWritable();
- } else {
- result = (HiveCharWritable) previous;
- }
- // Use the string reader implementation to populate the internal Text value
- Object textVal = super.next(result.getTextValue());
- if (textVal == null) {
- return null;
- }
- // result should now hold the value that was read in.
- // enforce char length
- result.enforceMaxLength(maxLength);
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- // Get the vector of strings from StringTreeReader, then make a 2nd pass to
- // adjust down the length (right trim and truncate) if necessary.
- super.nextVector(previousVector, isNull, batchSize);
- BytesColumnVector result = (BytesColumnVector) previousVector;
- int adjustedDownLen;
- if (result.isRepeating) {
- if (result.noNulls || !result.isNull[0]) {
- adjustedDownLen = StringExpr
- .rightTrimAndTruncate(result.vector[0], result.start[0], result.length[0], maxLength);
- if (adjustedDownLen < result.length[0]) {
- result.setRef(0, result.vector[0], result.start[0], adjustedDownLen);
- }
- }
- } else {
- if (result.noNulls) {
- for (int i = 0; i < batchSize; i++) {
- adjustedDownLen = StringExpr
- .rightTrimAndTruncate(result.vector[i], result.start[i], result.length[i],
- maxLength);
- if (adjustedDownLen < result.length[i]) {
- result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
- }
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!result.isNull[i]) {
- adjustedDownLen = StringExpr
- .rightTrimAndTruncate(result.vector[i], result.start[i], result.length[i],
- maxLength);
- if (adjustedDownLen < result.length[i]) {
- result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
- }
- }
- }
- }
- }
- }
- }
-
- public static class VarcharTreeReader extends StringTreeReader {
- int maxLength;
-
- VarcharTreeReader(int columnId, int maxLength) throws IOException {
- this(columnId, maxLength, null, null, null, null, null);
- }
-
- protected VarcharTreeReader(int columnId, int maxLength, InStream present, InStream data,
- InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present, data, length, dictionary, encoding);
- this.maxLength = maxLength;
- }
-
- @Override
- Object next(Object previous) throws IOException {
- final HiveVarcharWritable result;
- if (previous == null) {
- result = new HiveVarcharWritable();
- } else {
- result = (HiveVarcharWritable) previous;
- }
- // Use the string reader implementation to populate the internal Text value
- Object textVal = super.next(result.getTextValue());
- if (textVal == null) {
- return null;
- }
- // result should now hold the value that was read in.
- // enforce varchar length
- result.enforceMaxLength(maxLength);
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- // Get the vector of strings from StringTreeReader, then make a 2nd pass to
- // adjust down the length (truncate) if necessary.
- super.nextVector(previousVector, isNull, batchSize);
- BytesColumnVector result = (BytesColumnVector) previousVector;
-
- int adjustedDownLen;
- if (result.isRepeating) {
- if (result.noNulls || !result.isNull[0]) {
- adjustedDownLen = StringExpr
- .truncate(result.vector[0], result.start[0], result.length[0], maxLength);
- if (adjustedDownLen < result.length[0]) {
- result.setRef(0, result.vector[0], result.start[0], adjustedDownLen);
- }
- }
- } else {
- if (result.noNulls) {
- for (int i = 0; i < batchSize; i++) {
- adjustedDownLen = StringExpr
- .truncate(result.vector[i], result.start[i], result.length[i], maxLength);
- if (adjustedDownLen < result.length[i]) {
- result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
- }
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!result.isNull[i]) {
- adjustedDownLen = StringExpr
- .truncate(result.vector[i], result.start[i], result.length[i], maxLength);
- if (adjustedDownLen < result.length[i]) {
- result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
- }
- }
- }
- }
- }
- }
- }
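
The only difference between CharTreeReader and VarcharTreeReader above is the length rule: CHAR right-trims trailing spaces and then caps at maxLength, while VARCHAR only caps. A rough sketch of the two rules on raw bytes (a reimplementation for illustration that assumes single-byte ASCII; the real StringExpr helpers also have to respect UTF-8 character boundaries):

public class CharLengthSketch {
  // CHAR rule: drop trailing spaces, then cap at maxLength (ASCII-only sketch).
  static int rightTrimAndTruncate(byte[] bytes, int start, int length, int maxLength) {
    int end = start + length;
    while (end > start && bytes[end - 1] == ' ') {
      end--;
    }
    return Math.min(end - start, maxLength);
  }

  // VARCHAR rule: cap at maxLength only.
  static int truncate(int length, int maxLength) {
    return Math.min(length, maxLength);
  }

  public static void main(String[] args) {
    byte[] bytes = "abcd   ".getBytes(java.nio.charset.StandardCharsets.US_ASCII);
    System.out.println(rightTrimAndTruncate(bytes, 0, bytes.length, 3));  // 3 ("abc")
    System.out.println(rightTrimAndTruncate(bytes, 0, bytes.length, 10)); // 4 ("abcd")
    System.out.println(truncate(bytes.length, 5));                        // 5
  }
}

Returning an adjusted length instead of copying bytes is what lets the vectorized paths above shrink an entry with setRef rather than allocating.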
-
- protected static class StructTreeReader extends TreeReader {
- protected final TreeReader[] fields;
-
- protected StructTreeReader(int columnId,
- TypeDescription readerSchema,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
- super(columnId);
-
- TypeDescription fileSchema = evolution.getFileType(readerSchema);
-
- List<TypeDescription> childrenTypes = readerSchema.getChildren();
- this.fields = new TreeReader[childrenTypes.size()];
- for (int i = 0; i < fields.length; ++i) {
- TypeDescription subtype = childrenTypes.get(i);
- this.fields[i] = createTreeReader(subtype, evolution, included, skipCorrupt);
- }
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- super.seek(index);
- for (TreeReader kid : fields) {
- if (kid != null) {
- kid.seek(index);
- }
- }
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- OrcStruct result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new OrcStruct(fields.length);
- } else {
- result = (OrcStruct) previous;
-
- // If the input format was initialized with a file that has a
- // different number of fields, the number of fields needs to
- // be updated to the correct number.
- result.setNumFields(fields.length);
- }
- for (int i = 0; i < fields.length; ++i) {
- if (fields[i] != null) {
- result.setFieldValue(i, fields[i].next(result.getFieldValue(i)));
- }
- }
- }
- return result;
- }
-
- @Override
- public void nextBatch(VectorizedRowBatch batch,
- int batchSize) throws IOException {
- for(int i=0; i < fields.length &&
- (vectorColumnCount == -1 || i < vectorColumnCount); ++i) {
- batch.cols[i].reset();
- batch.cols[i].ensureSize((int) batchSize, false);
- fields[i].nextVector(batch.cols[i], null, batchSize);
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- super.nextVector(previousVector, isNull, batchSize);
- StructColumnVector result = (StructColumnVector) previousVector;
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- result.isRepeating = false;
-
- // Read all the members of struct as column vectors
- boolean[] mask = result.noNulls ? null : result.isNull;
- for (int f = 0; f < fields.length; f++) {
- if (fields[f] != null) {
- fields[f].nextVector(result.fields[f], mask, batchSize);
- }
- }
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- for (TreeReader field : fields) {
- if (field != null) {
- field.startStripe(streams, stripeFooter);
- }
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- for (TreeReader field : fields) {
- if (field != null) {
- field.skipRows(items);
- }
- }
- }
- }
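
One detail worth calling out in StructTreeReader.nextVector: the struct's own isNull array is passed down as the mask, so child readers only materialize values for rows where the struct itself is present. A toy illustration of the masking idea (plain arrays and made-up values; the real child readers pull from their own streams):

public class NullMaskSketch {
  public static void main(String[] args) {
    // Parent struct null flags for a 5-row batch.
    boolean[] parentIsNull = {false, true, false, false, true};
    long[] childVector = new long[parentIsNull.length];
    long next = 100;
    for (int r = 0; r < parentIsNull.length; ++r) {
      if (!parentIsNull[r]) {
        childVector[r] = next++;  // pretend this value came off the stream
      }
      // Masked rows are simply never filled in; nothing will read them.
    }
    System.out.println(java.util.Arrays.toString(childVector));
  }
}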
-
- public static class UnionTreeReader extends TreeReader {
- protected final TreeReader[] fields;
- protected RunLengthByteReader tags;
-
- protected UnionTreeReader(int fileColumn,
- TypeDescription readerSchema,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
- super(fileColumn);
- List<TypeDescription> childrenTypes = readerSchema.getChildren();
- int fieldCount = childrenTypes.size();
- this.fields = new TreeReader[fieldCount];
- for (int i = 0; i < fieldCount; ++i) {
- TypeDescription subtype = childrenTypes.get(i);
- this.fields[i] = createTreeReader(subtype, evolution, included, skipCorrupt);
- }
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- super.seek(index);
- tags.seek(index[columnId]);
- for (TreeReader kid : fields) {
- kid.seek(index);
- }
- }
-
- @Override
- Object next(Object previous) throws IOException {
- super.next(previous);
- OrcUnion result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new OrcUnion();
- } else {
- result = (OrcUnion) previous;
- }
- byte tag = tags.next();
- Object previousVal = result.getObject();
- result.set(tag, fields[tag].next(tag == result.getTag() ?
- previousVal : null));
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- UnionColumnVector result = (UnionColumnVector) previousVector;
- super.nextVector(result, isNull, batchSize);
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- result.isRepeating = false;
- tags.nextVector(result.noNulls ? null : result.isNull, result.tags,
- batchSize);
- boolean[] ignore = new boolean[(int) batchSize];
- for (int f = 0; f < result.fields.length; ++f) {
- // build the ignore list for this tag
- for (int r = 0; r < batchSize; ++r) {
- ignore[r] = (!result.noNulls && result.isNull[r]) ||
- result.tags[r] != f;
- }
- fields[f].nextVector(result.fields[f], ignore, batchSize);
- }
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- tags = new RunLengthByteReader(streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA)));
- for (TreeReader field : fields) {
- if (field != null) {
- field.startStripe(streams, stripeFooter);
- }
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long[] counts = new long[fields.length];
- for (int i = 0; i < items; ++i) {
- counts[tags.next()] += 1;
- }
- for (int i = 0; i < counts.length; ++i) {
- fields[i].skipRows(counts[i]);
- }
- }
- }
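
The skipRows logic above is the subtle part of the union reader: each child stream holds only the rows tagged for it, so skipping n union rows means reading n tags, tallying them per child, and then skipping each child by its own tally. A small sketch of the counting step (arrays stand in for the tag reader and child readers):

public class UnionSkipSketch {
  public static void main(String[] args) {
    // Tag stream for the next 8 union values: which child holds each row.
    byte[] tags = {0, 1, 1, 0, 2, 1, 0, 2};
    long[] counts = new long[3];
    for (byte tag : tags) {
      counts[tag] += 1;
    }
    // Each child would now be told to skip its own count of rows.
    System.out.println(java.util.Arrays.toString(counts)); // [3, 3, 2]
  }
}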
-
- public static class ListTreeReader extends TreeReader {
- protected final TreeReader elementReader;
- protected IntegerReader lengths = null;
-
- protected ListTreeReader(int fileColumn,
- TypeDescription readerSchema,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
- super(fileColumn);
- TypeDescription elementType = readerSchema.getChildren().get(0);
- elementReader = createTreeReader(elementType, evolution, included,
- skipCorrupt);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- super.seek(index);
- lengths.seek(index[columnId]);
- elementReader.seek(index);
- }
-
- @Override
- @SuppressWarnings("unchecked")
- Object next(Object previous) throws IOException {
- super.next(previous);
- List<Object> result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new ArrayList<>();
- } else {
- result = (ArrayList<Object>) previous;
- }
- int prevLength = result.size();
- int length = (int) lengths.next();
- // extend the list to the new length
- for (int i = prevLength; i < length; ++i) {
- result.add(null);
- }
- // read the new elements into the array
- for (int i = 0; i < length; i++) {
- result.set(i, elementReader.next(i < prevLength ?
- result.get(i) : null));
- }
- // remove any extra elements
- for (int i = prevLength - 1; i >= length; --i) {
- result.remove(i);
- }
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previous,
- boolean[] isNull,
- final int batchSize) throws IOException {
- ListColumnVector result = (ListColumnVector) previous;
- super.nextVector(result, isNull, batchSize);
- // if we have some non-null values, then read them
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- lengths.nextVector(result, result.lengths, batchSize);
- // even with repeating lengths, the list doesn't repeat
- result.isRepeating = false;
- // build the offsets vector and figure out how many children to read
- result.childCount = 0;
- for (int r = 0; r < batchSize; ++r) {
- if (result.noNulls || !result.isNull[r]) {
- result.offsets[r] = result.childCount;
- result.childCount += result.lengths[r];
- }
- }
- result.child.ensureSize(result.childCount, false);
- elementReader.nextVector(result.child, null, result.childCount);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.LENGTH)), false, false);
- if (elementReader != null) {
- elementReader.startStripe(streams, stripeFooter);
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long childSkip = 0;
- for (long i = 0; i < items; ++i) {
- childSkip += lengths.next();
- }
- elementReader.skipRows(childSkip);
- }
- }
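
The offsets/childCount pass above is how a whole batch of variable-length lists collapses into one flat read of the child column. A compact sketch of that pass with arrays in place of ListColumnVector (the values are made up; note that a null row's length is skipped rather than added):

public class ListOffsetsSketch {
  public static void main(String[] args) {
    // Per-row list lengths for a 4-row batch; row 2 is null, so its
    // length slot holds garbage and must be ignored.
    long[] lengths = {2, 3, 7, 1};
    boolean[] isNull = {false, false, true, false};
    long[] offsets = new long[lengths.length];
    long childCount = 0;
    for (int r = 0; r < lengths.length; ++r) {
      if (!isNull[r]) {
        offsets[r] = childCount;   // where this row's elements start
        childCount += lengths[r];  // total child values to read in one go
      }
    }
    System.out.println(java.util.Arrays.toString(offsets) + ", total=" + childCount);
    // prints [0, 2, 0, 5], total=6
  }
}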
-
- public static class MapTreeReader extends TreeReader {
- protected final TreeReader keyReader;
- protected final TreeReader valueReader;
- protected IntegerReader lengths = null;
-
- protected MapTreeReader(int fileColumn,
- TypeDescription readerSchema,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
- super(fileColumn);
- TypeDescription keyType = readerSchema.getChildren().get(0);
- TypeDescription valueType = readerSchema.getChildren().get(1);
- keyReader = createTreeReader(keyType, evolution, included, skipCorrupt);
- valueReader = createTreeReader(valueType, evolution, included, skipCorrupt);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- super.seek(index);
- lengths.seek(index[columnId]);
- keyReader.seek(index);
- valueReader.seek(index);
- }
-
- @Override
- @SuppressWarnings("unchecked")
- Object next(Object previous) throws IOException {
- super.next(previous);
- Map<Object, Object> result = null;
- if (valuePresent) {
- if (previous == null) {
- result = new LinkedHashMap<>();
- } else {
- result = (LinkedHashMap<Object, Object>) previous;
- }
- // for now just clear and create new objects
- result.clear();
- int length = (int) lengths.next();
- // read the new elements into the array
- for (int i = 0; i < length; i++) {
- result.put(keyReader.next(null), valueReader.next(null));
- }
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previous,
- boolean[] isNull,
- final int batchSize) throws IOException {
- MapColumnVector result = (MapColumnVector) previous;
- super.nextVector(result, isNull, batchSize);
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- lengths.nextVector(result, result.lengths, batchSize);
- // even with repeating lengths, the map doesn't repeat
- result.isRepeating = false;
- // build the offsets vector and figure out how many children to read
- result.childCount = 0;
- for (int r = 0; r < batchSize; ++r) {
- if (result.noNulls || !result.isNull[r]) {
- result.offsets[r] = result.childCount;
- result.childCount += result.lengths[r];
- }
- }
- result.keys.ensureSize(result.childCount, false);
- result.values.ensureSize(result.childCount, false);
- keyReader.nextVector(result.keys, null, result.childCount);
- valueReader.nextVector(result.values, null, result.childCount);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.LENGTH)), false, false);
- if (keyReader != null) {
- keyReader.startStripe(streams, stripeFooter);
- }
- if (valueReader != null) {
- valueReader.startStripe(streams, stripeFooter);
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long childSkip = 0;
- for (long i = 0; i < items; ++i) {
- childSkip += lengths.next();
- }
- keyReader.skipRows(childSkip);
- valueReader.skipRows(childSkip);
- }
- }
-
- public static TreeReader createTreeReader(TypeDescription readerType,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt
- ) throws IOException {
- TypeDescription fileType = evolution.getFileType(readerType);
- if (fileType == null ||
- (included != null && !included[readerType.getId()])) {
- return new NullTreeReader(0);
- }
- TypeDescription.Category readerTypeCategory = readerType.getCategory();
- if (!fileType.getCategory().equals(readerTypeCategory) &&
- (readerTypeCategory != TypeDescription.Category.STRUCT &&
- readerTypeCategory != TypeDescription.Category.MAP &&
- readerTypeCategory != TypeDescription.Category.LIST &&
- readerTypeCategory != TypeDescription.Category.UNION)) {
- // We only convert complex children.
- return ConvertTreeReaderFactory.createConvertTreeReader(readerType, evolution,
- included, skipCorrupt);
- }
- switch (readerTypeCategory) {
- case BOOLEAN:
- return new BooleanTreeReader(fileType.getId());
- case BYTE:
- return new ByteTreeReader(fileType.getId());
- case DOUBLE:
- return new DoubleTreeReader(fileType.getId());
- case FLOAT:
- return new FloatTreeReader(fileType.getId());
- case SHORT:
- return new ShortTreeReader(fileType.getId());
- case INT:
- return new IntTreeReader(fileType.getId());
- case LONG:
- return new LongTreeReader(fileType.getId(), skipCorrupt);
- case STRING:
- return new StringTreeReader(fileType.getId());
- case CHAR:
- return new CharTreeReader(fileType.getId(), readerType.getMaxLength());
- case VARCHAR:
- return new VarcharTreeReader(fileType.getId(), readerType.getMaxLength());
- case BINARY:
- return new BinaryTreeReader(fileType.getId());
- case TIMESTAMP:
- return new TimestampTreeReader(fileType.getId(), skipCorrupt);
- case DATE:
- return new DateTreeReader(fileType.getId());
- case DECIMAL:
- return new DecimalTreeReader(fileType.getId(), readerType.getPrecision(),
- readerType.getScale());
- case STRUCT:
- return new StructTreeReader(fileType.getId(), readerType,
- evolution, included, skipCorrupt);
- case LIST:
- return new ListTreeReader(fileType.getId(), readerType,
- evolution, included, skipCorrupt);
- case MAP:
- return new MapTreeReader(fileType.getId(), readerType, evolution,
- included, skipCorrupt);
- case UNION:
- return new UnionTreeReader(fileType.getId(), readerType,
- evolution, included, skipCorrupt);
- default:
- throw new IllegalArgumentException("Unsupported type " +
- readerTypeCategory);
- }
- }
-}
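
For orientation, createTreeReader above makes a three-way choice: a NullTreeReader when the column is missing from the file or excluded, a converting reader when a scalar category differs between the file schema and the reader schema, and a direct reader otherwise (complex types recurse and convert only at their children). A schematic sketch of that dispatch, with strings standing in for the reader classes and a deliberately tiny Category enum (both simplifications, not the real types):

public class DispatchSketch {
  enum Category { INT, STRING, STRUCT }

  static String choose(Category fileType, Category readerType, boolean included) {
    if (fileType == null || !included) {
      return "NullTreeReader";
    }
    boolean complex = readerType == Category.STRUCT;  // stands in for STRUCT/MAP/LIST/UNION
    if (fileType != readerType && !complex) {
      return "ConvertTreeReader";  // scalar type changed between file and reader
    }
    return "direct " + readerType + " reader";
  }

  public static void main(String[] args) {
    System.out.println(choose(Category.INT, Category.INT, true));     // direct INT reader
    System.out.println(choose(Category.INT, Category.STRING, true));  // ConvertTreeReader
    System.out.println(choose(null, Category.INT, true));             // NullTreeReader
  }
}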
[14/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConvertTreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConvertTreeReaderFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConvertTreeReaderFactory.java
deleted file mode 100644
index 74a097e..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ConvertTreeReaderFactory.java
+++ /dev/null
@@ -1,3750 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.nio.charset.StandardCharsets;
-import java.sql.Date;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.EnumMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
-import org.apache.hadoop.hive.serde2.io.ByteWritable;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
-import org.apache.hadoop.hive.serde2.io.ShortWritable;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
-import org.apache.hadoop.io.BooleanWritable;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.orc.OrcProto;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.TypeDescription.Category;
-import org.apache.orc.impl.InStream;
-import org.apache.orc.impl.PositionProvider;
-import org.apache.orc.impl.StreamName;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Convert ORC tree readers.
- */
-public class ConvertTreeReaderFactory extends TreeReaderFactory {
-
- private static final Logger LOG =
- LoggerFactory.getLogger(ConvertTreeReaderFactory.class);
-
- /**
- * Override methods like checkEncoding to pass-thru to the convert TreeReader.
- */
- public static class ConvertTreeReader extends TreeReader {
-
- private TreeReader convertTreeReader;
-
- ConvertTreeReader(int columnId) throws IOException {
- super(columnId);
- }
-
- private static List<TypeDescription.Category> numericTypeList = new ArrayList<TypeDescription.Category>();
-
- // The ordering of types here is used to determine which numeric types
- // are common/convertible to one another. Probably better to rely on the
- // ordering explicitly defined here than to assume that the enum values
- // that were arbitrarily assigned in PrimitiveCategory work for our purposes.
- private static EnumMap<TypeDescription.Category, Integer> numericTypes =
- new EnumMap<TypeDescription.Category, Integer>(TypeDescription.Category.class);
-
- static {
- registerNumericType(TypeDescription.Category.BOOLEAN, 1);
- registerNumericType(TypeDescription.Category.BYTE, 2);
- registerNumericType(TypeDescription.Category.SHORT, 3);
- registerNumericType(TypeDescription.Category.INT, 4);
- registerNumericType(TypeDescription.Category.LONG, 5);
- registerNumericType(TypeDescription.Category.FLOAT, 6);
- registerNumericType(TypeDescription.Category.DOUBLE, 7);
- registerNumericType(TypeDescription.Category.DECIMAL, 8);
- }
-
- private static void registerNumericType(TypeDescription.Category kind, int level) {
- numericTypeList.add(kind);
- numericTypes.put(kind, level);
- }
-
- protected void setConvertTreeReader(TreeReader convertTreeReader) {
- this.convertTreeReader = convertTreeReader;
- }
-
- protected TreeReader getStringGroupTreeReader(int columnId,
- TypeDescription fileType) throws IOException {
- switch (fileType.getCategory()) {
- case STRING:
- return new StringTreeReader(columnId);
- case CHAR:
- return new CharTreeReader(columnId, fileType.getMaxLength());
- case VARCHAR:
- return new VarcharTreeReader(columnId, fileType.getMaxLength());
- default:
- throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
- }
- }
-
- protected Writable getStringGroupWritable(TypeDescription fileType)
- throws IOException {
- switch (fileType.getCategory()) {
- case STRING:
- return new Text();
- case CHAR:
- return new HiveCharWritable();
- case VARCHAR:
- return new HiveVarcharWritable();
- default:
- throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
- }
- }
-
- protected Writable getStringGroupResultFromString(Object previous,
- TypeDescription readerType, String string) {
- switch (readerType.getCategory()) {
- case STRING:
- {
- Text textResult;
- if (previous == null) {
- textResult = new Text();
- } else {
- textResult = (Text) previous;
- }
- textResult.set(string);
- return textResult;
- }
- case CHAR:
- {
- HiveCharWritable hiveCharResult;
- if (previous == null) {
- hiveCharResult = new HiveCharWritable();
- } else {
- hiveCharResult = (HiveCharWritable) previous;
- }
- hiveCharResult.set(string, readerType.getMaxLength());
- return hiveCharResult;
- }
- case VARCHAR:
- {
- HiveVarcharWritable hiveVarcharResult;
- if (previous == null) {
- hiveVarcharResult = new HiveVarcharWritable();
- } else {
- hiveVarcharResult = (HiveVarcharWritable) previous;
- }
- hiveVarcharResult.set(string, readerType.getMaxLength());
- return hiveVarcharResult;
- }
- default:
- throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
- }
- }
-
- protected void assignStringGroupVectorEntry(BytesColumnVector bytesColVector,
- int elementNum, TypeDescription readerType, byte[] bytes) {
- assignStringGroupVectorEntry(bytesColVector,
- elementNum, readerType, bytes, 0, bytes.length);
- }
-
- /*
- * Assign a BytesColumnVector entry when we have a byte array, start, and
- * length for a string group type (STRING, CHAR, or VARCHAR).
- */
- protected void assignStringGroupVectorEntry(BytesColumnVector bytesColVector,
- int elementNum, TypeDescription readerType, byte[] bytes, int start, int length) {
- switch (readerType.getCategory()) {
- case STRING:
- bytesColVector.setVal(elementNum, bytes, start, length);
- break;
- case CHAR:
- {
- int adjustedDownLen =
- StringExpr.rightTrimAndTruncate(bytes, start, length, readerType.getMaxLength());
- bytesColVector.setVal(elementNum, bytes, start, adjustedDownLen);
- }
- break;
- case VARCHAR:
- {
- int adjustedDownLen =
- StringExpr.truncate(bytes, start, length, readerType.getMaxLength());
- bytesColVector.setVal(elementNum, bytes, start, adjustedDownLen);
- }
- break;
- default:
- throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
- }
- }
-
- protected void convertStringGroupVectorElement(BytesColumnVector bytesColVector,
- int elementNum, TypeDescription readerType) {
- switch (readerType.getCategory()) {
- case STRING:
- // No conversion needed.
- break;
- case CHAR:
- {
- int length = bytesColVector.length[elementNum];
- int adjustedDownLen = StringExpr
- .rightTrimAndTruncate(bytesColVector.vector[elementNum],
- bytesColVector.start[elementNum], length,
- readerType.getMaxLength());
- if (adjustedDownLen < length) {
- bytesColVector.length[elementNum] = adjustedDownLen;
- }
- }
- break;
- case VARCHAR:
- {
- int length = bytesColVector.length[elementNum];
- int adjustedDownLen = StringExpr
- .truncate(bytesColVector.vector[elementNum],
- bytesColVector.start[elementNum], length,
- readerType.getMaxLength());
- if (adjustedDownLen < length) {
- bytesColVector.length[elementNum] = adjustedDownLen;
- }
- }
- break;
- default:
- throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
- }
- }
-
- private boolean isParseError;
-
- /*
- * We do this because we want the various parse methods to return a primitive.
- *
- * @return true if there was a parse error in the last call to
- * parseLongFromString, etc.
- */
- protected boolean getIsParseError() {
- return isParseError;
- }
-
- protected long parseLongFromString(String string) {
- try {
- long longValue = Long.parseLong(string);
- isParseError = false;
- return longValue;
- } catch (NumberFormatException e) {
- isParseError = true;
- return 0;
- }
- }
-
- protected float parseFloatFromString(String string) {
- try {
- float floatValue = Float.parseFloat(string);
- isParseError = false;
- return floatValue;
- } catch (NumberFormatException e) {
- isParseError = true;
- return Float.NaN;
- }
- }
-
- protected double parseDoubleFromString(String string) {
- try {
- double value = Double.parseDouble(string);
- isParseError = false;
- return value;
- } catch (NumberFormatException e) {
- isParseError = true;
- return Double.NaN;
- }
- }
-
- /**
- * @param string
- * @return the HiveDecimal parsed, or null if there was a parse error.
- */
- protected HiveDecimal parseDecimalFromString(String string) {
- try {
- HiveDecimal value = HiveDecimal.create(string);
- return value;
- } catch (NumberFormatException e) {
- return null;
- }
- }
-
- /**
- * @param string
- * @return the Timestamp parsed, or null if there was a parse error.
- */
- protected Timestamp parseTimestampFromString(String string) {
- try {
- Timestamp value = Timestamp.valueOf(string);
- return value;
- } catch (IllegalArgumentException e) {
- return null;
- }
- }
-
- /**
- * @param string
- * @return the Date parsed, or null if there was a parse error.
- */
- protected Date parseDateFromString(String string) {
- try {
- Date value = Date.valueOf(string);
- return value;
- } catch (IllegalArgumentException e) {
- return null;
- }
- }
-
- protected String stringFromStringGroupTreeReader(
- TreeReader stringGroupTreeReader, Writable writable,
- TypeDescription fileType) throws IOException {
- switch (fileType.getCategory()) {
- case STRING:
- {
- Text readTextResult =
- (Text) ((StringTreeReader) stringGroupTreeReader).next(writable);
- if (readTextResult == null) {
- return null;
- }
- return readTextResult.toString();
- }
- case CHAR:
- {
- HiveCharWritable readHiveCharResult =
- (HiveCharWritable) ((CharTreeReader) stringGroupTreeReader).next(writable);
- if (readHiveCharResult == null) {
- return null;
- }
- return readHiveCharResult.getStrippedValue().toString();
- }
- case VARCHAR:
- {
- HiveVarcharWritable readHiveVarcharResult =
- (HiveVarcharWritable) ((VarcharTreeReader) stringGroupTreeReader).next(writable);
- if (readHiveVarcharResult == null) {
- return null;
- }
- return readHiveVarcharResult.toString();
- }
- default:
- throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
- }
- }
-
- protected String stringFromBytesColumnVectorEntry(
- BytesColumnVector bytesColVector, int elementNum) {
- String string;
-
- string = new String(
- bytesColVector.vector[elementNum],
- bytesColVector.start[elementNum], bytesColVector.length[elementNum],
- StandardCharsets.UTF_8);
-
- return string;
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- // Pass-thru.
- convertTreeReader.checkEncoding(encoding);
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- // Pass-thru.
- convertTreeReader.startStripe(streams, stripeFooter);
- }
-
- @Override
- void seek(PositionProvider[] index) throws IOException {
- // Pass-thru.
- convertTreeReader.seek(index);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- // Pass-thru.
- convertTreeReader.seek(index);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- // Pass-thru.
- convertTreeReader.skipRows(items);
- }
-
- /**
- * Override this to use convertVector.
- * Source and result are member variables in the subclass with the right
- * type.
- * @param elementNum
- * @throws IOException
- */
- public void setConvertVectorElement(int elementNum) throws IOException {
- throw new RuntimeException("Expected this method to be overriden");
- }
-
- // Common code used by the conversion.
- public void convertVector(ColumnVector fromColVector,
- ColumnVector resultColVector, final int batchSize) throws IOException {
-
- resultColVector.reset();
- if (fromColVector.isRepeating) {
- resultColVector.isRepeating = true;
- if (fromColVector.noNulls || !fromColVector.isNull[0]) {
- setConvertVectorElement(0);
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[0] = true;
- }
- } else if (fromColVector.noNulls){
- for (int i = 0; i < batchSize; i++) {
- setConvertVectorElement(i);
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!fromColVector.isNull[i]) {
- setConvertVectorElement(i);
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[i] = true;
- }
- }
- }
- }
-
- public long downCastAnyInteger(long input, TypeDescription readerType) {
- switch (readerType.getCategory()) {
- case BOOLEAN:
- return input == 0 ? 0 : 1;
- case BYTE:
- return (byte) input;
- case SHORT:
- return (short) input;
- case INT:
- return (int) input;
- case LONG:
- return input;
- default:
- throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
- }
- }
-
- protected Writable anyIntegerWritable(long longValue, Object previous,
- TypeDescription readerType) {
- switch (readerType.getCategory()) {
- case BOOLEAN:
- {
- BooleanWritable booleanResult;
- if (previous == null) {
- booleanResult = new BooleanWritable();
- } else {
- booleanResult = (BooleanWritable) previous;
- }
- booleanResult.set(longValue != 0);
- return booleanResult;
- }
- case BYTE:
- {
- ByteWritable byteResult;
- if (previous == null) {
- byteResult = new ByteWritable();
- } else {
- byteResult = (ByteWritable) previous;
- }
- byteResult.set((byte) longValue);
- return byteResult;
- }
- case SHORT:
- {
- ShortWritable shortResult;
- if (previous == null) {
- shortResult = new ShortWritable();
- } else {
- shortResult = (ShortWritable) previous;
- }
- shortResult.set((short) longValue);
- return shortResult;
- }
- case INT:
- {
- IntWritable intResult;
- if (previous == null) {
- intResult = new IntWritable();
- } else {
- intResult = (IntWritable) previous;
- }
- intResult.set((int) longValue);
- return intResult;
- }
- case LONG:
- {
- LongWritable longResult;
- if (previous == null) {
- longResult = new LongWritable();
- } else {
- longResult = (LongWritable) previous;
- }
- longResult.set(longValue);
- return longResult;
- }
- default:
- throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
- }
- }
-
- protected boolean integerDownCastNeeded(TypeDescription fileType, TypeDescription readerType) {
- Integer fileLevel = numericTypes.get(fileType.getCategory());
- Integer schemaLevel = numericTypes.get(readerType.getCategory());
- return (schemaLevel.intValue() < fileLevel.intValue());
- }
- }
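
convertVector above is the template every converting reader reuses: convert only element 0 when the source vector is repeating, take a fast path when there are no nulls, and otherwise convert element by element while propagating the null flags; subclasses supply only the per-element body via setConvertVectorElement. A standalone sketch of the same control flow for a long-to-double conversion (plain arrays instead of ColumnVectors, purely for brevity):

public class ConvertVectorSketch {
  static void convert(long[] from, boolean[] fromIsNull, boolean repeating,
      boolean noNulls, double[] to, boolean[] toIsNull, int batchSize) {
    if (repeating) {
      // A repeating vector has one real value (or one null) in slot 0.
      if (noNulls || !fromIsNull[0]) {
        to[0] = (double) from[0];
      } else {
        toIsNull[0] = true;
      }
    } else if (noNulls) {
      for (int i = 0; i < batchSize; i++) {
        to[i] = (double) from[i];
      }
    } else {
      for (int i = 0; i < batchSize; i++) {
        if (!fromIsNull[i]) {
          to[i] = (double) from[i];
        } else {
          toIsNull[i] = true;
        }
      }
    }
  }

  public static void main(String[] args) {
    long[] in = {1, 2, 3};
    boolean[] inNull = {false, true, false};
    double[] out = new double[3];
    boolean[] outNull = new boolean[3];
    convert(in, inNull, false, false, out, outNull, 3);
    System.out.println(java.util.Arrays.toString(out));     // [1.0, 0.0, 3.0]
    System.out.println(java.util.Arrays.toString(outNull)); // [false, true, false]
  }
}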
-
- public static class AnyIntegerTreeReader extends ConvertTreeReader {
-
- private TypeDescription.Category fileTypeCategory;
- private TreeReader anyIntegerTreeReader;
-
- private long longValue;
-
- AnyIntegerTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.fileTypeCategory = fileType.getCategory();
- switch (fileTypeCategory) {
- case BOOLEAN:
- anyIntegerTreeReader = new BooleanTreeReader(columnId);
- break;
- case BYTE:
- anyIntegerTreeReader = new ByteTreeReader(columnId);
- break;
- case SHORT:
- anyIntegerTreeReader = new ShortTreeReader(columnId);
- break;
- case INT:
- anyIntegerTreeReader = new IntTreeReader(columnId);
- break;
- case LONG:
- anyIntegerTreeReader = new LongTreeReader(columnId, skipCorrupt);
- break;
- default:
- throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
- }
- setConvertTreeReader(anyIntegerTreeReader);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- throw new RuntimeException("Call read() and getLong instead");
- }
-
- protected boolean read() throws IOException {
- anyIntegerTreeReader.readValuePresent();
- if (!anyIntegerTreeReader.valuePresent) {
- return false;
- }
- switch (fileTypeCategory) {
- case BOOLEAN:
- longValue = ((BooleanTreeReader) anyIntegerTreeReader).reader.next();
- break;
- case BYTE:
- longValue = ((ByteTreeReader) anyIntegerTreeReader).reader.next();
- break;
- case SHORT:
- longValue = ((ShortTreeReader) anyIntegerTreeReader).reader.next();
- break;
- case INT:
- longValue = ((IntTreeReader) anyIntegerTreeReader).reader.next();
- break;
- case LONG:
- longValue = ((LongTreeReader) anyIntegerTreeReader).reader.next();
- break;
- default:
- throw new RuntimeException("Unexpected type kind " + fileTypeCategory.name());
- }
- return true;
- }
-
- protected long getLong() throws IOException {
- return longValue;
- }
-
- protected String getString(long longValue) {
- if (fileTypeCategory == TypeDescription.Category.BOOLEAN) {
- return longValue == 0 ? "FALSE" : "TRUE";
- } else {
- return Long.toString(longValue);
- }
- }
-
- protected String getString() {
- return getString(longValue);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- anyIntegerTreeReader.nextVector(previousVector, isNull, batchSize);
- }
- }
-
- public static class AnyIntegerFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private final TypeDescription readerType;
- private final boolean downCastNeeded;
-
- AnyIntegerFromAnyIntegerTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType, boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- anyIntegerAsLongTreeReader = new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- downCastNeeded = integerDownCastNeeded(fileType, readerType);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- Writable result = null;
- if (anyIntegerAsLongTreeReader.read()) {
- long longValue = anyIntegerAsLongTreeReader.getLong();
- result = anyIntegerWritable(longValue, previous, readerType);
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- anyIntegerAsLongTreeReader.nextVector(previousVector, isNull, batchSize);
- LongColumnVector resultColVector = (LongColumnVector) previousVector;
- if (downCastNeeded) {
- long[] resultVector = resultColVector.vector;
- if (resultColVector.isRepeating) {
- if (resultColVector.noNulls || !resultColVector.isNull[0]) {
- resultVector[0] = downCastAnyInteger(resultVector[0], readerType);
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[0] = true;
- }
- } else if (resultColVector.noNulls){
- for (int i = 0; i < batchSize; i++) {
- resultVector[i] = downCastAnyInteger(resultVector[i], readerType);
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!resultColVector.isNull[i]) {
- resultVector[i] = downCastAnyInteger(resultVector[i], readerType);
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[i] = true;
- }
- }
- }
- }
- }
- }
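
Worth keeping in mind when reading downCastAnyInteger: Java's narrowing casts keep only the low-order bits, so an out-of-range value wraps rather than saturating or raising an error. A quick check:

public class DownCastSketch {
  public static void main(String[] args) {
    // 300 = 0x12C; the low byte 0x2C is 44.
    System.out.println((byte) 300L);    // 44
    // 70000 keeps its low 16 bits: 70000 - 65536 = 4464.
    System.out.println((short) 70000L); // 4464
    // The BOOLEAN case maps any nonzero value to 1.
    System.out.println(300L == 0 ? 0 : 1); // 1
  }
}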
-
- public static class AnyIntegerFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- private final TypeDescription readerType;
- private FloatWritable floatResult;
- private DoubleColumnVector doubleColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromFloatTreeReader(int columnId, TypeDescription readerType)
- throws IOException {
- super(columnId);
- this.readerType = readerType;
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- floatResult = new FloatWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- FloatWritable readfloatResult =
- (FloatWritable) floatTreeReader.next(floatResult);
-
- Writable result = null;
- if (readfloatResult != null) {
- long longValue = (long) readfloatResult.get();
- result = anyIntegerWritable(longValue, previous, readerType);
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- float floatValue = (float) doubleColVector.vector[elementNum];
- longColVector.vector[elementNum] =
- downCastAnyInteger(
- (long) floatValue, readerType);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, longColVector, batchSize);
- }
- }
-
- public static class AnyIntegerFromDoubleTreeReader extends ConvertTreeReader {
-
- private DoubleTreeReader doubleTreeReader;
-
- private final TypeDescription readerType;
- private DoubleWritable doubleResult;
- private DoubleColumnVector doubleColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromDoubleTreeReader(int columnId, TypeDescription readerType)
- throws IOException {
- super(columnId);
- this.readerType = readerType;
- doubleTreeReader = new DoubleTreeReader(columnId);
- setConvertTreeReader(doubleTreeReader);
- doubleResult = new DoubleWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- DoubleWritable readDoubleResult =
- (DoubleWritable) doubleTreeReader.next(doubleResult);
-
- Writable result = null;
- if (readDoubleResult != null) {
- long longValue = (long) readDoubleResult.get();
- result = anyIntegerWritable(longValue, previous, readerType);
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- longColVector.vector[elementNum] =
- downCastAnyInteger(
- (long) doubleColVector.vector[elementNum], readerType);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, longColVector, batchSize);
- }
- }
-
- public static class AnyIntegerFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private final int precision;
- private final int scale;
- private final TypeDescription readerType;
- private HiveDecimalWritable hiveDecimalResult;
- private DecimalColumnVector decimalColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromDecimalTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.precision = fileType.getPrecision();
- this.scale = fileType.getScale();
- this.readerType = readerType;
- decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
- setConvertTreeReader(decimalTreeReader);
- hiveDecimalResult = new HiveDecimalWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- HiveDecimalWritable readHiveDecimalResult =
- (HiveDecimalWritable) decimalTreeReader.next(hiveDecimalResult);
-
- Writable result = null;
- if (readHiveDecimalResult != null) {
- long longValue = readHiveDecimalResult.getHiveDecimal().longValue();
- result = anyIntegerWritable(longValue, previous, readerType);
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- longColVector.vector[elementNum] =
- downCastAnyInteger(
- decimalColVector.vector[elementNum].getHiveDecimal().longValue(),
- readerType);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (decimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- decimalColVector = new DecimalColumnVector(precision, scale);
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
-
- convertVector(decimalColVector, longColVector, batchSize);
- }
- }
-
- public static class AnyIntegerFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private final TypeDescription fileType;
- private final TypeDescription readerType;
- private Writable writable;
- private BytesColumnVector bytesColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromStringGroupTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.fileType = fileType;
- this.readerType = readerType;
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- writable = getStringGroupWritable(fileType);
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- String stringValue = stringFromStringGroupTreeReader(
- stringGroupTreeReader, writable, fileType);
-
- Writable result = null;
- if (stringValue != null) {
- long longValue = parseLongFromString(stringValue);
- if (!getIsParseError()) {
- result = anyIntegerWritable(longValue, previous, readerType);
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- long longValue = parseLongFromString(string);
- if (!getIsParseError()) {
- longColVector.vector[elementNum] =
- downCastAnyInteger(longValue, readerType);
- } else {
- longColVector.noNulls = false;
- longColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, longColVector, batchSize);
- }
- }
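
A pattern that recurs in all the string-source converters: a failed parse is recorded by nulling the output slot rather than throwing, so one malformed cell cannot abort the whole batch. A compact sketch of the idea:

public class ParseToNullSketch {
  public static void main(String[] args) {
    String[] cells = {"42", "oops", "7"};
    long[] out = new long[cells.length];
    boolean[] isNull = new boolean[cells.length];
    for (int i = 0; i < cells.length; i++) {
      try {
        out[i] = Long.parseLong(cells[i]);
      } catch (NumberFormatException e) {
        isNull[i] = true;  // a bad cell becomes NULL and the batch keeps going
      }
    }
    System.out.println(java.util.Arrays.toString(out));    // [42, 0, 7]
    System.out.println(java.util.Arrays.toString(isNull)); // [false, true, false]
  }
}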
-
- public static class AnyIntegerFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private final TypeDescription readerType;
- private TimestampWritable timestampResult;
- private TimestampColumnVector timestampColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromTimestampTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- timestampResult = new TimestampWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- TimestampWritable readHiveTimestampResult =
- (TimestampWritable) timestampTreeReader.next(timestampResult);
-
- Writable result = null;
- if (readHiveTimestampResult != null) {
- // Use TimestampWritable's getSeconds.
- long longValue = readHiveTimestampResult.getSeconds();
- result = anyIntegerWritable(longValue, previous, readerType);
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- timestampResult.set(timestampColVector.asScratchTimestamp(elementNum));
- // Use TimestampWritable's getSeconds.
- long longValue = timestampResult.getSeconds();
- longColVector.vector[elementNum] =
- downCastAnyInteger(longValue, readerType);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, longColVector, batchSize);
- }
- }
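
The timestamp-to-integer path keeps whole seconds via getSeconds. A subtlety a small sketch makes visible: plain millis / 1000 truncates toward zero, which is off by one second for pre-epoch instants, so a milliseconds-to-seconds conversion wants floor semantics (illustrative arithmetic only, not the Hive helper itself):

public class MillisToSecondsSketch {
  public static void main(String[] args) {
    for (long millis : new long[] {1500L, -1500L}) {
      System.out.println(millis + " ms: truncate=" + (millis / 1000)
          + ", floor=" + Math.floorDiv(millis, 1000));
    }
    // 1500 ms:  truncate=1,  floor=1
    // -1500 ms: truncate=-1, floor=-2  (1.5s before the epoch is in second -2)
  }
}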
-
- public static class FloatFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private LongColumnVector longColVector;
- private DoubleColumnVector doubleColVector;
-
- FloatFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- FloatWritable result = null;
- if (anyIntegerAsLongTreeReader.read()) {
- long longValue = anyIntegerAsLongTreeReader.getLong();
- float floatValue = (float) longValue;
- if (!Float.isNaN(floatValue)){
- if (previous == null) {
- result = new FloatWritable();
- } else {
- result = (FloatWritable) previous;
- }
- result.set(floatValue);
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- float floatValue = (float) longColVector.vector[elementNum];
- if (!Float.isNaN(floatValue)) {
- doubleColVector.vector[elementNum] = floatValue;
- } else {
- doubleColVector.vector[elementNum] = Double.NaN;
- doubleColVector.noNulls = false;
- doubleColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, doubleColVector, batchSize);
- }
- }
-
- public static class FloatFromDoubleTreeReader extends ConvertTreeReader {
-
- private DoubleTreeReader doubleTreeReader;
-
- private DoubleWritable doubleResult;
-
- FloatFromDoubleTreeReader(int columnId) throws IOException {
- super(columnId);
- doubleTreeReader = new DoubleTreeReader(columnId);
- setConvertTreeReader(doubleTreeReader);
- doubleResult = new DoubleWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- DoubleWritable readDoubleResult =
- (DoubleWritable) doubleTreeReader.next(doubleResult);
-
- FloatWritable result = null;
- if (readDoubleResult != null) {
- if (previous == null) {
- result = new FloatWritable();
- } else {
- result = (FloatWritable) previous;
- }
- result.set((float) readDoubleResult.get());
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- doubleTreeReader.nextVector(previousVector, isNull, batchSize);
-
- DoubleColumnVector resultColVector = (DoubleColumnVector) previousVector;
- double[] resultVector = resultColVector.vector;
- if (resultColVector.isRepeating) {
- if (resultColVector.noNulls || !resultColVector.isNull[0]) {
- resultVector[0] = (float) resultVector[0];
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[0] = true;
- }
- } else if (resultColVector.noNulls){
- for (int i = 0; i < batchSize; i++) {
- resultVector[i] = (float) resultVector[i];
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!resultColVector.isNull[i]) {
- resultVector[i] = (float) resultVector[i];
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[i] = true;
- }
- }
- }
- }
- }
-
- public static class FloatFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private final int precision;
- private final int scale;
- private final TypeDescription readerType;
- private HiveDecimalWritable hiveDecimalResult;
- private DecimalColumnVector decimalColVector;
- private DoubleColumnVector doubleColVector;
-
- FloatFromDecimalTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.precision = fileType.getPrecision();
- this.scale = fileType.getScale();
- this.readerType = readerType;
- decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
- setConvertTreeReader(decimalTreeReader);
- hiveDecimalResult = new HiveDecimalWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- HiveDecimalWritable readHiveDecimalResult =
- (HiveDecimalWritable) decimalTreeReader.next(hiveDecimalResult);
-
- FloatWritable result = null;
- if (readHiveDecimalResult != null) {
- double doubleValue = readHiveDecimalResult.getHiveDecimal().doubleValue();
- if (previous == null) {
- result = new FloatWritable();
- } else {
- result = (FloatWritable) previous;
- }
- result.set((float) doubleValue);
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- doubleColVector.vector[elementNum] =
- (float) decimalColVector.vector[elementNum].getHiveDecimal().doubleValue();
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (decimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- decimalColVector = new DecimalColumnVector(precision, scale);
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
-
- convertVector(decimalColVector, doubleColVector, batchSize);
- }
- }
-
- public static class FloatFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private final TypeDescription fileType;
- private Writable writable;
- private BytesColumnVector bytesColVector;
- private DoubleColumnVector doubleColVector;
-
- FloatFromStringGroupTreeReader(int columnId, TypeDescription fileType)
- throws IOException {
- super(columnId);
- this.fileType = fileType;
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- writable = getStringGroupWritable(fileType);
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- String stringValue = stringFromStringGroupTreeReader(
- stringGroupTreeReader, writable, fileType);
-
- FloatWritable result = null;
- if (stringValue != null) {
- float floatValue = parseFloatFromString(stringValue);
- if (!getIsParseError()) {
- if (previous == null) {
- result = new FloatWritable();
- } else {
- result = (FloatWritable) previous;
- }
- result.set(floatValue);
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- float floatValue = parseFloatFromString(string);
- if (!getIsParseError()) {
- doubleColVector.vector[elementNum] = floatValue;
- } else {
- doubleColVector.vector[elementNum] = Double.NaN;
- doubleColVector.noNulls = false;
- doubleColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, doubleColVector, batchSize);
- }
- }
-
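The string-to-float path never throws on malformed input: parseFloatFromString records a side-channel error flag, and the caller turns the element into a NULL (the code above also parks a NaN in the double vector). A hedged sketch of such a helper, mirroring what the names suggest rather than the actual ConvertTreeReader implementation:

    private boolean isParseError;

    // Non-throwing parse: flag the failure instead of aborting the batch.
    float parseFloatFromString(String s) {
      try {
        isParseError = false;
        return Float.parseFloat(s);
      } catch (NumberFormatException e) {
        isParseError = true;   // caller nulls out this element
        return Float.NaN;
      }
    }

    boolean getIsParseError() {
      return isParseError;
    }
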
- public static class FloatFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private final TypeDescription readerType;
- private TimestampWritable timestampResult;
- private TimestampColumnVector timestampColVector;
- private DoubleColumnVector doubleColVector;
-
- FloatFromTimestampTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- timestampResult = new TimestampWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- TimestampWritable readTimestampResult =
- (TimestampWritable) timestampTreeReader.next(timestampResult);
-
- FloatWritable result = null;
- if (readTimestampResult != null) {
- double doubleValue = readTimestampResult.getDouble();
- if (previous == null) {
- result = new FloatWritable();
- } else {
- result = (FloatWritable) previous;
- }
- result.set((float) doubleValue);
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- timestampResult.set(timestampColVector.asScratchTimestamp(elementNum));
- doubleColVector.vector[elementNum] = (float) timestampResult.getDouble();
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, doubleColVector, batchSize);
- }
- }
-
- public static class DoubleFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private LongColumnVector longColVector;
- private DoubleColumnVector doubleColVector;
-
- DoubleFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- DoubleWritable result = null;
- if (anyIntegerAsLongTreeReader.read()) {
- long longValue = anyIntegerAsLongTreeReader.getLong();
- double doubleValue = (double) longValue;
- if (!Double.isNaN(doubleValue)) {
- if (previous == null) {
- result = new DoubleWritable();
- } else {
- result = (DoubleWritable) previous;
- }
- result.set(doubleValue);
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
-
- double doubleValue = (double) longColVector.vector[elementNum];
- if (!Double.isNaN(doubleValue)) {
- doubleColVector.vector[elementNum] = doubleValue;
- } else {
- doubleColVector.vector[elementNum] = Double.NaN;
- doubleColVector.noNulls = false;
- doubleColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, doubleColVector, batchSize);
- }
- }
-
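One quirk worth noting: the isNaN guard above can never fire, because widening a long to double cannot produce NaN. What the widening can do is lose precision once the magnitude exceeds 2^53, as this small standalone check shows:

    public class LongToDoubleDemo {
      public static void main(String[] args) {
        long big = (1L << 53) + 1;                       // 9007199254740993
        System.out.println((double) big);                // 9.007199254740992E15
        System.out.println(Double.isNaN((double) big));  // false, always
      }
    }
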
- public static class DoubleFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- private FloatWritable floatResult;
-
- DoubleFromFloatTreeReader(int columnId) throws IOException {
- super(columnId);
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- floatResult = new FloatWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- FloatWritable readFloatResult =
- (FloatWritable) floatTreeReader.next(floatResult);
-
- DoubleWritable result = null;
- if (readFloatResult != null) {
- if (previous == null) {
- result = new DoubleWritable();
- } else {
- result = (DoubleWritable) previous;
- }
- result.set(readFloatResult.get());
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- // The DoubleColumnVector produced by FloatTreeReader is what we want.
- floatTreeReader.nextVector(previousVector, isNull, batchSize);
- }
- }
-
- public static class DoubleFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private final int precision;
- private final int scale;
- private final TypeDescription readerType;
- private HiveDecimalWritable hiveDecimalResult;
- private DecimalColumnVector decimalColVector;
- private DoubleColumnVector doubleColVector;
-
- DoubleFromDecimalTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.precision = fileType.getPrecision();
- this.scale = fileType.getScale();
- this.readerType = readerType;
- decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
- setConvertTreeReader(decimalTreeReader);
- hiveDecimalResult = new HiveDecimalWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- HiveDecimalWritable readHiveDecimalResult =
- (HiveDecimalWritable) decimalTreeReader.next(hiveDecimalResult);
-
- DoubleWritable result = null;
- if (readHiveDecimalResult != null) {
- double doubleValue = readHiveDecimalResult.getHiveDecimal().doubleValue();
- if (previous == null) {
- result = new DoubleWritable();
- } else {
- result = (DoubleWritable) previous;
- }
- result.set(doubleValue);
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- doubleColVector.vector[elementNum] =
- decimalColVector.vector[elementNum].getHiveDecimal().doubleValue();
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (decimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- decimalColVector = new DecimalColumnVector(precision, scale);
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
-
- convertVector(decimalColVector, doubleColVector, batchSize);
- }
- }
-
- public static class DoubleFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private final TypeDescription fileType;
- private Writable writable;
- private BytesColumnVector bytesColVector;
- private DoubleColumnVector doubleColVector;
-
- DoubleFromStringGroupTreeReader(int columnId, TypeDescription fileType)
- throws IOException {
- super(columnId);
- this.fileType = fileType;
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- writable = getStringGroupWritable(fileType);
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- String stringValue = stringFromStringGroupTreeReader(
- stringGroupTreeReader, writable, fileType);
-
- DoubleWritable result = null;
- if (stringValue != null) {
- double doubleValue = parseDoubleFromString(stringValue);
- if (!getIsParseError()) {
- if (previous == null) {
- result = new DoubleWritable();
- } else {
- result = (DoubleWritable) previous;
- }
- result.set(doubleValue);
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- double doubleValue = parseDoubleFromString(string);
- if (!getIsParseError()) {
- doubleColVector.vector[elementNum] = doubleValue;
- } else {
- doubleColVector.noNulls = false;
- doubleColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, doubleColVector, batchSize);
- }
- }
-
- public static class DoubleFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private final TypeDescription readerType;
- private TimestampWritable timestampResult;
- private TimestampColumnVector timestampColVector;
- private DoubleColumnVector doubleColVector;
-
- DoubleFromTimestampTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- timestampResult = new TimestampWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- TimestampWritable readTimestampResult =
- (TimestampWritable) timestampTreeReader.next(timestampResult);
-
- DoubleWritable result = null;
- if (readTimestampResult != null) {
- double doubleValue = readTimestampResult.getDouble();
- if (previous == null) {
- result = new DoubleWritable();
- } else {
- result = (DoubleWritable) previous;
- }
- result.set(doubleValue);
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- timestampResult.set(timestampColVector.asScratchTimestamp(elementNum));
- doubleColVector.vector[elementNum] = timestampResult.getDouble();
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, doubleColVector, batchSize);
- }
- }
-
- public static class DecimalFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private int precision;
- private int scale;
- private LongColumnVector longColVector;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType, boolean skipCorrupt) throws IOException {
- super(columnId);
- this.precision = readerType.getPrecision();
- this.scale = readerType.getScale();
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- HiveDecimalWritable result = null;
- if (anyIntegerAsLongTreeReader.read()) {
- long longValue = anyIntegerAsLongTreeReader.getLong();
- result = new HiveDecimalWritable(longValue);
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- long longValue = longColVector.vector[elementNum];
- HiveDecimalWritable hiveDecimalWritable =
- new HiveDecimalWritable(longValue);
- decimalColVector.set(elementNum, hiveDecimalWritable);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, decimalColVector, batchSize);
- }
- }
-
- public static class DecimalFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- private int precision;
- private int scale;
- private FloatWritable floatResult;
- private DoubleColumnVector doubleColVector;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromFloatTreeReader(int columnId, TypeDescription readerType)
- throws IOException {
- super(columnId);
- this.precision = readerType.getPrecision();
- this.scale = readerType.getScale();
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- floatResult = new FloatWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- FloatWritable readFloatResult =
- (FloatWritable) floatTreeReader.next(floatResult);
-
- HiveDecimalWritable result = null;
- if (readFloatResult != null) {
- HiveDecimal value =
- HiveDecimal.create(Float.toString(readFloatResult.get()));
- if (value != null) {
- if (previous == null) {
- result = new HiveDecimalWritable();
- } else {
- result = (HiveDecimalWritable) previous;
- }
- result.set(value);
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- float floatValue = (float) doubleColVector.vector[elementNum];
- if (!Float.isNaN(floatValue)) {
- HiveDecimal value =
- HiveDecimal.create(Float.toString(floatValue));
- if (value != null) {
- decimalColVector.set(elementNum, value);
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, decimalColVector, batchSize);
- }
- }
-
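DecimalFromFloatTreeReader converts through Float.toString rather than the raw binary value, because the shortest round-trip string yields the decimal a human expects; constructing the decimal from the binary float drags in the full expansion. A quick illustration with plain BigDecimal:

    import java.math.BigDecimal;

    public class FloatToDecimalDemo {
      public static void main(String[] args) {
        float f = 0.1f;
        System.out.println(new BigDecimal(Float.toString(f))); // 0.1
        // The exact binary value of 0.1f:
        System.out.println(new BigDecimal(f)); // 0.100000001490116119384765625
      }
    }
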
- public static class DecimalFromDoubleTreeReader extends ConvertTreeReader {
-
- private DoubleTreeReader doubleTreeReader;
-
- private int precision;
- private int scale;
- private DoubleWritable doubleResult;
- private DoubleColumnVector doubleColVector;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromDoubleTreeReader(int columnId, TypeDescription readerType)
- throws IOException {
- super(columnId);
- this.precision = readerType.getPrecision();
- this.scale = readerType.getScale();
- doubleTreeReader = new DoubleTreeReader(columnId);
- setConvertTreeReader(doubleTreeReader);
- doubleResult = new DoubleWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- DoubleWritable readDoubleResult =
- (DoubleWritable) doubleTreeReader.next(doubleResult);
-
- HiveDecimalWritable result = null;
- if (readDoubleResult != null) {
- HiveDecimal value =
- HiveDecimal.create(Double.toString(readDoubleResult.get()));
- if (value != null) {
- if (previous == null) {
- result = new HiveDecimalWritable();
- } else {
- result = (HiveDecimalWritable) previous;
- }
- result.set(value);
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- HiveDecimal value =
- HiveDecimal.create(Double.toString(doubleColVector.vector[elementNum]));
- if (value != null) {
- decimalColVector.set(elementNum, value);
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, decimalColVector, batchSize);
- }
- }
-
- public static class DecimalFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private final TypeDescription fileType;
- private Writable writable;
- private BytesColumnVector bytesColVector;
- private int precision;
- private int scale;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromStringGroupTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.fileType = fileType;
- this.precision = readerType.getPrecision();
- this.scale = readerType.getScale();
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- writable = getStringGroupWritable(fileType);
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- String stringValue = stringFromStringGroupTreeReader(
- stringGroupTreeReader, writable, fileType);
-
- HiveDecimalWritable result = null;
- if (stringValue != null) {
- HiveDecimal value = parseDecimalFromString(stringValue);
- if (value != null) {
- if (previous == null) {
- result = new HiveDecimalWritable();
- } else {
- result = (HiveDecimalWritable) previous;
- }
- result.set(value, precision, scale);
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- HiveDecimal value = parseDecimalFromString(string);
- if (value != null) {
- decimalColVector.set(elementNum, value);
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, decimalColVector, batchSize);
- }
- }
-
- public static class DecimalFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private final TypeDescription readerType;
- private TimestampWritable timestampResult;
- private TimestampColumnVector timestampColVector;
- private int precision;
- private int scale;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromTimestampTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- this.precision = readerType.getPrecision();
- this.scale = readerType.getScale();
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- timestampResult = new TimestampWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- TimestampWritable readTimestampResult =
- (TimestampWritable) timestampTreeReader.next(timestampResult);
-
- HiveDecimalWritable result = null;
- if (readTimestampResult != null) {
- double doubleValue = readTimestampResult.getDouble();
- HiveDecimal value = HiveDecimal.create(Double.toString(doubleValue));
- if (value != null) {
- if (previous == null) {
- result = new HiveDecimalWritable();
- } else {
- result = (HiveDecimalWritable) previous;
- }
- result.set(value, precision, scale);
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- timestampResult.set(timestampColVector.asScratchTimestamp(elementNum));
- double doubleValue = timestampResult.getDouble();
- HiveDecimal value = HiveDecimal.create(Double.toString(doubleValue));
- if (value != null) {
- decimalColVector.set(elementNum, value);
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, decimalColVector, batchSize);
- }
- }
-
- public static class StringGroupFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private final TypeDescription fileType;
- private final TypeDescription readerType;
- private LongColumnVector longColVector;
- private BytesColumnVector bytesColVector;
-
- StringGroupFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType, boolean skipCorrupt) throws IOException {
- super(columnId);
- this.fileType = fileType;
- this.readerType = readerType;
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- Writable result = null;
- if (anyIntegerAsLongTreeReader.read()) {
- result = getStringGroupResultFromString(
- previous, readerType, anyIntegerAsLongTreeReader.getString());
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- long longValue = longColVector.vector[elementNum];
- String string = anyIntegerAsLongTreeReader.getString(longValue);
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- private final TypeDescription readerType;
- private FloatWritable floatResult;
- private DoubleColumnVector doubleColVector;
- private BytesColumnVector bytesColVector;
-
- StringGroupFromFloatTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- floatResult = new FloatWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- FloatWritable readFloatResult =
- (FloatWritable) floatTreeReader.next(floatResult);
-
- Writable result = null;
- if (readFloatResult != null) {
- float floatValue = readFloatResult.get();
- if (!Float.isNaN(floatValue)) {
- result = getStringGroupResultFromString(
- previous, readerType, String.valueOf(floatValue));
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- float floatValue = (float) doubleColVector.vector[elementNum];
- if (!Float.isNaN(floatValue)) {
- String string = String.valueOf(floatValue);
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- } else {
- bytesColVector.noNulls = false;
- bytesColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromDoubleTreeReader extends ConvertTreeReader {
-
- private DoubleTreeReader doubleTreeReader;
-
- private final TypeDescription readerType;
- private DoubleWritable doubleResult;
- private DoubleColumnVector doubleColVector;
- private BytesColumnVector bytesColVector;
-
- StringGroupFromDoubleTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- doubleTreeReader = new DoubleTreeReader(columnId);
- setConvertTreeReader(doubleTreeReader);
- doubleResult = new DoubleWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- DoubleWritable readDoubleResult =
- (DoubleWritable) doubleTreeReader.next(doubleResult);
-
- Writable result = null;
- if (readDoubleResult != null) {
- double doubleValue = readDoubleResult.get();
- if (!Double.isNaN(doubleValue)) {
- result = getStringGroupResultFromString(
- previous, readerType, String.valueOf(doubleValue));
- }
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- double doubleValue = doubleColVector.vector[elementNum];
- if (!Double.isNaN(doubleValue)) {
- String string = String.valueOf(doubleValue);
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- } else {
- bytesColVector.noNulls = false;
- bytesColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private int precision;
- private int scale;
- private final TypeDescription readerType;
- private HiveDecimalWritable hiveDecimalResult;
- private DecimalColumnVector decimalColVector;
- private BytesColumnVector bytesColVector;
-
- StringGroupFromDecimalTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType, boolean skipCorrupt) throws IOException {
- super(columnId);
- this.precision = fileType.getPrecision();
- this.scale = fileType.getScale();
- this.readerType = readerType;
- decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
- setConvertTreeReader(decimalTreeReader);
- hiveDecimalResult = new HiveDecimalWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- HiveDecimalWritable readHiveDecimalResult =
- (HiveDecimalWritable) decimalTreeReader.next(hiveDecimalResult);
-
- Writable result = null;
- if (readHiveDecimalResult != null) {
- result = getStringGroupResultFromString(
- previous, readerType, readHiveDecimalResult.getHiveDecimal().toString());
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- String string = decimalColVector.vector[elementNum].getHiveDecimal().toString();
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (decimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- decimalColVector = new DecimalColumnVector(precision, scale);
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
-
- convertVector(decimalColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private final TypeDescription readerType;
- private TimestampWritable timestampWritableResult;
- private TimestampColumnVector timestampColVector;
- private BytesColumnVector bytesColVector;
-
- StringGroupFromTimestampTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- timestampWritableResult = new TimestampWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- TimestampWritable readTimestampWritableResult =
- (TimestampWritable) timestampTreeReader.next(timestampWritableResult);
-
- Writable result = null;
- if (readTimestampWritableResult != null) {
- result = getStringGroupResultFromString(
- previous, readerType, readTimestampWritableResult.toString());
- }
-
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string =
- timestampColVector.asScratchTimestamp(elementNum).toString();
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromDateTreeReader extends ConvertTreeReader {
-
- private DateTreeReader dateTreeReader;
-
- private final TypeDescription readerType;
- private LongColumnVector longColVector;
- private BytesColumnVector bytesColVector;
- private DateWritable dateWritableResult;
- private Date date;
-
- StringGroupFromDateTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- dateTreeReader = new DateTreeReader(columnId);
- setConvertTreeReader(dateTreeReader);
- dateWritableResult = new DateWritable();
- date = new Date(0);
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- DateWritable readDateWritableResult =
- (DateWritable) dateTreeReader.next(dateWritableResult);
-
- Writable result = null;
- if (readDateWritableResult != null) {
- result = getStringGroupResultFromString(
- previous, readerType, readDateWritableResult.toString());
- }
-
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- date.setTime(DateWritable.daysToMillis((int) longColVector.vector[elementNum]));
- String string = date.toString();
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- dateTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private final TypeDescription fileType;
- private final TypeDescription readerType;
- private Writable writable;
-
- StringGroupFromStringGroupTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.fileType = fileType;
- this.readerType = readerType;
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- writable = getStringGroupWritable(fileType);
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- String stringValue = stringFromStringGroupTreeReader(
- stringGroupTreeReader, writable, fileType);
-
- Writable result = null;
- if (stringValue != null) {
- result = getStringGroupResultFromString(
- previous, readerType, stringValue);
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- stringGroupTreeReader.nextVector(previousVector, isNull, batchSize);
-
- BytesColumnVector resultColVector = (BytesColumnVector) previousVector;
-
- if (resultColVector.isRepeating) {
- if (resultColVector.noNulls || !resultColVector.isNull[0]) {
- convertStringGroupVectorElement(resultColVector, 0, readerType);
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[0] = true;
- }
- } else if (resultColVector.noNulls) {
- for (int i = 0; i < batchSize; i++) {
- convertStringGroupVectorElement(resultColVector, i, readerType);
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!resultColVector.isNull[i]) {
- convertStringGroupVectorElement(resultColVector, i, readerType);
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[i] = true;
- }
- }
- }
- }
- }
-
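Because the source and target of a string-group-to-string-group conversion share the same BytesColumnVector, no scratch vector is needed; convertStringGroupVectorElement only has to enforce the target CHAR/VARCHAR bound in place. A hedged sketch of byte-level truncation (the real code must cut on character boundaries, since a UTF-8 character can span several bytes):

    // Sketch: shrink the visible length without copying the backing bytes.
    void enforceMaxLength(BytesColumnVector v, int i, int maxLength) {
      if (v.length[i] > maxLength) {
        v.length[i] = maxLength;
      }
    }
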
- public static class StringGroupFromBinaryTreeReader extends ConvertTreeReader {
-
- private BinaryTreeReader binaryTreeReader;
-
- private final TypeDescription readerType;
- private BytesWritable binaryWritableResult;
- private BytesColumnVector inBytesColVector;
- private BytesColumnVector outBytesColVector;
-
- StringGroupFromBinaryTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- binaryTreeReader = new BinaryTreeReader(columnId);
- setConvertTreeReader(binaryTreeReader);
- binaryWritableResult = new BytesWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- BytesWritable readBytesWritableResult =
- (BytesWritable) binaryTreeReader.next(binaryWritableResult);
-
- Writable result = null;
- if (readBytesWritableResult != null) {
- result = getStringGroupResultFromString(
- previous, readerType, readBytesWritableResult.toString());
- }
-
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- // UNDONE: Binary to StringGroup conversion?
- byte[] bytes = inBytesColVector.vector[elementNum];
- int start = inBytesColVector.start[elementNum];
- int length = inBytesColVector.length[elementNum];
- assignStringGroupVectorEntry(outBytesColVector, elementNum, readerType, bytes, start, length);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (inBytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- inBytesColVector = new BytesColumnVector();
- outBytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- binaryTreeReader.nextVector(inBytesColVector, isNull, batchSize);
-
- convertVector(inBytesColVector, outBytesColVector, batchSize);
- }
- }
-
- public static class TimestampFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private LongColumnVector longColVector;
- private TimestampColumnVector timestampColVector;
-
- TimestampFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- Object next(Object previous) throws IOException {
- TimestampWritable result = null;
- if (anyIntegerAsLongTreeReader.read()) {
- long longValue = anyIntegerAsLongTreeReader.getLong();
- if (previous == null) {
- result = new TimestampWritable();
- } else {
- result = (TimestampWritable) previous;
- }
- // UNDONE: What does the boolean setting need to be?
- result.set(TimestampWritable.longToTimestamp(longValue, false));
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- long longValue = longColVector.vector[elementNum];
- // UNDONE: What does the boolean setting need to be?
- timestampColVector.set(elementNum, TimestampWritable.longToTimestamp(longValue, false));
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- timestampColVector = (TimestampColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, timestampColVector, batchSize);
- }
- }
-
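The UNDONE questions above concern longToTimestamp's boolean, which (per TimestampWritable) selects whether the long is read as epoch seconds (true, scaled by 1000) or epoch milliseconds (false); hard-coding false therefore treats integer columns as milliseconds. Assuming that reading of TimestampWritable:

    import java.sql.Timestamp;
    import org.apache.hadoop.hive.serde2.io.TimestampWritable;

    public class LongToTimestampDemo {
      public static void main(String[] args) {
        Timestamp a = TimestampWritable.longToTimestamp(1L, true);  // epoch + 1s
        Timestamp b = TimestampWritable.longToTimestamp(1L, false); // epoch + 1ms
        System.out.println(a.getTime()); // 1000
        System.out.println(b.getTime()); // 1
      }
    }
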
- public static class TimestampFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- private FloatWritable floatResult;
- private DoubleColumnVector doubleColVector;
- private TimestampColumnVector timestampColVector;
-
- TimestampFromFloatTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- floatResult = new FloatWritable();
- }
-
- @Override
- Object next(Object previous) throws IOException {
-
- FloatWritable readFloatResult =
- (FloatWritable) floatTreeReader.next(floatResult);
-
- TimestampWritable result = null;
- if (readFloatResult != null) {
- float floatValue = readFloatResult.get();
- if (previous == null) {
- result = new TimestampWritable();
- } else {
- result = (TimestampWritable) previous;
- }
- result.set(TimestampWritable.doubleToTimestamp(floatValue));
- }
- return result;
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- float floatValue = (float) doubleColVe
<TRUNCATED>
[11/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java
deleted file mode 100644
index 4192588..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderUtils.java
+++ /dev/null
@@ -1,586 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-
-import com.google.common.collect.Lists;
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.io.DiskRange;
-import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
-import org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper;
-import org.apache.hadoop.hive.shims.HadoopShims;
-import org.apache.hadoop.hive.shims.ShimLoader;
-import org.apache.hadoop.hive.shims.HadoopShims.ByteBufferPoolShim;
-import org.apache.hadoop.hive.shims.HadoopShims.ZeroCopyReaderShim;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.impl.BufferChunk;
-import org.apache.orc.CompressionCodec;
-import org.apache.orc.DataReader;
-import org.apache.orc.impl.DataReaderProperties;
-import org.apache.orc.impl.DirectDecompressionCodec;
-import org.apache.orc.OrcProto;
-
-import com.google.common.collect.ComparisonChain;
-import org.apache.orc.impl.InStream;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.impl.OutStream;
-
-/**
- * Stateless methods shared between RecordReaderImpl and EncodedReaderImpl.
- */
-public class RecordReaderUtils {
- private static final HadoopShims SHIMS = ShimLoader.getHadoopShims();
-
- private static class DefaultDataReader implements DataReader {
- private FSDataInputStream file = null;
- private final ByteBufferAllocatorPool pool;
- private ZeroCopyReaderShim zcr = null;
- private final FileSystem fs;
- private final Path path;
- private final boolean useZeroCopy;
- private final CompressionCodec codec;
- private final int bufferSize;
- private final int typeCount;
-
- private DefaultDataReader(DefaultDataReader other) {
- this.pool = other.pool;
- this.zcr = other.zcr;
- this.bufferSize = other.bufferSize;
- this.typeCount = other.typeCount;
- this.fs = other.fs;
- this.path = other.path;
- this.useZeroCopy = other.useZeroCopy;
- this.codec = other.codec;
- }
-
- private DefaultDataReader(DataReaderProperties properties) {
- this.fs = properties.getFileSystem();
- this.path = properties.getPath();
- this.useZeroCopy = properties.getZeroCopy();
- this.codec = WriterImpl.createCodec(properties.getCompression());
- this.bufferSize = properties.getBufferSize();
- this.typeCount = properties.getTypeCount();
- if (useZeroCopy) {
- this.pool = new ByteBufferAllocatorPool();
- } else {
- this.pool = null;
- }
- }
-
- @Override
- public void open() throws IOException {
- this.file = fs.open(path);
- if (useZeroCopy) {
- zcr = RecordReaderUtils.createZeroCopyShim(file, codec, pool);
- } else {
- zcr = null;
- }
- }
-
- @Override
- public OrcIndex readRowIndex(StripeInformation stripe,
- OrcProto.StripeFooter footer,
- boolean[] included,
- OrcProto.RowIndex[] indexes,
- boolean[] sargColumns,
- OrcProto.BloomFilterIndex[] bloomFilterIndices
- ) throws IOException {
- if (file == null) {
- open();
- }
- if (footer == null) {
- footer = readStripeFooter(stripe);
- }
- if (indexes == null) {
- indexes = new OrcProto.RowIndex[typeCount];
- }
- if (bloomFilterIndices == null) {
- bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
- }
- long offset = stripe.getOffset();
- List<OrcProto.Stream> streams = footer.getStreamsList();
- for (int i = 0; i < streams.size(); i++) {
- OrcProto.Stream stream = streams.get(i);
- OrcProto.Stream nextStream = null;
- if (i < streams.size() - 1) {
- nextStream = streams.get(i+1);
- }
- int col = stream.getColumn();
- int len = (int) stream.getLength();
- // The row index and bloom filter streams are interleaved; check whether the sarg
- // column has a bloom filter and, if so, combine the I/O to read the row index and
- // bloom filter for that column together.
- if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
- boolean readBloomFilter = false;
- if (sargColumns != null && sargColumns[col] &&
- nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
- len += nextStream.getLength();
- i += 1;
- readBloomFilter = true;
- }
- if ((included == null || included[col]) && indexes[col] == null) {
- byte[] buffer = new byte[len];
- file.readFully(offset, buffer, 0, buffer.length);
- ByteBuffer bb = ByteBuffer.wrap(buffer);
- indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
- Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), stream.getLength(),
- codec, bufferSize));
- if (readBloomFilter) {
- bb.position((int) stream.getLength());
- bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
- "bloom_filter", Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)),
- nextStream.getLength(), codec, bufferSize));
- }
- }
- }
- offset += len;
- }
-
- OrcIndex index = new OrcIndex(indexes, bloomFilterIndices);
- return index;
- }
-
- @Override
- public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException {
- if (file == null) {
- open();
- }
- long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
- int tailLength = (int) stripe.getFooterLength();
-
- // read the footer
- ByteBuffer tailBuf = ByteBuffer.allocate(tailLength);
- file.readFully(offset, tailBuf.array(), tailBuf.arrayOffset(), tailLength);
- return OrcProto.StripeFooter.parseFrom(InStream.createCodedInputStream("footer",
- Lists.<DiskRange>newArrayList(new BufferChunk(tailBuf, 0)),
- tailLength, codec, bufferSize));
- }
-
- @Override
- public DiskRangeList readFileData(
- DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException {
- return RecordReaderUtils.readDiskRanges(file, zcr, baseOffset, range, doForceDirect);
- }
-
- @Override
- public void close() throws IOException {
- if (file != null) {
- file.close();
- }
- if (pool != null) {
- pool.clear();
- }
- }
-
- @Override
- public boolean isTrackingDiskRanges() {
- return zcr != null;
- }
-
- @Override
- public void releaseBuffer(ByteBuffer buffer) {
- zcr.releaseBuffer(buffer);
- }
-
- @Override
- public DataReader clone() {
- return new DefaultDataReader(this);
- }
-
- }
-
- public static DataReader createDefaultDataReader(DataReaderProperties properties) {
- return new DefaultDataReader(properties);
- }
-
- public static boolean[] findPresentStreamsByColumn(
- List<OrcProto.Stream> streamList, List<OrcProto.Type> types) {
- boolean[] hasNull = new boolean[types.size()];
- for(OrcProto.Stream stream: streamList) {
- if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.PRESENT)) {
- hasNull[stream.getColumn()] = true;
- }
- }
- return hasNull;
- }
-
- /**
- * Does region A overlap region B? The end points are inclusive on both sides.
- * @param leftA A's left point
- * @param rightA A's right point
- * @param leftB B's left point
- * @param rightB B's right point
- * @return Does region A overlap region B?
- */
- static boolean overlap(long leftA, long rightA, long leftB, long rightB) {
- if (leftA <= leftB) {
- return rightA >= leftB;
- }
- return rightB >= leftA;
- }
-
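The predicate is inclusive on both ends, so touching intervals count as overlapping. A few concrete calls:

    // RecordReaderUtils.overlap with concrete endpoints:
    overlap(10, 20, 20, 30);  // true:  the point 20 is shared (inclusive ends)
    overlap(10, 19, 20, 30);  // false: 19 < 20, no shared point
    overlap(25, 40, 20, 30);  // true:  exercises the leftA > leftB branch
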
- public static void addEntireStreamToRanges(
- long offset, long length, CreateHelper list, boolean doMergeBuffers) {
- list.addOrMerge(offset, offset + length, doMergeBuffers, false);
- }
-
- public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
- boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index,
- OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, boolean hasNull,
- long offset, long length, CreateHelper list, boolean doMergeBuffers) {
- for (int group = 0; group < includedRowGroups.length; ++group) {
- if (!includedRowGroups[group]) continue;
- int posn = getIndexPosition(
- encoding.getKind(), type.getKind(), stream.getKind(), isCompressed, hasNull);
- long start = index.getEntry(group).getPositions(posn);
- final long nextGroupOffset;
- boolean isLast = group == (includedRowGroups.length - 1);
- nextGroupOffset = isLast ? length : index.getEntry(group + 1).getPositions(posn);
-
- start += offset;
- long end = offset + estimateRgEndOffset(
- isCompressed, isLast, nextGroupOffset, length, compressionSize);
- list.addOrMerge(start, end, doMergeBuffers, true);
- }
- }
-
- public static long estimateRgEndOffset(boolean isCompressed, boolean isLast,
- long nextGroupOffset, long streamLength, int bufferSize) {
- // figure out the worst case last location
- // if adjacent groups have the same compressed block offset then stretch the slop
- // by a factor of 2 to safely accommodate the next compression block:
- // one for the current compression block and another for the next.
- long slop = isCompressed ? 2 * (OutStream.HEADER_SIZE + bufferSize) : WORST_UNCOMPRESSED_SLOP;
- return isLast ? streamLength : Math.min(streamLength, nextGroupOffset + slop);
- }
-
- private static final int BYTE_STREAM_POSITIONS = 1;
- private static final int RUN_LENGTH_BYTE_POSITIONS = BYTE_STREAM_POSITIONS + 1;
- private static final int BITFIELD_POSITIONS = RUN_LENGTH_BYTE_POSITIONS + 1;
- private static final int RUN_LENGTH_INT_POSITIONS = BYTE_STREAM_POSITIONS + 1;
-
- /**
- * Get the offset within the column's index positions at which the given
- * stream's positions start.
- * @param columnEncoding the encoding of the column
- * @param columnType the type of the column
- * @param streamType the kind of the stream
- * @param isCompressed is the file compressed
- * @param hasNulls does the column have a PRESENT stream?
- * @return the offset of the given stream's entries within the column's
- *   index positions
- */
- public static int getIndexPosition(OrcProto.ColumnEncoding.Kind columnEncoding,
- OrcProto.Type.Kind columnType,
- OrcProto.Stream.Kind streamType,
- boolean isCompressed,
- boolean hasNulls) {
- if (streamType == OrcProto.Stream.Kind.PRESENT) {
- return 0;
- }
- int compressionValue = isCompressed ? 1 : 0;
- int base = hasNulls ? (BITFIELD_POSITIONS + compressionValue) : 0;
- switch (columnType) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- case FLOAT:
- case DOUBLE:
- case DATE:
- case STRUCT:
- case MAP:
- case LIST:
- case UNION:
- return base;
- case CHAR:
- case VARCHAR:
- case STRING:
- if (columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- return base;
- } else {
- if (streamType == OrcProto.Stream.Kind.DATA) {
- return base;
- } else {
- return base + BYTE_STREAM_POSITIONS + compressionValue;
- }
- }
- case BINARY:
- if (streamType == OrcProto.Stream.Kind.DATA) {
- return base;
- }
- return base + BYTE_STREAM_POSITIONS + compressionValue;
- case DECIMAL:
- if (streamType == OrcProto.Stream.Kind.DATA) {
- return base;
- }
- return base + BYTE_STREAM_POSITIONS + compressionValue;
- case TIMESTAMP:
- if (streamType == OrcProto.Stream.Kind.DATA) {
- return base;
- }
- return base + RUN_LENGTH_INT_POSITIONS + compressionValue;
- default:
- throw new IllegalArgumentException("Unknown type " + columnType);
- }
- }
-
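Plugging in the constants defined above (BYTE_STREAM_POSITIONS = 1, so RUN_LENGTH_BYTE_POSITIONS = 2 and BITFIELD_POSITIONS = 3), a few concrete calls for an uncompressed file with a nullable column (OrcProto enum qualifiers abbreviated):

    // Uncompressed (compressionValue = 0), column has a PRESENT stream:
    // PRESENT itself always starts at index position 0.
    getIndexPosition(DIRECT, LONG,   PRESENT, false, true);  // == 0
    // LONG DATA skips PRESENT's BITFIELD_POSITIONS entries: base = 3.
    getIndexPosition(DIRECT, LONG,   DATA,    false, true);  // == 3
    // Direct-encoded STRING LENGTH: base + BYTE_STREAM_POSITIONS = 4.
    getIndexPosition(DIRECT, STRING, LENGTH,  false, true);  // == 4
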
- // for uncompressed streams, what is the most overlap with the following set
- // of rows (long vint literal group).
- static final int WORST_UNCOMPRESSED_SLOP = 2 + 8 * 512;
-
- /**
- * Is this stream part of a dictionary?
- * @param kind the kind of the stream
- * @param encoding the encoding of the column
- * @return whether the stream is dictionary data or the lengths of a
- *   dictionary-encoded column
- */
- public static boolean isDictionary(OrcProto.Stream.Kind kind,
- OrcProto.ColumnEncoding encoding) {
- assert kind != OrcProto.Stream.Kind.DICTIONARY_COUNT;
- OrcProto.ColumnEncoding.Kind encodingKind = encoding.getKind();
- return kind == OrcProto.Stream.Kind.DICTIONARY_DATA ||
- (kind == OrcProto.Stream.Kind.LENGTH &&
- (encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2));
- }
-
- /**
- * Build a string representation of a list of disk ranges.
- * @param range ranges to stringify
- * @return the resulting string
- */
- public static String stringifyDiskRanges(DiskRangeList range) {
- StringBuilder buffer = new StringBuilder();
- buffer.append("[");
- boolean isFirst = true;
- while (range != null) {
- if (!isFirst) {
- buffer.append(", {");
- } else {
- buffer.append("{");
- }
- isFirst = false;
- buffer.append(range.toString());
- buffer.append("}");
- range = range.next;
- }
- buffer.append("]");
- return buffer.toString();
- }
-
- /**
- * Read the list of ranges from the file.
- * @param file the file to read
- * @param base the base of the stripe
- * @param range the disk ranges within the stripe to read
- * @return the bytes read for each disk range, as a list of the same length
- *   as the input range list
- * @throws IOException
- */
- static DiskRangeList readDiskRanges(FSDataInputStream file,
- ZeroCopyReaderShim zcr,
- long base,
- DiskRangeList range,
- boolean doForceDirect) throws IOException {
- if (range == null) return null;
- DiskRangeList prev = range.prev;
- if (prev == null) {
- prev = new MutateHelper(range);
- }
- while (range != null) {
- if (range.hasData()) {
- range = range.next;
- continue;
- }
- int len = (int) (range.getEnd() - range.getOffset());
- long off = range.getOffset();
- if (zcr != null) {
- file.seek(base + off);
- boolean hasReplaced = false;
- while (len > 0) {
- ByteBuffer partial = zcr.readBuffer(len, false);
- BufferChunk bc = new BufferChunk(partial, off);
- if (!hasReplaced) {
- range.replaceSelfWith(bc);
- hasReplaced = true;
- } else {
- range.insertAfter(bc);
- }
- range = bc;
- int read = partial.remaining();
- len -= read;
- off += read;
- }
- } else {
- // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
- byte[] buffer = new byte[len];
- file.readFully((base + off), buffer, 0, buffer.length);
- ByteBuffer bb = null;
- if (doForceDirect) {
- bb = ByteBuffer.allocateDirect(len);
- bb.put(buffer);
- bb.position(0);
- bb.limit(len);
- } else {
- bb = ByteBuffer.wrap(buffer);
- }
- range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
- }
- range = range.next;
- }
- return prev.next;
- }
-
-
- static List<DiskRange> getStreamBuffers(DiskRangeList range, long offset, long length) {
- // This assumes sorted ranges (as do many other parts of the ORC code).
- ArrayList<DiskRange> buffers = new ArrayList<DiskRange>();
- if (length == 0) return buffers;
- long streamEnd = offset + length;
- boolean inRange = false;
- while (range != null) {
- if (!inRange) {
- if (range.getEnd() <= offset) {
- range = range.next;
- continue; // Skip until we are in range.
- }
- inRange = true;
- if (range.getOffset() < offset) {
- // Partial first buffer, add a slice of it.
- buffers.add(range.sliceAndShift(offset, Math.min(streamEnd, range.getEnd()), -offset));
- if (range.getEnd() >= streamEnd) break; // Partial first buffer is also partial last buffer.
- range = range.next;
- continue;
- }
- } else if (range.getOffset() >= streamEnd) {
- break;
- }
- if (range.getEnd() > streamEnd) {
- // Partial last buffer (may also be the first buffer), add a slice of it.
- buffers.add(range.sliceAndShift(range.getOffset(), streamEnd, -offset));
- break;
- }
- // Buffer that belongs entirely to one stream.
- // TODO: ideally we would want to reuse the object and remove it from the list, but we cannot
- // because bufferChunks is also used by clearStreams for zcr. Create a useless dup.
- buffers.add(range.sliceAndShift(range.getOffset(), range.getEnd(), -offset));
- if (range.getEnd() == streamEnd) break;
- range = range.next;
- }
- return buffers;
- }
-
- static ZeroCopyReaderShim createZeroCopyShim(FSDataInputStream file,
- CompressionCodec codec, ByteBufferAllocatorPool pool) throws IOException {
- if ((codec == null || ((codec instanceof DirectDecompressionCodec)
- && ((DirectDecompressionCodec) codec).isAvailable()))) {
- /* codec is null or is available */
- return ShimLoader.getHadoopShims().getZeroCopyReader(file, pool);
- }
- return null;
- }
-
- // this is an implementation copied from ElasticByteBufferPool in hadoop-2,
- // which lacks a clear()/clean() operation
- public final static class ByteBufferAllocatorPool implements ByteBufferPoolShim {
- private static final class Key implements Comparable<Key> {
- private final int capacity;
- private final long insertionGeneration;
-
- Key(int capacity, long insertionGeneration) {
- this.capacity = capacity;
- this.insertionGeneration = insertionGeneration;
- }
-
- @Override
- public int compareTo(Key other) {
- return ComparisonChain.start().compare(capacity, other.capacity)
- .compare(insertionGeneration, other.insertionGeneration).result();
- }
-
- @Override
- public boolean equals(Object rhs) {
- if (rhs == null) {
- return false;
- }
- try {
- Key o = (Key) rhs;
- return (compareTo(o) == 0);
- } catch (ClassCastException e) {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return new HashCodeBuilder().append(capacity).append(insertionGeneration)
- .toHashCode();
- }
- }
-
- private final TreeMap<Key, ByteBuffer> buffers = new TreeMap<Key, ByteBuffer>();
-
- private final TreeMap<Key, ByteBuffer> directBuffers = new TreeMap<Key, ByteBuffer>();
-
- private long currentGeneration = 0;
-
- private final TreeMap<Key, ByteBuffer> getBufferTree(boolean direct) {
- return direct ? directBuffers : buffers;
- }
-
- public void clear() {
- buffers.clear();
- directBuffers.clear();
- }
-
- @Override
- public ByteBuffer getBuffer(boolean direct, int length) {
- TreeMap<Key, ByteBuffer> tree = getBufferTree(direct);
- Map.Entry<Key, ByteBuffer> entry = tree.ceilingEntry(new Key(length, 0));
- if (entry == null) {
- return direct ? ByteBuffer.allocateDirect(length) : ByteBuffer
- .allocate(length);
- }
- tree.remove(entry.getKey());
- return entry.getValue();
- }
-
- @Override
- public void putBuffer(ByteBuffer buffer) {
- TreeMap<Key, ByteBuffer> tree = getBufferTree(buffer.isDirect());
- while (true) {
- Key key = new Key(buffer.capacity(), currentGeneration++);
- if (!tree.containsKey(key)) {
- tree.put(key, buffer);
- return;
- }
- // Buffers are indexed by (capacity, generation).
- // If our key is not unique on the first try, we try again
- }
- }
- }
-}
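
These disk-range helpers reappear unchanged in org.apache.orc.impl.RecordReaderUtils
later in this patch. A minimal sketch of how they fit together (the class name,
the range bounds, and the printed form are invented for illustration;
getStreamBuffers is package-private, hence the package declaration):

    package org.apache.orc.impl;

    import java.util.List;
    import org.apache.hadoop.hive.common.io.DiskRange;
    import org.apache.hadoop.hive.common.io.DiskRangeList;

    public class DiskRangeDemo {
      public static void main(String[] args) {
        DiskRangeList head = new DiskRangeList(0, 100);   // bytes [0, 100)
        head.insertAfter(new DiskRangeList(100, 250));    // bytes [100, 250)
        // Walks the linked list, printing something like
        // [{range start: 0 end: 100}, {range start: 100 end: 250}]
        System.out.println(RecordReaderUtils.stringifyDiskRanges(head));
        // Slice the list down to a stream covering bytes [50, 200), shifting
        // offsets so the stream's first byte becomes position 0.
        List<DiskRange> buffers =
            RecordReaderUtils.getStreamBuffers(head, 50, 150);
        System.out.println(buffers.size() + " slices for the stream");
      }
    }
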
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SchemaEvolution.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SchemaEvolution.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SchemaEvolution.java
deleted file mode 100644
index 046665b..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SchemaEvolution.java
+++ /dev/null
@@ -1,190 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.orc;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.orc.TypeDescription;
-
-/**
- * Take the file types and the (optional) configuration column names/types and see if there
- * has been schema evolution.
- */
-public class SchemaEvolution {
- private final Map<TypeDescription, TypeDescription> readerToFile;
- private final boolean[] included;
- private final TypeDescription readerSchema;
- private static final Log LOG = LogFactory.getLog(SchemaEvolution.class);
-
- public SchemaEvolution(TypeDescription readerSchema, boolean[] included) {
- this.included = included;
- readerToFile = null;
- this.readerSchema = readerSchema;
- }
-
- public SchemaEvolution(TypeDescription fileSchema,
- TypeDescription readerSchema,
- boolean[] included) throws IOException {
- readerToFile = new HashMap<>(readerSchema.getMaximumId() + 1);
- this.included = included;
- if (checkAcidSchema(fileSchema)) {
- this.readerSchema = createEventSchema(readerSchema);
- } else {
- this.readerSchema = readerSchema;
- }
- buildMapping(fileSchema, this.readerSchema);
- }
-
- public TypeDescription getReaderSchema() {
- return readerSchema;
- }
-
- public TypeDescription getFileType(TypeDescription readerType) {
- TypeDescription result;
- if (readerToFile == null) {
- if (included == null || included[readerType.getId()]) {
- result = readerType;
- } else {
- result = null;
- }
- } else {
- result = readerToFile.get(readerType);
- }
- return result;
- }
-
- void buildMapping(TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- // if the column isn't included, don't map it
- if (included != null && !included[readerType.getId()]) {
- return;
- }
- boolean isOk = true;
- // check the easy case first
- if (fileType.getCategory() == readerType.getCategory()) {
- switch (readerType.getCategory()) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- case DOUBLE:
- case FLOAT:
- case STRING:
- case TIMESTAMP:
- case BINARY:
- case DATE:
- // these are always a match
- break;
- case CHAR:
- case VARCHAR:
- // HIVE-13648: Look at ORC data type conversion edge cases (CHAR, VARCHAR, DECIMAL)
- isOk = fileType.getMaxLength() == readerType.getMaxLength();
- break;
- case DECIMAL:
- // HIVE-13648: Look at ORC data type conversion edge cases (CHAR, VARCHAR, DECIMAL)
- // TODO we don't enforce scale and precision checks, but probably should
- break;
- case UNION:
- case MAP:
- case LIST: {
- // these must be an exact match
- List<TypeDescription> fileChildren = fileType.getChildren();
- List<TypeDescription> readerChildren = readerType.getChildren();
- if (fileChildren.size() == readerChildren.size()) {
- for(int i=0; i < fileChildren.size(); ++i) {
- buildMapping(fileChildren.get(i), readerChildren.get(i));
- }
- } else {
- isOk = false;
- }
- break;
- }
- case STRUCT: {
- // allow either side to have fewer fields than the other
- List<TypeDescription> fileChildren = fileType.getChildren();
- List<TypeDescription> readerChildren = readerType.getChildren();
- int jointSize = Math.min(fileChildren.size(), readerChildren.size());
- for(int i=0; i < jointSize; ++i) {
- buildMapping(fileChildren.get(i), readerChildren.get(i));
- }
- break;
- }
- default:
- throw new IllegalArgumentException("Unknown type " + readerType);
- }
- } else {
- /*
- * Check for the few cases where we will not convert.
- */
-
- isOk = ConvertTreeReaderFactory.canConvert(fileType, readerType);
- }
- if (isOk) {
- readerToFile.put(readerType, fileType);
- } else {
- throw new IOException(
- String.format(
- "ORC does not support type conversion from file type %s (%d) to reader type %s (%d)",
- fileType.toString(), fileType.getId(),
- readerType.toString(), readerType.getId()));
- }
- }
-
- private static boolean checkAcidSchema(TypeDescription type) {
- if (type.getCategory().equals(TypeDescription.Category.STRUCT)) {
- List<String> rootFields = type.getFieldNames();
- if (acidEventFieldNames.equals(rootFields)) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * @param typeDescr
- * @return ORC types for the ACID event based on the row's type description
- */
- public static TypeDescription createEventSchema(TypeDescription typeDescr) {
- TypeDescription result = TypeDescription.createStruct()
- .addField("operation", TypeDescription.createInt())
- .addField("originalTransaction", TypeDescription.createLong())
- .addField("bucket", TypeDescription.createInt())
- .addField("rowId", TypeDescription.createLong())
- .addField("currentTransaction", TypeDescription.createLong())
- .addField("row", typeDescr.clone());
- return result;
- }
-
- public static final List<String> acidEventFieldNames= new ArrayList<String>();
- static {
- acidEventFieldNames.add("operation");
- acidEventFieldNames.add("originalTransaction");
- acidEventFieldNames.add("bucket");
- acidEventFieldNames.add("rowId");
- acidEventFieldNames.add("currentTransaction");
- acidEventFieldNames.add("row");
- }
-}
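
For reference, the ACID wrapping above can be exercised directly; a minimal
sketch against the relocated org.apache.orc.impl.SchemaEvolution (the row type
is invented, and the comment shows the expected shape rather than captured
output):

    import org.apache.orc.TypeDescription;
    import org.apache.orc.impl.SchemaEvolution;

    public class AcidSchemaDemo {
      public static void main(String[] args) {
        TypeDescription row = TypeDescription.createStruct()
            .addField("id", TypeDescription.createLong())
            .addField("name", TypeDescription.createString());
        // Wraps the row in the ACID event struct with the six fields from
        // acidEventFieldNames, i.e. roughly:
        // struct<operation:int,originalTransaction:bigint,bucket:int,
        //        rowId:bigint,currentTransaction:bigint,
        //        row:struct<id:bigint,name:string>>
        System.out.println(SchemaEvolution.createEventSchema(row));
      }
    }
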
[24/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java b/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java
new file mode 100644
index 0000000..1067957
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java
@@ -0,0 +1,578 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import com.google.common.collect.Lists;
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
+import org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.DataReader;
+import org.apache.orc.OrcProto;
+
+import com.google.common.collect.ComparisonChain;
+import org.apache.orc.StripeInformation;
+
+/**
+ * Stateless methods shared between RecordReaderImpl and EncodedReaderImpl.
+ */
+public class RecordReaderUtils {
+ private static final HadoopShims SHIMS = HadoopShims.Factory.get();
+
+ private static class DefaultDataReader implements DataReader {
+ private FSDataInputStream file = null;
+ private final ByteBufferAllocatorPool pool;
+ private HadoopShims.ZeroCopyReaderShim zcr = null;
+ private final FileSystem fs;
+ private final Path path;
+ private final boolean useZeroCopy;
+ private final CompressionCodec codec;
+ private final int bufferSize;
+ private final int typeCount;
+
+ private DefaultDataReader(DefaultDataReader other) {
+ this.pool = other.pool;
+ this.bufferSize = other.bufferSize;
+ this.typeCount = other.typeCount;
+ this.fs = other.fs;
+ this.path = other.path;
+ this.useZeroCopy = other.useZeroCopy;
+ this.codec = other.codec;
+ }
+
+ private DefaultDataReader(DataReaderProperties properties) {
+ this.fs = properties.getFileSystem();
+ this.path = properties.getPath();
+ this.useZeroCopy = properties.getZeroCopy();
+ this.codec = WriterImpl.createCodec(properties.getCompression());
+ this.bufferSize = properties.getBufferSize();
+ this.typeCount = properties.getTypeCount();
+ if (useZeroCopy) {
+ this.pool = new ByteBufferAllocatorPool();
+ } else {
+ this.pool = null;
+ }
+ }
+
+ @Override
+ public void open() throws IOException {
+ this.file = fs.open(path);
+ if (useZeroCopy) {
+ zcr = RecordReaderUtils.createZeroCopyShim(file, codec, pool);
+ } else {
+ zcr = null;
+ }
+ }
+
+ @Override
+ public OrcIndex readRowIndex(StripeInformation stripe,
+ OrcProto.StripeFooter footer,
+ boolean[] included,
+ OrcProto.RowIndex[] indexes,
+ boolean[] sargColumns,
+ OrcProto.BloomFilterIndex[] bloomFilterIndices
+ ) throws IOException {
+ if (file == null) {
+ open();
+ }
+ if (footer == null) {
+ footer = readStripeFooter(stripe);
+ }
+ if (indexes == null) {
+ indexes = new OrcProto.RowIndex[typeCount];
+ }
+ if (bloomFilterIndices == null) {
+ bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
+ }
+ long offset = stripe.getOffset();
+ List<OrcProto.Stream> streams = footer.getStreamsList();
+ for (int i = 0; i < streams.size(); i++) {
+ OrcProto.Stream stream = streams.get(i);
+ OrcProto.Stream nextStream = null;
+ if (i < streams.size() - 1) {
+ nextStream = streams.get(i+1);
+ }
+ int col = stream.getColumn();
+ int len = (int) stream.getLength();
+ // row index streams and bloom filters are interlaced; if the sarg column
+ // has a bloom filter, combine the I/O to read the row index and bloom
+ // filter for that column together
+ if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
+ boolean readBloomFilter = false;
+ if (sargColumns != null && sargColumns[col] && nextStream != null &&
+ nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
+ len += nextStream.getLength();
+ i += 1;
+ readBloomFilter = true;
+ }
+ if ((included == null || included[col]) && indexes[col] == null) {
+ byte[] buffer = new byte[len];
+ file.readFully(offset, buffer, 0, buffer.length);
+ ByteBuffer bb = ByteBuffer.wrap(buffer);
+ indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
+ Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), stream.getLength(),
+ codec, bufferSize));
+ if (readBloomFilter) {
+ bb.position((int) stream.getLength());
+ bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
+ "bloom_filter", Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)),
+ nextStream.getLength(), codec, bufferSize));
+ }
+ }
+ }
+ offset += len;
+ }
+
+ OrcIndex index = new OrcIndex(indexes, bloomFilterIndices);
+ return index;
+ }
+
+ @Override
+ public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException {
+ if (file == null) {
+ open();
+ }
+ long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
+ int tailLength = (int) stripe.getFooterLength();
+
+ // read the footer
+ ByteBuffer tailBuf = ByteBuffer.allocate(tailLength);
+ file.readFully(offset, tailBuf.array(), tailBuf.arrayOffset(), tailLength);
+ return OrcProto.StripeFooter.parseFrom(InStream.createCodedInputStream("footer",
+ Lists.<DiskRange>newArrayList(new BufferChunk(tailBuf, 0)),
+ tailLength, codec, bufferSize));
+ }
+
+ @Override
+ public DiskRangeList readFileData(
+ DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException {
+ return RecordReaderUtils.readDiskRanges(file, zcr, baseOffset, range, doForceDirect);
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (pool != null) {
+ pool.clear();
+ }
+ // close both zcr and file
+ try (HadoopShims.ZeroCopyReaderShim myZcr = zcr) {
+ if (file != null) {
+ file.close();
+ }
+ }
+ }
+
+ @Override
+ public boolean isTrackingDiskRanges() {
+ return zcr != null;
+ }
+
+ @Override
+ public void releaseBuffer(ByteBuffer buffer) {
+ zcr.releaseBuffer(buffer);
+ }
+
+ @Override
+ public DataReader clone() {
+ return new DefaultDataReader(this);
+ }
+
+ }
+
+ public static DataReader createDefaultDataReader(DataReaderProperties properties) {
+ return new DefaultDataReader(properties);
+ }
+
+ public static boolean[] findPresentStreamsByColumn(
+ List<OrcProto.Stream> streamList, List<OrcProto.Type> types) {
+ boolean[] hasNull = new boolean[types.size()];
+ for(OrcProto.Stream stream: streamList) {
+ if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.PRESENT)) {
+ hasNull[stream.getColumn()] = true;
+ }
+ }
+ return hasNull;
+ }
+
+ /**
+ * Does region A overlap region B? The end points are inclusive on both sides.
+ * @param leftA A's left point
+ * @param rightA A's right point
+ * @param leftB B's left point
+ * @param rightB B's right point
+ * @return Does region A overlap region B?
+ */
+ static boolean overlap(long leftA, long rightA, long leftB, long rightB) {
+ if (leftA <= leftB) {
+ return rightA >= leftB;
+ }
+ return rightB >= leftA;
+ }
+
+ public static void addEntireStreamToRanges(
+ long offset, long length, CreateHelper list, boolean doMergeBuffers) {
+ list.addOrMerge(offset, offset + length, doMergeBuffers, false);
+ }
+
+ public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
+ boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index,
+ OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, boolean hasNull,
+ long offset, long length, CreateHelper list, boolean doMergeBuffers) {
+ for (int group = 0; group < includedRowGroups.length; ++group) {
+ if (!includedRowGroups[group]) continue;
+ int posn = getIndexPosition(
+ encoding.getKind(), type.getKind(), stream.getKind(), isCompressed, hasNull);
+ long start = index.getEntry(group).getPositions(posn);
+ final long nextGroupOffset;
+ boolean isLast = group == (includedRowGroups.length - 1);
+ nextGroupOffset = isLast ? length : index.getEntry(group + 1).getPositions(posn);
+
+ start += offset;
+ long end = offset + estimateRgEndOffset(
+ isCompressed, isLast, nextGroupOffset, length, compressionSize);
+ list.addOrMerge(start, end, doMergeBuffers, true);
+ }
+ }
+
+ public static long estimateRgEndOffset(boolean isCompressed, boolean isLast,
+ long nextGroupOffset, long streamLength, int bufferSize) {
+ // figure out the worst case last location
+ // if adjacent groups have the same compressed block offset then stretch the slop
+ // by factor of 2 to safely accommodate the next compression block.
+ // One for the current compression block and another for the next compression block.
+ long slop = isCompressed ? 2 * (OutStream.HEADER_SIZE + bufferSize) : WORST_UNCOMPRESSED_SLOP;
+ return isLast ? streamLength : Math.min(streamLength, nextGroupOffset + slop);
+ }
+
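+ // Worked example (illustrative numbers): for a compressed stream with a
+ // 256K buffer and OutStream.HEADER_SIZE of 3 bytes, a non-final row group
+ // is read up to min(streamLength, nextGroupOffset + 2 * (3 + 262144)),
+ // leaving slop for the current compression block plus the next one.
+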
+ private static final int BYTE_STREAM_POSITIONS = 1;
+ private static final int RUN_LENGTH_BYTE_POSITIONS = BYTE_STREAM_POSITIONS + 1;
+ private static final int BITFIELD_POSITIONS = RUN_LENGTH_BYTE_POSITIONS + 1;
+ private static final int RUN_LENGTH_INT_POSITIONS = BYTE_STREAM_POSITIONS + 1;
+
+ /**
+ * Get the offset within the column's index positions at which the given
+ * stream's positions start.
+ * @param columnEncoding the encoding of the column
+ * @param columnType the type of the column
+ * @param streamType the kind of the stream
+ * @param isCompressed is the file compressed
+ * @param hasNulls does the column have a PRESENT stream?
+ * @return the offset into the index positions at which that stream's positions start
+ */
+ public static int getIndexPosition(OrcProto.ColumnEncoding.Kind columnEncoding,
+ OrcProto.Type.Kind columnType,
+ OrcProto.Stream.Kind streamType,
+ boolean isCompressed,
+ boolean hasNulls) {
+ if (streamType == OrcProto.Stream.Kind.PRESENT) {
+ return 0;
+ }
+ int compressionValue = isCompressed ? 1 : 0;
+ int base = hasNulls ? (BITFIELD_POSITIONS + compressionValue) : 0;
+ switch (columnType) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ case DATE:
+ case STRUCT:
+ case MAP:
+ case LIST:
+ case UNION:
+ return base;
+ case CHAR:
+ case VARCHAR:
+ case STRING:
+ if (columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+ columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ return base;
+ } else {
+ if (streamType == OrcProto.Stream.Kind.DATA) {
+ return base;
+ } else {
+ return base + BYTE_STREAM_POSITIONS + compressionValue;
+ }
+ }
+ case BINARY:
+ if (streamType == OrcProto.Stream.Kind.DATA) {
+ return base;
+ }
+ return base + BYTE_STREAM_POSITIONS + compressionValue;
+ case DECIMAL:
+ if (streamType == OrcProto.Stream.Kind.DATA) {
+ return base;
+ }
+ return base + BYTE_STREAM_POSITIONS + compressionValue;
+ case TIMESTAMP:
+ if (streamType == OrcProto.Stream.Kind.DATA) {
+ return base;
+ }
+ return base + RUN_LENGTH_INT_POSITIONS + compressionValue;
+ default:
+ throw new IllegalArgumentException("Unknown type " + columnType);
+ }
+ }
+
+ // for uncompressed streams, what is the most overlap with the following set
+ // of rows (long vint literal group).
+ static final int WORST_UNCOMPRESSED_SLOP = 2 + 8 * 512;
+
+ /**
+ * Is this stream part of a dictionary?
+ * @param kind the kind of the stream
+ * @param encoding the encoding of the column
+ * @return true if the stream is part of a dictionary
+ */
+ public static boolean isDictionary(OrcProto.Stream.Kind kind,
+ OrcProto.ColumnEncoding encoding) {
+ assert kind != OrcProto.Stream.Kind.DICTIONARY_COUNT;
+ OrcProto.ColumnEncoding.Kind encodingKind = encoding.getKind();
+ return kind == OrcProto.Stream.Kind.DICTIONARY_DATA ||
+ (kind == OrcProto.Stream.Kind.LENGTH &&
+ (encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+ encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2));
+ }
+
+ /**
+ * Build a string representation of a list of disk ranges.
+ * @param range ranges to stringify
+ * @return the resulting string
+ */
+ public static String stringifyDiskRanges(DiskRangeList range) {
+ StringBuilder buffer = new StringBuilder();
+ buffer.append("[");
+ boolean isFirst = true;
+ while (range != null) {
+ if (!isFirst) {
+ buffer.append(", {");
+ } else {
+ buffer.append("{");
+ }
+ isFirst = false;
+ buffer.append(range.toString());
+ buffer.append("}");
+ range = range.next;
+ }
+ buffer.append("]");
+ return buffer.toString();
+ }
+
+ /**
+ * Read the list of ranges from the file.
+ * @param file the file to read
+ * @param zcr the zero-copy shim, or null to use normal reads
+ * @param base the base of the stripe
+ * @param range the disk ranges within the stripe to read
+ * @param doForceDirect if true, copy normal reads into direct byte buffers
+ * @return the list of ranges with the bytes that were read filled in
+ * @throws IOException
+ */
+ static DiskRangeList readDiskRanges(FSDataInputStream file,
+ HadoopShims.ZeroCopyReaderShim zcr,
+ long base,
+ DiskRangeList range,
+ boolean doForceDirect) throws IOException {
+ if (range == null) return null;
+ DiskRangeList prev = range.prev;
+ if (prev == null) {
+ prev = new MutateHelper(range);
+ }
+ while (range != null) {
+ if (range.hasData()) {
+ range = range.next;
+ continue;
+ }
+ int len = (int) (range.getEnd() - range.getOffset());
+ long off = range.getOffset();
+ if (zcr != null) {
+ file.seek(base + off);
+ boolean hasReplaced = false;
+ while (len > 0) {
+ ByteBuffer partial = zcr.readBuffer(len, false);
+ BufferChunk bc = new BufferChunk(partial, off);
+ if (!hasReplaced) {
+ range.replaceSelfWith(bc);
+ hasReplaced = true;
+ } else {
+ range.insertAfter(bc);
+ }
+ range = bc;
+ int read = partial.remaining();
+ len -= read;
+ off += read;
+ }
+ } else {
+ // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
+ byte[] buffer = new byte[len];
+ file.readFully((base + off), buffer, 0, buffer.length);
+ ByteBuffer bb = null;
+ if (doForceDirect) {
+ bb = ByteBuffer.allocateDirect(len);
+ bb.put(buffer);
+ bb.position(0);
+ bb.limit(len);
+ } else {
+ bb = ByteBuffer.wrap(buffer);
+ }
+ range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
+ }
+ range = range.next;
+ }
+ return prev.next;
+ }
+
+
+ static List<DiskRange> getStreamBuffers(DiskRangeList range, long offset, long length) {
+ // This assumes sorted ranges (as do many other parts of the ORC code).
+ ArrayList<DiskRange> buffers = new ArrayList<DiskRange>();
+ if (length == 0) return buffers;
+ long streamEnd = offset + length;
+ boolean inRange = false;
+ while (range != null) {
+ if (!inRange) {
+ if (range.getEnd() <= offset) {
+ range = range.next;
+ continue; // Skip until we are in range.
+ }
+ inRange = true;
+ if (range.getOffset() < offset) {
+ // Partial first buffer, add a slice of it.
+ buffers.add(range.sliceAndShift(offset, Math.min(streamEnd, range.getEnd()), -offset));
+ if (range.getEnd() >= streamEnd) break; // Partial first buffer is also partial last buffer.
+ range = range.next;
+ continue;
+ }
+ } else if (range.getOffset() >= streamEnd) {
+ break;
+ }
+ if (range.getEnd() > streamEnd) {
+ // Partial last buffer (may also be the first buffer), add a slice of it.
+ buffers.add(range.sliceAndShift(range.getOffset(), streamEnd, -offset));
+ break;
+ }
+ // Buffer that belongs entirely to one stream.
+ // TODO: ideally we would want to reuse the object and remove it from the list, but we cannot
+ // because bufferChunks is also used by clearStreams for zcr. Create a useless dup.
+ buffers.add(range.sliceAndShift(range.getOffset(), range.getEnd(), -offset));
+ if (range.getEnd() == streamEnd) break;
+ range = range.next;
+ }
+ return buffers;
+ }
+
+ static HadoopShims.ZeroCopyReaderShim createZeroCopyShim(FSDataInputStream file,
+ CompressionCodec codec, ByteBufferAllocatorPool pool) throws IOException {
+ if ((codec == null || ((codec instanceof DirectDecompressionCodec)
+ && ((DirectDecompressionCodec) codec).isAvailable()))) {
+ /* codec is null or is available */
+ return SHIMS.getZeroCopyReader(file, pool);
+ }
+ return null;
+ }
+
+ // this is an implementation copied from ElasticByteBufferPool in hadoop-2,
+ // which lacks a clear()/clean() operation
+ public final static class ByteBufferAllocatorPool implements HadoopShims.ByteBufferPoolShim {
+ private static final class Key implements Comparable<Key> {
+ private final int capacity;
+ private final long insertionGeneration;
+
+ Key(int capacity, long insertionGeneration) {
+ this.capacity = capacity;
+ this.insertionGeneration = insertionGeneration;
+ }
+
+ @Override
+ public int compareTo(Key other) {
+ return ComparisonChain.start().compare(capacity, other.capacity)
+ .compare(insertionGeneration, other.insertionGeneration).result();
+ }
+
+ @Override
+ public boolean equals(Object rhs) {
+ if (rhs == null) {
+ return false;
+ }
+ try {
+ Key o = (Key) rhs;
+ return (compareTo(o) == 0);
+ } catch (ClassCastException e) {
+ return false;
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return new HashCodeBuilder().append(capacity).append(insertionGeneration)
+ .toHashCode();
+ }
+ }
+
+ private final TreeMap<Key, ByteBuffer> buffers = new TreeMap<Key, ByteBuffer>();
+
+ private final TreeMap<Key, ByteBuffer> directBuffers = new TreeMap<Key, ByteBuffer>();
+
+ private long currentGeneration = 0;
+
+ private final TreeMap<Key, ByteBuffer> getBufferTree(boolean direct) {
+ return direct ? directBuffers : buffers;
+ }
+
+ public void clear() {
+ buffers.clear();
+ directBuffers.clear();
+ }
+
+ @Override
+ public ByteBuffer getBuffer(boolean direct, int length) {
+ TreeMap<Key, ByteBuffer> tree = getBufferTree(direct);
+ Map.Entry<Key, ByteBuffer> entry = tree.ceilingEntry(new Key(length, 0));
+ if (entry == null) {
+ return direct ? ByteBuffer.allocateDirect(length) : ByteBuffer
+ .allocate(length);
+ }
+ tree.remove(entry.getKey());
+ return entry.getValue();
+ }
+
+ @Override
+ public void putBuffer(ByteBuffer buffer) {
+ TreeMap<Key, ByteBuffer> tree = getBufferTree(buffer.isDirect());
+ while (true) {
+ Key key = new Key(buffer.capacity(), currentGeneration++);
+ if (!tree.containsKey(key)) {
+ tree.put(key, buffer);
+ return;
+ }
+ // Buffers are indexed by (capacity, generation).
+ // If our key is not unique on the first try, we try again
+ }
+ }
+ }
+}
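
The allocator pool above serves each request with the smallest pooled buffer
whose capacity is at least the requested length (the ceilingEntry lookup), so
recycled buffers can back smaller reads. A minimal sketch (class name and
sizes invented):

    import java.nio.ByteBuffer;
    import org.apache.orc.impl.RecordReaderUtils;

    public class PoolDemo {
      public static void main(String[] args) {
        RecordReaderUtils.ByteBufferAllocatorPool pool =
            new RecordReaderUtils.ByteBufferAllocatorPool();
        ByteBuffer first = pool.getBuffer(false, 4096); // empty pool: fresh allocation
        pool.putBuffer(first);                          // recycle it
        // The 1024-byte request is served by the pooled 4096-byte buffer.
        ByteBuffer reused = pool.getBuffer(false, 1024);
        System.out.println(reused == first);            // true
        pool.clear();                                   // drop all pooled buffers
      }
    }
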
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/SchemaEvolution.java b/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
new file mode 100644
index 0000000..2c80aaa
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.orc.TypeDescription;
+
+/**
+ * Take the file types and the (optional) configuration column names/types and see if there
+ * has been schema evolution.
+ */
+public class SchemaEvolution {
+ private final Map<TypeDescription, TypeDescription> readerToFile;
+ private final boolean[] included;
+ private final TypeDescription readerSchema;
+ private static final Log LOG = LogFactory.getLog(SchemaEvolution.class);
+
+ public SchemaEvolution(TypeDescription readerSchema, boolean[] included) {
+ this.included = included;
+ readerToFile = null;
+ this.readerSchema = readerSchema;
+ }
+
+ public SchemaEvolution(TypeDescription fileSchema,
+ TypeDescription readerSchema,
+ boolean[] included) throws IOException {
+ readerToFile = new HashMap<>(readerSchema.getMaximumId() + 1);
+ this.included = included;
+ if (checkAcidSchema(fileSchema)) {
+ this.readerSchema = createEventSchema(readerSchema);
+ } else {
+ this.readerSchema = readerSchema;
+ }
+ buildMapping(fileSchema, this.readerSchema);
+ }
+
+ public TypeDescription getReaderSchema() {
+ return readerSchema;
+ }
+
+ public TypeDescription getFileType(TypeDescription readerType) {
+ TypeDescription result;
+ if (readerToFile == null) {
+ if (included == null || included[readerType.getId()]) {
+ result = readerType;
+ } else {
+ result = null;
+ }
+ } else {
+ result = readerToFile.get(readerType);
+ }
+ return result;
+ }
+
+ void buildMapping(TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ // if the column isn't included, don't map it
+ if (included != null && !included[readerType.getId()]) {
+ return;
+ }
+ boolean isOk = true;
+ // check the easy case first
+ if (fileType.getCategory() == readerType.getCategory()) {
+ switch (readerType.getCategory()) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case DOUBLE:
+ case FLOAT:
+ case STRING:
+ case TIMESTAMP:
+ case BINARY:
+ case DATE:
+ // these are always a match
+ break;
+ case CHAR:
+ case VARCHAR:
+ // HIVE-13648: Look at ORC data type conversion edge cases (CHAR, VARCHAR, DECIMAL)
+ isOk = fileType.getMaxLength() == readerType.getMaxLength();
+ break;
+ case DECIMAL:
+ // HIVE-13648: Look at ORC data type conversion edge cases (CHAR, VARCHAR, DECIMAL)
+ // TODO we don't enforce scale and precision checks, but probably should
+ break;
+ case UNION:
+ case MAP:
+ case LIST: {
+ // these must be an exact match
+ List<TypeDescription> fileChildren = fileType.getChildren();
+ List<TypeDescription> readerChildren = readerType.getChildren();
+ if (fileChildren.size() == readerChildren.size()) {
+ for(int i=0; i < fileChildren.size(); ++i) {
+ buildMapping(fileChildren.get(i), readerChildren.get(i));
+ }
+ } else {
+ isOk = false;
+ }
+ break;
+ }
+ case STRUCT: {
+ // allow either side to have fewer fields than the other
+ List<TypeDescription> fileChildren = fileType.getChildren();
+ List<TypeDescription> readerChildren = readerType.getChildren();
+ int jointSize = Math.min(fileChildren.size(), readerChildren.size());
+ for(int i=0; i < jointSize; ++i) {
+ buildMapping(fileChildren.get(i), readerChildren.get(i));
+ }
+ break;
+ }
+ default:
+ throw new IllegalArgumentException("Unknown type " + readerType);
+ }
+ } else {
+ /*
+ * Check for the few cases where we will not convert.
+ */
+
+ isOk = ConvertTreeReaderFactory.canConvert(fileType, readerType);
+ }
+ if (isOk) {
+ readerToFile.put(readerType, fileType);
+ } else {
+ throw new IOException(
+ String.format(
+ "ORC does not support type conversion from file type %s (%d) to reader type %s (%d)",
+ fileType.toString(), fileType.getId(),
+ readerType.toString(), readerType.getId()));
+ }
+ }
+
+ private static boolean checkAcidSchema(TypeDescription type) {
+ if (type.getCategory().equals(TypeDescription.Category.STRUCT)) {
+ List<String> rootFields = type.getFieldNames();
+ if (acidEventFieldNames.equals(rootFields)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * @param typeDescr the row's type description
+ * @return ORC types for the ACID event based on the row's type description
+ */
+ public static TypeDescription createEventSchema(TypeDescription typeDescr) {
+ TypeDescription result = TypeDescription.createStruct()
+ .addField("operation", TypeDescription.createInt())
+ .addField("originalTransaction", TypeDescription.createLong())
+ .addField("bucket", TypeDescription.createInt())
+ .addField("rowId", TypeDescription.createLong())
+ .addField("currentTransaction", TypeDescription.createLong())
+ .addField("row", typeDescr.clone());
+ return result;
+ }
+
+ public static final List<String> acidEventFieldNames = new ArrayList<String>();
+ static {
+ acidEventFieldNames.add("operation");
+ acidEventFieldNames.add("originalTransaction");
+ acidEventFieldNames.add("bucket");
+ acidEventFieldNames.add("rowId");
+ acidEventFieldNames.add("currentTransaction");
+ acidEventFieldNames.add("row");
+ }
+}
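
The STRUCT rule above lets either side carry fewer fields; a minimal sketch
(class and field names invented):

    import java.io.IOException;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.impl.SchemaEvolution;

    public class EvolutionDemo {
      public static void main(String[] args) throws IOException {
        TypeDescription file = TypeDescription.createStruct()
            .addField("x", TypeDescription.createInt());
        TypeDescription reader = TypeDescription.createStruct()
            .addField("x", TypeDescription.createInt())
            .addField("y", TypeDescription.createString()); // not in the file
        SchemaEvolution evo = new SchemaEvolution(file, reader, null);
        // x is backed by the file's int column...
        System.out.println(evo.getFileType(reader.getChildren().get(0)));
        // ...while y has no file counterpart, so getFileType returns null.
        System.out.println(evo.getFileType(reader.getChildren().get(1)));
      }
    }
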
[22/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/tools/FileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/tools/FileDump.java b/orc/src/java/org/apache/orc/tools/FileDump.java
new file mode 100644
index 0000000..e32027f
--- /dev/null
+++ b/orc/src/java/org/apache/orc/tools/FileDump.java
@@ -0,0 +1,934 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.text.DecimalFormat;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.orc.BloomFilterIO;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.impl.AcidStats;
+import org.apache.orc.impl.ColumnStatisticsImpl;
+import org.apache.orc.impl.OrcAcidUtils;
+import org.apache.orc.impl.OrcIndex;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.StripeStatistics;
+import org.apache.orc.impl.RecordReaderImpl;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONWriter;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Strings;
+import com.google.common.collect.Lists;
+
+/**
+ * A tool for printing out the file structure of ORC files.
+ */
+public final class FileDump {
+ public static final String UNKNOWN = "UNKNOWN";
+ public static final String SEPARATOR = Strings.repeat("_", 120) + "\n";
+ public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
+ public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir");
+ public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new PathFilter() {
+ public boolean accept(Path p) {
+ String name = p.getName();
+ return !name.startsWith("_") && !name.startsWith(".") && !name.endsWith(
+ OrcAcidUtils.DELTA_SIDE_FILE_SUFFIX);
+ }
+ };
+
+ // not used; prevents instantiation of this utility class
+ private FileDump() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ Configuration conf = new Configuration();
+
+ List<Integer> rowIndexCols = null;
+ Options opts = createOptions();
+ CommandLine cli = new GnuParser().parse(opts, args);
+
+ if (cli.hasOption('h')) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("orcfiledump", opts);
+ return;
+ }
+
+ boolean dumpData = cli.hasOption('d');
+ boolean recover = cli.hasOption("recover");
+ boolean skipDump = cli.hasOption("skip-dump");
+ String backupPath = DEFAULT_BACKUP_PATH;
+ if (cli.hasOption("backup-path")) {
+ backupPath = cli.getOptionValue("backup-path");
+ }
+
+ if (cli.hasOption("r")) {
+ String[] colStrs = cli.getOptionValue("r").split(",");
+ rowIndexCols = new ArrayList<Integer>(colStrs.length);
+ for (String colStr : colStrs) {
+ rowIndexCols.add(Integer.parseInt(colStr));
+ }
+ }
+
+ boolean printTimeZone = cli.hasOption('t');
+ boolean jsonFormat = cli.hasOption('j');
+ String[] files = cli.getArgs();
+ if (files.length == 0) {
+ System.err.println("Error : ORC files are not specified");
+ return;
+ }
+
+ // if the specified path is directory, iterate through all files and print the file dump
+ List<String> filesInPath = Lists.newArrayList();
+ for (String filename : files) {
+ Path path = new Path(filename);
+ filesInPath.addAll(getAllFilesInPath(path, conf));
+ }
+
+ if (dumpData) {
+ printData(filesInPath, conf);
+ } else if (recover && skipDump) {
+ recoverFiles(filesInPath, conf, backupPath);
+ } else {
+ if (jsonFormat) {
+ boolean prettyPrint = cli.hasOption('p');
+ JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
+ } else {
+ printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
+ }
+ }
+ }
+
+ /**
+ * This method returns an ORC reader object if the specified file is readable. If the specified
+ * file has a side file (_flush_length), the maximum footer offset is read from the side file
+ * and the ORC reader is created from that offset. Since both the data file and the side file
+ * use hflush() for flushing data, there could be some inconsistencies and the two files could
+ * be out of sync. Null is returned in the following cases:
+ *
+ * 1) The file specified by path, or its side file, is still open for writes
+ * 2) The *_flush_length file does not return any footer offset
+ * 3) The *_flush_length file returns a valid footer offset but the data file is not readable at
+ * that position (incomplete data file)
+ * 4) The *_flush_length file length is not a multiple of 8; the reader is then created from the
+ * previous valid footer, and if there is no such footer (file length > 0 and < 8), null is
+ * returned
+ *
+ * Also, if this method detects any file corruption (mismatch between the data file and the side
+ * file), it adds the corresponding file to the specified input list of corrupted files.
+ *
+ * In all other cases, where the file is readable, this method returns a reader object.
+ *
+ * @param path - file to get reader for
+ * @param conf - configuration object
+ * @param corruptFiles - fills this list with all possible corrupted files
+ * @return - reader for the specified file or null
+ * @throws IOException
+ */
+ static Reader getReader(final Path path, final Configuration conf,
+ final List<String> corruptFiles) throws IOException {
+ FileSystem fs = path.getFileSystem(conf);
+ long dataFileLen = fs.getFileStatus(path).getLen();
+ System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]");
+ Path sideFile = OrcAcidUtils.getSideFile(path);
+ final boolean sideFileExists = fs.exists(sideFile);
+ boolean openDataFile = false;
+ boolean openSideFile = false;
+ if (fs instanceof DistributedFileSystem) {
+ DistributedFileSystem dfs = (DistributedFileSystem) fs;
+ openDataFile = !dfs.isFileClosed(path);
+ openSideFile = sideFileExists && !dfs.isFileClosed(sideFile);
+ }
+
+ if (openDataFile || openSideFile) {
+ if (openDataFile && openSideFile) {
+ System.err.println("Unable to perform file dump as " + path + " and " + sideFile +
+ " are still open for writes.");
+ } else if (openSideFile) {
+ System.err.println("Unable to perform file dump as " + sideFile +
+ " is still open for writes.");
+ } else {
+ System.err.println("Unable to perform file dump as " + path +
+ " is still open for writes.");
+ }
+
+ return null;
+ }
+
+ Reader reader = null;
+ if (sideFileExists) {
+ final long maxLen = OrcAcidUtils.getLastFlushLength(fs, path);
+ final long sideFileLen = fs.getFileStatus(sideFile).getLen();
+ System.err.println("Found flush length file " + sideFile
+ + " [length: " + sideFileLen + ", maxFooterOffset: " + maxLen + "]");
+ // no offsets read from side file
+ if (maxLen == -1) {
+ // if data file is larger than last flush length, then additional data could be recovered
+ if (dataFileLen > maxLen) {
+ System.err.println("Data file has more data than max footer offset:" + maxLen +
+ ". Adding data file to recovery list.");
+ if (corruptFiles != null) {
+ corruptFiles.add(path.toUri().toString());
+ }
+ }
+ return null;
+ }
+
+ try {
+ reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen));
+
+ // if data file is larger than last flush length, then additional data could be recovered
+ if (dataFileLen > maxLen) {
+ System.err.println("Data file has more data than max footer offset:" + maxLen +
+ ". Adding data file to recovery list.");
+ if (corruptFiles != null) {
+ corruptFiles.add(path.toUri().toString());
+ }
+ }
+ } catch (Exception e) {
+ if (corruptFiles != null) {
+ corruptFiles.add(path.toUri().toString());
+ }
+ System.err.println("Unable to read data from max footer offset." +
+ " Adding data file to recovery list.");
+ return null;
+ }
+ } else {
+ reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+ }
+
+ return reader;
+ }
+
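+ // Typical call pattern (sketch only; the path is hypothetical):
+ //   List<String> corrupt = new ArrayList<String>();
+ //   Reader r = getReader(new Path("/tmp/acid/bucket_00000"), conf, corrupt);
+ //   if (r == null && !corrupt.isEmpty()) {
+ //     recoverFiles(corrupt, conf, DEFAULT_BACKUP_PATH);
+ //   }
+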
+ public static Collection<String> getAllFilesInPath(final Path path,
+ final Configuration conf) throws IOException {
+ List<String> filesInPath = Lists.newArrayList();
+ FileSystem fs = path.getFileSystem(conf);
+ FileStatus fileStatus = fs.getFileStatus(path);
+ if (fileStatus.isDir()) {
+ FileStatus[] fileStatuses = fs.listStatus(path, HIDDEN_AND_SIDE_FILE_FILTER);
+ for (FileStatus fileInPath : fileStatuses) {
+ if (fileInPath.isDir()) {
+ filesInPath.addAll(getAllFilesInPath(fileInPath.getPath(), conf));
+ } else {
+ filesInPath.add(fileInPath.getPath().toString());
+ }
+ }
+ } else {
+ filesInPath.add(path.toString());
+ }
+
+ return filesInPath;
+ }
+
+ private static void printData(List<String> files,
+ Configuration conf) throws IOException,
+ JSONException {
+ for (String file : files) {
+ try {
+ Path path = new Path(file);
+ Reader reader = getReader(path, conf, Lists.<String>newArrayList());
+ if (reader == null) {
+ continue;
+ }
+ printJsonData(reader);
+ System.out.println(SEPARATOR);
+ } catch (Exception e) {
+ System.err.println("Unable to dump data for file: " + file);
+ continue;
+ }
+ }
+ }
+
+ private static void printMetaData(List<String> files, Configuration conf,
+ List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
+ final String backupPath)
+ throws IOException {
+ List<String> corruptFiles = Lists.newArrayList();
+ for (String filename : files) {
+ printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
+ System.out.println(SEPARATOR);
+ }
+
+ if (!corruptFiles.isEmpty()) {
+ if (recover) {
+ recoverFiles(corruptFiles, conf, backupPath);
+ } else {
+ System.err.println(corruptFiles.size() + " file(s) are corrupted." +
+ " Run the following command to recover corrupted files.\n");
+ String fileNames = Joiner.on(" ").skipNulls().join(corruptFiles);
+ System.err.println("hive --orcfiledump --recover --skip-dump " + fileNames);
+ System.out.println(SEPARATOR);
+ }
+ }
+ }
+
+ private static void printMetaDataImpl(final String filename,
+ final Configuration conf, final List<Integer> rowIndexCols, final boolean printTimeZone,
+ final List<String> corruptFiles) throws IOException {
+ Path file = new Path(filename);
+ Reader reader = getReader(file, conf, corruptFiles);
+ // if we can create the reader then the footer is not corrupt and the file is readable
+ if (reader == null) {
+ return;
+ }
+
+ System.out.println("Structure for " + filename);
+ System.out.println("File Version: " + reader.getFileVersion().getName() +
+ " with " + reader.getWriterVersion());
+ RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+ System.out.println("Rows: " + reader.getNumberOfRows());
+ System.out.println("Compression: " + reader.getCompressionKind());
+ if (reader.getCompressionKind() != CompressionKind.NONE) {
+ System.out.println("Compression size: " + reader.getCompressionSize());
+ }
+ System.out.println("Type: " + reader.getSchema().toString());
+ System.out.println("\nStripe Statistics:");
+ List<StripeStatistics> stripeStats = reader.getStripeStatistics();
+ for (int n = 0; n < stripeStats.size(); n++) {
+ System.out.println(" Stripe " + (n + 1) + ":");
+ StripeStatistics ss = stripeStats.get(n);
+ for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
+ System.out.println(" Column " + i + ": " +
+ ss.getColumnStatistics()[i].toString());
+ }
+ }
+ ColumnStatistics[] stats = reader.getStatistics();
+ int colCount = stats.length;
+ System.out.println("\nFile Statistics:");
+ for (int i = 0; i < stats.length; ++i) {
+ System.out.println(" Column " + i + ": " + stats[i].toString());
+ }
+ System.out.println("\nStripes:");
+ int stripeIx = -1;
+ for (StripeInformation stripe : reader.getStripes()) {
+ ++stripeIx;
+ long stripeStart = stripe.getOffset();
+ OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+ if (printTimeZone) {
+ String tz = footer.getWriterTimezone();
+ if (tz == null || tz.isEmpty()) {
+ tz = UNKNOWN;
+ }
+ System.out.println(" Stripe: " + stripe.toString() + " timezone: " + tz);
+ } else {
+ System.out.println(" Stripe: " + stripe.toString());
+ }
+ long sectionStart = stripeStart;
+ for (OrcProto.Stream section : footer.getStreamsList()) {
+ String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
+ System.out.println(" Stream: column " + section.getColumn() +
+ " section " + kind + " start: " + sectionStart +
+ " length " + section.getLength());
+ sectionStart += section.getLength();
+ }
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ StringBuilder buf = new StringBuilder();
+ buf.append(" Encoding column ");
+ buf.append(i);
+ buf.append(": ");
+ buf.append(encoding.getKind());
+ if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+ encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ buf.append("[");
+ buf.append(encoding.getDictionarySize());
+ buf.append("]");
+ }
+ System.out.println(buf);
+ }
+ if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
+ // mark the specified columns as sarg columns; bloom filters are read only
+ // for these included columns
+ boolean[] sargColumns = new boolean[colCount];
+ for (int colIdx : rowIndexCols) {
+ sargColumns[colIdx] = true;
+ }
+ OrcIndex indices = rows
+ .readRowIndex(stripeIx, null, null, null, sargColumns);
+ for (int col : rowIndexCols) {
+ StringBuilder buf = new StringBuilder();
+ String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
+ buf.append(rowIdxString);
+ String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
+ buf.append(bloomFilString);
+ System.out.println(buf);
+ }
+ }
+ }
+
+ FileSystem fs = file.getFileSystem(conf);
+ long fileLen = fs.getFileStatus(file).getLen();
+ long paddedBytes = getTotalPaddingSize(reader);
+ // an empty ORC file is ~45 bytes, so the assumption here is that the file length is always > 0
+ double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
+ DecimalFormat format = new DecimalFormat("##.##");
+ System.out.println("\nFile length: " + fileLen + " bytes");
+ System.out.println("Padding length: " + paddedBytes + " bytes");
+ System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
+ AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
+ if (acidStats != null) {
+ System.out.println("ACID stats:" + acidStats);
+ }
+ rows.close();
+ }
+
+ private static void recoverFiles(final List<String> corruptFiles, final Configuration conf,
+ final String backup)
+ throws IOException {
+ for (String corruptFile : corruptFiles) {
+ System.err.println("Recovering file " + corruptFile);
+ Path corruptPath = new Path(corruptFile);
+ FileSystem fs = corruptPath.getFileSystem(conf);
+ FSDataInputStream fdis = fs.open(corruptPath);
+ try {
+ long corruptFileLen = fs.getFileStatus(corruptPath).getLen();
+ long remaining = corruptFileLen;
+ List<Long> footerOffsets = Lists.newArrayList();
+
+ // start reading the data file from top to bottom and record the valid footers
+ while (remaining > 0) {
+ int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
+ byte[] data = new byte[toRead];
+ long startPos = corruptFileLen - remaining;
+ fdis.readFully(startPos, data, 0, toRead);
+
+ // find all MAGIC string and see if the file is readable from there
+ int index = 0;
+ long nextFooterOffset;
+
+ while (index != -1) {
+ index = indexOf(data, OrcFile.MAGIC.getBytes(), index + 1);
+ if (index != -1) {
+ nextFooterOffset = startPos + index + OrcFile.MAGIC.length() + 1;
+ if (isReadable(corruptPath, conf, nextFooterOffset)) {
+ footerOffsets.add(nextFooterOffset);
+ }
+ }
+ }
+
+ System.err.println("Scanning for valid footers - startPos: " + startPos +
+ " toRead: " + toRead + " remaining: " + remaining);
+ remaining = remaining - toRead;
+ }
+
+ System.err.println("Readable footerOffsets: " + footerOffsets);
+ recoverFile(corruptPath, fs, conf, footerOffsets, backup);
+ } catch (Exception e) {
+ Path recoveryFile = getRecoveryFile(corruptPath);
+ if (fs.exists(recoveryFile)) {
+ fs.delete(recoveryFile, false);
+ }
+ System.err.println("Unable to recover file " + corruptFile);
+ e.printStackTrace();
+ System.err.println(SEPARATOR);
+ continue;
+ } finally {
+ fdis.close();
+ }
+ System.err.println(corruptFile + " recovered successfully!");
+ System.err.println(SEPARATOR);
+ }
+ }
+
+ private static void recoverFile(final Path corruptPath, final FileSystem fs,
+ final Configuration conf, final List<Long> footerOffsets, final String backup)
+ throws IOException {
+
+ // first recover the file to .recovered file and then once successful rename it to actual file
+ Path recoveredPath = getRecoveryFile(corruptPath);
+
+ // make sure that file does not exist
+ if (fs.exists(recoveredPath)) {
+ fs.delete(recoveredPath, false);
+ }
+
+ // if there are no valid footers, create an empty ORC file so that the path is still readable
+ if (footerOffsets == null || footerOffsets.isEmpty()) {
+ System.err.println("No readable footers found. Creating empty orc file.");
+ TypeDescription schema = TypeDescription.createStruct();
+ Writer writer = OrcFile.createWriter(recoveredPath,
+ OrcFile.writerOptions(conf).setSchema(schema));
+ writer.close();
+ } else {
+ FSDataInputStream fdis = fs.open(corruptPath);
+ FileStatus fileStatus = fs.getFileStatus(corruptPath);
+ // read corrupt file and copy it to recovered file until last valid footer
+ FSDataOutputStream fdos = fs.create(recoveredPath, true,
+ conf.getInt("io.file.buffer.size", 4096),
+ fileStatus.getReplication(),
+ fileStatus.getBlockSize());
+ try {
+ long fileLen = footerOffsets.get(footerOffsets.size() - 1);
+ long remaining = fileLen;
+
+ while (remaining > 0) {
+ int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
+ byte[] data = new byte[toRead];
+ long startPos = fileLen - remaining;
+ fdis.readFully(startPos, data, 0, toRead);
+ fdos.write(data);
+ System.err.println("Copying data to recovery file - startPos: " + startPos +
+ " toRead: " + toRead + " remaining: " + remaining);
+ remaining = remaining - toRead;
+ }
+ } catch (Exception e) {
+ fs.delete(recoveredPath, false);
+ throw new IOException(e);
+ } finally {
+ fdis.close();
+ fdos.close();
+ }
+ }
+
+ // validate the recovered file once again and start moving corrupt files to backup folder
+ if (isReadable(recoveredPath, conf, Long.MAX_VALUE)) {
+ Path backupDataPath;
+ String scheme = corruptPath.toUri().getScheme();
+ String authority = corruptPath.toUri().getAuthority();
+ String filePath = corruptPath.toUri().getPath();
+
+ // use the same filesystem as corrupt file if backup-path is not explicitly specified
+ if (backup.equals(DEFAULT_BACKUP_PATH)) {
+ backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
+ } else {
+ backupDataPath = Path.mergePaths(new Path(backup), corruptPath);
+ }
+
+ // Move data file to backup path
+ moveFiles(fs, corruptPath, backupDataPath);
+
+ // Move side file to backup path
+ Path sideFilePath = OrcAcidUtils.getSideFile(corruptPath);
+ Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName());
+ moveFiles(fs, sideFilePath, backupSideFilePath);
+
+ // finally move recovered file to actual file
+ moveFiles(fs, recoveredPath, corruptPath);
+
+ // we are done recovering, backing up and validating
+ System.err.println("Validation of recovered file successful!");
+ }
+ }
+
+ private static void moveFiles(final FileSystem fs, final Path src, final Path dest)
+ throws IOException {
+ try {
+      // create the destination directory if it does not exist
+ if (!fs.exists(dest.getParent())) {
+ fs.mkdirs(dest.getParent());
+ }
+
+      // if the destination file already exists for some reason, delete it
+ fs.delete(dest, false);
+
+ if (fs.rename(src, dest)) {
+ System.err.println("Moved " + src + " to " + dest);
+ } else {
+ throw new IOException("Unable to move " + src + " to " + dest);
+ }
+
+ } catch (Exception e) {
+ throw new IOException("Unable to move " + src + " to " + dest, e);
+ }
+ }
+
+ private static Path getRecoveryFile(final Path corruptPath) {
+ return new Path(corruptPath.getParent(), corruptPath.getName() + ".recovered");
+ }
+
+ private static boolean isReadable(final Path corruptPath, final Configuration conf,
+ final long maxLen) {
+ try {
+ OrcFile.createReader(corruptPath, OrcFile.readerOptions(conf).maxLength(maxLen));
+ return true;
+ } catch (Exception e) {
+      // ignore the exception; the file is not readable up to maxLen
+ return false;
+ }
+ }
+
+  // naive search for a byte pattern within a byte array, starting at the given offset;
+  // after a mismatch the scan restarts at the next byte so that overlapping prefixes
+  // (e.g. "ORC" inside "OORC") are not skipped
+  private static int indexOf(final byte[] data, final byte[] pattern, final int index) {
+    if (data == null || data.length == 0 || pattern == null || pattern.length == 0 ||
+        index > data.length || index < 0) {
+      return -1;
+    }
+
+    for (int i = index; i <= data.length - pattern.length; i++) {
+      int j = 0;
+      while (j < pattern.length && data[i + j] == pattern[j]) {
+        j++;
+      }
+      if (j == pattern.length) {
+        return i;
+      }
+    }
+
+    return -1;
+  }
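
A reset-only matcher (one that clears j on mismatch without re-testing the current byte)
would miss magic strings whose first byte repeats, e.g. "ORC" inside "OORC". A minimal,
self-contained sketch of the restart-based scan used above (illustrative only, not part of
the patch):

  public class MagicScanSketch {
    public static void main(String[] args) {
      byte[] data = "xxOORCxx".getBytes();
      byte[] magic = "ORC".getBytes();
      // same naive restart search as indexOf() above
      for (int i = 0; i <= data.length - magic.length; i++) {
        int j = 0;
        while (j < magic.length && data[i + j] == magic[j]) {
          j++;
        }
        if (j == magic.length) {
          System.out.println("magic at offset " + i);  // prints: magic at offset 3
        }
      }
    }
  }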
+
+ private static String getFormattedBloomFilters(int col,
+ OrcProto.BloomFilterIndex[] bloomFilterIndex) {
+ StringBuilder buf = new StringBuilder();
+ BloomFilterIO stripeLevelBF = null;
+ if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
+ int idx = 0;
+ buf.append("\n Bloom filters for column ").append(col).append(":");
+ for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
+ BloomFilterIO toMerge = new BloomFilterIO(bf);
+ buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
+ if (stripeLevelBF == null) {
+ stripeLevelBF = toMerge;
+ } else {
+ stripeLevelBF.merge(toMerge);
+ }
+ }
+ String bloomFilterStats = getBloomFilterStats(stripeLevelBF);
+ buf.append("\n Stripe level merge:").append(bloomFilterStats);
+ }
+ return buf.toString();
+ }
+
+ private static String getBloomFilterStats(BloomFilterIO bf) {
+ StringBuilder sb = new StringBuilder();
+ int bitCount = bf.getBitSize();
+ int popCount = 0;
+ for (long l : bf.getBitSet()) {
+ popCount += Long.bitCount(l);
+ }
+ int k = bf.getNumHashFunctions();
+ float loadFactor = (float) popCount / (float) bitCount;
+ float expectedFpp = (float) Math.pow(loadFactor, k);
+ DecimalFormat df = new DecimalFormat("###.####");
+ sb.append(" numHashFunctions: ").append(k);
+ sb.append(" bitCount: ").append(bitCount);
+ sb.append(" popCount: ").append(popCount);
+ sb.append(" loadFactor: ").append(df.format(loadFactor));
+ sb.append(" expectedFpp: ").append(expectedFpp);
+ return sb.toString();
+ }
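
To make the statistics concrete (numbers illustrative): with bitCount = 1024, popCount = 512
and numHashFunctions k = 4, loadFactor = 512/1024 = 0.5 and expectedFpp = 0.5^4 = 0.0625,
i.e. a half-full filter probed with four hash functions is expected to report roughly 6.25%
false positives.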
+
+ private static String getFormattedRowIndices(int col,
+ OrcProto.RowIndex[] rowGroupIndex) {
+ StringBuilder buf = new StringBuilder();
+ OrcProto.RowIndex index;
+ buf.append(" Row group indices for column ").append(col).append(":");
+ if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
+ ((index = rowGroupIndex[col]) == null)) {
+ buf.append(" not found\n");
+ return buf.toString();
+ }
+
+ for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
+ buf.append("\n Entry ").append(entryIx).append(": ");
+ OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
+ if (entry == null) {
+ buf.append("unknown\n");
+ continue;
+ }
+ OrcProto.ColumnStatistics colStats = entry.getStatistics();
+ if (colStats == null) {
+ buf.append("no stats at ");
+ } else {
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
+ buf.append(cs.toString());
+ }
+ buf.append(" positions: ");
+ for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
+ if (posIx != 0) {
+ buf.append(",");
+ }
+ buf.append(entry.getPositions(posIx));
+ }
+ }
+ return buf.toString();
+ }
+
+ public static long getTotalPaddingSize(Reader reader) throws IOException {
+ long paddedBytes = 0;
+ List<StripeInformation> stripes = reader.getStripes();
+ for (int i = 1; i < stripes.size(); i++) {
+ long prevStripeOffset = stripes.get(i - 1).getOffset();
+ long prevStripeLen = stripes.get(i - 1).getLength();
+ paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
+ }
+ return paddedBytes;
+ }
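
As a worked example of the padding computation (offsets invented for illustration): if
stripe 0 has offset 3 and length 200000 it ends at byte 200003, and if stripe 1 then starts
at offset 262144 (a block boundary), the gap of 262144 - 200003 = 62141 bytes is counted as
padding.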
+
+ static Options createOptions() {
+ Options result = new Options();
+
+ // add -d and --data to print the rows
+ result.addOption(OptionBuilder
+ .withLongOpt("data")
+ .withDescription("Should the data be printed")
+ .create('d'));
+
+    // printing of the writer's time zone is optional so that file dump unit tests
+    // do not break when run in different time zones
+ result.addOption(OptionBuilder
+ .withLongOpt("timezone")
+ .withDescription("Print writer's time zone")
+ .create('t'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("help")
+ .withDescription("print help message")
+ .create('h'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("rowindex")
+ .withArgName("comma separated list of column ids for which row index should be printed")
+ .withDescription("Dump stats for column number(s)")
+ .hasArg()
+ .create('r'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("json")
+ .withDescription("Print metadata in JSON format")
+ .create('j'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("pretty")
+ .withDescription("Pretty print json metadata output")
+ .create('p'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("recover")
+ .withDescription("recover corrupted orc files generated by streaming")
+ .create());
+
+ result.addOption(OptionBuilder
+ .withLongOpt("skip-dump")
+ .withDescription("used along with --recover to directly recover files without dumping")
+ .create());
+
+ result.addOption(OptionBuilder
+ .withLongOpt("backup-path")
+ .withDescription("specify a backup path to store the corrupted files (default: /tmp)")
+ .hasArg()
+ .create());
+ return result;
+ }
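
Assuming the tool keeps its usual hive --orcfiledump entry point, typical invocations
combining these flags would look like the following (paths illustrative):

  hive --orcfiledump -j -p /warehouse/t1/part-00000      (pretty-printed JSON metadata)
  hive --orcfiledump -d /warehouse/t1/part-00000         (dump rows as JSON)
  hive --orcfiledump --recover --skip-dump /warehouse/t1/delta_1_1/bucket_00000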
+
+ private static void printMap(JSONWriter writer,
+ MapColumnVector vector,
+ TypeDescription schema,
+ int row) throws JSONException {
+ writer.array();
+ TypeDescription keyType = schema.getChildren().get(0);
+ TypeDescription valueType = schema.getChildren().get(1);
+ int offset = (int) vector.offsets[row];
+ for (int i = 0; i < vector.lengths[row]; ++i) {
+ writer.object();
+ writer.key("_key");
+ printValue(writer, vector.keys, keyType, offset + i);
+ writer.key("_value");
+ printValue(writer, vector.values, valueType, offset + i);
+ writer.endObject();
+ }
+ writer.endArray();
+ }
+
+ private static void printList(JSONWriter writer,
+ ListColumnVector vector,
+ TypeDescription schema,
+ int row) throws JSONException {
+ writer.array();
+ int offset = (int) vector.offsets[row];
+ TypeDescription childType = schema.getChildren().get(0);
+ for (int i = 0; i < vector.lengths[row]; ++i) {
+ printValue(writer, vector.child, childType, offset + i);
+ }
+ writer.endArray();
+ }
+
+ private static void printUnion(JSONWriter writer,
+ UnionColumnVector vector,
+ TypeDescription schema,
+ int row) throws JSONException {
+ int tag = vector.tags[row];
+ printValue(writer, vector.fields[tag], schema.getChildren().get(tag), row);
+ }
+
+ static void printStruct(JSONWriter writer,
+ StructColumnVector batch,
+ TypeDescription schema,
+ int row) throws JSONException {
+ writer.object();
+ List<String> fieldNames = schema.getFieldNames();
+ List<TypeDescription> fieldTypes = schema.getChildren();
+ for (int i = 0; i < fieldTypes.size(); ++i) {
+ writer.key(fieldNames.get(i));
+ printValue(writer, batch.fields[i], fieldTypes.get(i), row);
+ }
+ writer.endObject();
+ }
+
+ static void printBinary(JSONWriter writer, BytesColumnVector vector,
+ int row) throws JSONException {
+ writer.array();
+ int offset = vector.start[row];
+ for(int i=0; i < vector.length[row]; ++i) {
+ writer.value(0xff & (int) vector.vector[row][offset + i]);
+ }
+ writer.endArray();
+ }
+
+  static void printValue(JSONWriter writer, ColumnVector vector,
+ TypeDescription schema, int row) throws JSONException {
+ if (vector.isRepeating) {
+ row = 0;
+ }
+ if (vector.noNulls || !vector.isNull[row]) {
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ writer.value(((LongColumnVector) vector).vector[row] != 0);
+ break;
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ writer.value(((LongColumnVector) vector).vector[row]);
+ break;
+ case FLOAT:
+ case DOUBLE:
+ writer.value(((DoubleColumnVector) vector).vector[row]);
+ break;
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ writer.value(((BytesColumnVector) vector).toString(row));
+ break;
+ case BINARY:
+ printBinary(writer, (BytesColumnVector) vector, row);
+ break;
+ case DECIMAL:
+ writer.value(((DecimalColumnVector) vector).vector[row].toString());
+ break;
+ case DATE:
+ writer.value(new DateWritable(
+ (int) ((LongColumnVector) vector).vector[row]).toString());
+ break;
+ case TIMESTAMP:
+ writer.value(((TimestampColumnVector) vector)
+ .asScratchTimestamp(row).toString());
+ break;
+ case LIST:
+ printList(writer, (ListColumnVector) vector, schema, row);
+ break;
+ case MAP:
+ printMap(writer, (MapColumnVector) vector, schema, row);
+ break;
+ case STRUCT:
+ printStruct(writer, (StructColumnVector) vector, schema, row);
+ break;
+ case UNION:
+ printUnion(writer, (UnionColumnVector) vector, schema, row);
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown type " +
+ schema.toString());
+ }
+ } else {
+ writer.value(null);
+ }
+ }
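
In summary, printValue maps ORC categories onto JSON as follows: BOOLEAN becomes a JSON
boolean, the integer and floating point categories become numbers, STRING/CHAR/VARCHAR
become strings, BINARY becomes an array of unsigned byte values, and DECIMAL, DATE and
TIMESTAMP are rendered through their string forms; for example, a DATE stored as day
number 15 prints as "1970-01-16".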
+
+ static void printRow(JSONWriter writer,
+ VectorizedRowBatch batch,
+ TypeDescription schema,
+ int row) throws JSONException {
+ if (schema.getCategory() == TypeDescription.Category.STRUCT) {
+ List<TypeDescription> fieldTypes = schema.getChildren();
+ List<String> fieldNames = schema.getFieldNames();
+ writer.object();
+ for (int c = 0; c < batch.cols.length; ++c) {
+ writer.key(fieldNames.get(c));
+ printValue(writer, batch.cols[c], fieldTypes.get(c), row);
+ }
+ writer.endObject();
+ } else {
+ printValue(writer, batch.cols[0], schema, row);
+ }
+ }
+
+ static void printJsonData(final Reader reader) throws IOException, JSONException {
+ PrintStream printStream = System.out;
+ OutputStreamWriter out = new OutputStreamWriter(printStream, "UTF-8");
+ RecordReader rows = reader.rows();
+ try {
+ TypeDescription schema = reader.getSchema();
+ VectorizedRowBatch batch = schema.createRowBatch();
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ JSONWriter writer = new JSONWriter(out);
+ printRow(writer, batch, schema, r);
+ out.write("\n");
+ out.flush();
+ if (printStream.checkError()) {
+ throw new IOException("Error encountered when writing to stdout.");
+ }
+ }
+ }
+ } finally {
+ rows.close();
+ }
+ }
+}
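
For a file with schema struct<x:int,y:string>, printJsonData above emits one JSON object
per row, one object per line (values illustrative):

  {"x":1,"y":"hello"}
  {"x":2,"y":null}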
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/tools/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/tools/JsonFileDump.java b/orc/src/java/org/apache/orc/tools/JsonFileDump.java
new file mode 100644
index 0000000..75153a2
--- /dev/null
+++ b/orc/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -0,0 +1,406 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.Reader;
+import org.apache.orc.impl.AcidStats;
+import org.apache.orc.impl.OrcAcidUtils;
+import org.apache.orc.impl.RecordReaderImpl;
+import org.codehaus.jettison.json.JSONArray;
+import org.apache.orc.BloomFilterIO;
+import org.apache.orc.BinaryColumnStatistics;
+import org.apache.orc.BooleanColumnStatistics;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.impl.ColumnStatisticsImpl;
+import org.apache.orc.DateColumnStatistics;
+import org.apache.orc.DecimalColumnStatistics;
+import org.apache.orc.DoubleColumnStatistics;
+import org.apache.orc.IntegerColumnStatistics;
+import org.apache.orc.impl.OrcIndex;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StringColumnStatistics;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.StripeStatistics;
+import org.apache.orc.TimestampColumnStatistics;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONObject;
+import org.codehaus.jettison.json.JSONStringer;
+import org.codehaus.jettison.json.JSONWriter;
+
+/**
+ * File dump tool with json formatted output.
+ */
+public class JsonFileDump {
+
+ public static void printJsonMetaData(List<String> files,
+ Configuration conf,
+ List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
+ throws JSONException, IOException {
+ if (files.isEmpty()) {
+ return;
+ }
+ JSONStringer writer = new JSONStringer();
+ boolean multiFile = files.size() > 1;
+ if (multiFile) {
+ writer.array();
+ } else {
+ writer.object();
+ }
+ for (String filename : files) {
+ try {
+ if (multiFile) {
+ writer.object();
+ }
+ writer.key("fileName").value(filename);
+ Path path = new Path(filename);
+ Reader reader = FileDump.getReader(path, conf, null);
+ if (reader == null) {
+ writer.key("status").value("FAILED");
+ continue;
+ }
+ writer.key("fileVersion").value(reader.getFileVersion().getName());
+ writer.key("writerVersion").value(reader.getWriterVersion());
+ RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+ writer.key("numberOfRows").value(reader.getNumberOfRows());
+ writer.key("compression").value(reader.getCompressionKind());
+ if (reader.getCompressionKind() != CompressionKind.NONE) {
+ writer.key("compressionBufferSize").value(reader.getCompressionSize());
+ }
+ writer.key("schemaString").value(reader.getSchema().toString());
+ writer.key("schema").array();
+ writeSchema(writer, reader.getTypes());
+ writer.endArray();
+
+ writer.key("stripeStatistics").array();
+ List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
+ for (int n = 0; n < stripeStatistics.size(); n++) {
+ writer.object();
+ writer.key("stripeNumber").value(n + 1);
+ StripeStatistics ss = stripeStatistics.get(n);
+ writer.key("columnStatistics").array();
+ for (int i = 0; i < ss.getColumnStatistics().length; i++) {
+ writer.object();
+ writer.key("columnId").value(i);
+ writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
+ writer.endObject();
+ }
+ writer.endArray();
+ writer.endObject();
+ }
+ writer.endArray();
+
+ ColumnStatistics[] stats = reader.getStatistics();
+ int colCount = stats.length;
+ writer.key("fileStatistics").array();
+ for (int i = 0; i < stats.length; ++i) {
+ writer.object();
+ writer.key("columnId").value(i);
+ writeColumnStatistics(writer, stats[i]);
+ writer.endObject();
+ }
+ writer.endArray();
+
+ writer.key("stripes").array();
+ int stripeIx = -1;
+ for (StripeInformation stripe : reader.getStripes()) {
+ ++stripeIx;
+ long stripeStart = stripe.getOffset();
+ OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+ writer.object(); // start of stripe information
+ writer.key("stripeNumber").value(stripeIx + 1);
+ writer.key("stripeInformation");
+ writeStripeInformation(writer, stripe);
+ if (printTimeZone) {
+ writer.key("writerTimezone").value(
+ footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN);
+ }
+ long sectionStart = stripeStart;
+
+ writer.key("streams").array();
+ for (OrcProto.Stream section : footer.getStreamsList()) {
+ writer.object();
+ String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN;
+ writer.key("columnId").value(section.getColumn());
+ writer.key("section").value(kind);
+ writer.key("startOffset").value(sectionStart);
+ writer.key("length").value(section.getLength());
+ sectionStart += section.getLength();
+ writer.endObject();
+ }
+ writer.endArray();
+
+ writer.key("encodings").array();
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ writer.object();
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ writer.key("columnId").value(i);
+ writer.key("kind").value(encoding.getKind());
+ if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+ encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ writer.key("dictionarySize").value(encoding.getDictionarySize());
+ }
+ writer.endObject();
+ }
+ writer.endArray();
+
+ if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
+          // read indexes only for the specified columns; bloom filters are read
+          // only for the columns marked in sargColumns
+ boolean[] sargColumns = new boolean[colCount];
+ for (int colIdx : rowIndexCols) {
+ sargColumns[colIdx] = true;
+ }
+ OrcIndex indices = rows.readRowIndex(stripeIx, null, sargColumns);
+ writer.key("indexes").array();
+ for (int col : rowIndexCols) {
+ writer.object();
+ writer.key("columnId").value(col);
+ writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
+ writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
+ writer.endObject();
+ }
+ writer.endArray();
+ }
+ writer.endObject(); // end of stripe information
+ }
+ writer.endArray();
+
+ FileSystem fs = path.getFileSystem(conf);
+ long fileLen = fs.getContentSummary(path).getLength();
+ long paddedBytes = FileDump.getTotalPaddingSize(reader);
+        // an empty ORC file is ~45 bytes, so the file length is assumed to always be > 0
+ double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
+ writer.key("fileLength").value(fileLen);
+ writer.key("paddingLength").value(paddedBytes);
+ writer.key("paddingRatio").value(percentPadding);
+ AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
+ if (acidStats != null) {
+ writer.key("numInserts").value(acidStats.inserts);
+ writer.key("numDeletes").value(acidStats.deletes);
+ writer.key("numUpdates").value(acidStats.updates);
+ }
+ writer.key("status").value("OK");
+ rows.close();
+
+ writer.endObject();
+ } catch (Exception e) {
+ writer.key("status").value("FAILED");
+ throw e;
+ }
+ }
+ if (multiFile) {
+ writer.endArray();
+ }
+
+ if (prettyPrint) {
+ final String prettyJson;
+ if (multiFile) {
+ JSONArray jsonArray = new JSONArray(writer.toString());
+ prettyJson = jsonArray.toString(2);
+ } else {
+ JSONObject jsonObject = new JSONObject(writer.toString());
+ prettyJson = jsonObject.toString(2);
+ }
+ System.out.println(prettyJson);
+ } else {
+ System.out.println(writer.toString());
+ }
+ }
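
Abridged, the JSON document printJsonMetaData produces for a single file has roughly the
following shape (values illustrative):

  {"fileName": "/warehouse/t1.orc", "fileVersion": "0.12", "writerVersion": "HIVE_4243",
   "numberOfRows": 20000, "compression": "ZLIB", "compressionBufferSize": 262144,
   "schemaString": "struct<...>", "schema": [...], "stripeStatistics": [...],
   "fileStatistics": [...], "stripes": [...], "fileLength": 273300,
   "paddingLength": 0, "paddingRatio": 0, "status": "OK"}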
+
+ private static void writeSchema(JSONStringer writer, List<OrcProto.Type> types)
+ throws JSONException {
+ int i = 0;
+ for(OrcProto.Type type : types) {
+ writer.object();
+ writer.key("columnId").value(i++);
+ writer.key("columnType").value(type.getKind());
+ if (type.getFieldNamesCount() > 0) {
+ writer.key("childColumnNames").array();
+ for (String field : type.getFieldNamesList()) {
+ writer.value(field);
+ }
+ writer.endArray();
+ writer.key("childColumnIds").array();
+ for (Integer colId : type.getSubtypesList()) {
+ writer.value(colId);
+ }
+ writer.endArray();
+ }
+ if (type.hasPrecision()) {
+ writer.key("precision").value(type.getPrecision());
+ }
+
+ if (type.hasScale()) {
+ writer.key("scale").value(type.getScale());
+ }
+
+ if (type.hasMaximumLength()) {
+ writer.key("maxLength").value(type.getMaximumLength());
+ }
+ writer.endObject();
+ }
+ }
+
+ private static void writeStripeInformation(JSONWriter writer, StripeInformation stripe)
+ throws JSONException {
+ writer.object();
+ writer.key("offset").value(stripe.getOffset());
+ writer.key("indexLength").value(stripe.getIndexLength());
+ writer.key("dataLength").value(stripe.getDataLength());
+ writer.key("footerLength").value(stripe.getFooterLength());
+ writer.key("rowCount").value(stripe.getNumberOfRows());
+ writer.endObject();
+ }
+
+ private static void writeColumnStatistics(JSONWriter writer, ColumnStatistics cs)
+ throws JSONException {
+ if (cs != null) {
+ writer.key("count").value(cs.getNumberOfValues());
+ writer.key("hasNull").value(cs.hasNull());
+ if (cs instanceof BinaryColumnStatistics) {
+ writer.key("totalLength").value(((BinaryColumnStatistics) cs).getSum());
+ writer.key("type").value(OrcProto.Type.Kind.BINARY);
+ } else if (cs instanceof BooleanColumnStatistics) {
+ writer.key("trueCount").value(((BooleanColumnStatistics) cs).getTrueCount());
+ writer.key("falseCount").value(((BooleanColumnStatistics) cs).getFalseCount());
+ writer.key("type").value(OrcProto.Type.Kind.BOOLEAN);
+ } else if (cs instanceof IntegerColumnStatistics) {
+ writer.key("min").value(((IntegerColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((IntegerColumnStatistics) cs).getMaximum());
+ if (((IntegerColumnStatistics) cs).isSumDefined()) {
+ writer.key("sum").value(((IntegerColumnStatistics) cs).getSum());
+ }
+ writer.key("type").value(OrcProto.Type.Kind.LONG);
+ } else if (cs instanceof DoubleColumnStatistics) {
+ writer.key("min").value(((DoubleColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((DoubleColumnStatistics) cs).getMaximum());
+ writer.key("sum").value(((DoubleColumnStatistics) cs).getSum());
+ writer.key("type").value(OrcProto.Type.Kind.DOUBLE);
+ } else if (cs instanceof StringColumnStatistics) {
+ writer.key("min").value(((StringColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((StringColumnStatistics) cs).getMaximum());
+ writer.key("totalLength").value(((StringColumnStatistics) cs).getSum());
+ writer.key("type").value(OrcProto.Type.Kind.STRING);
+ } else if (cs instanceof DateColumnStatistics) {
+ if (((DateColumnStatistics) cs).getMaximum() != null) {
+ writer.key("min").value(((DateColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((DateColumnStatistics) cs).getMaximum());
+ }
+ writer.key("type").value(OrcProto.Type.Kind.DATE);
+ } else if (cs instanceof TimestampColumnStatistics) {
+ if (((TimestampColumnStatistics) cs).getMaximum() != null) {
+ writer.key("min").value(((TimestampColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((TimestampColumnStatistics) cs).getMaximum());
+ }
+ writer.key("type").value(OrcProto.Type.Kind.TIMESTAMP);
+ } else if (cs instanceof DecimalColumnStatistics) {
+ if (((DecimalColumnStatistics) cs).getMaximum() != null) {
+ writer.key("min").value(((DecimalColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((DecimalColumnStatistics) cs).getMaximum());
+ writer.key("sum").value(((DecimalColumnStatistics) cs).getSum());
+ }
+ writer.key("type").value(OrcProto.Type.Kind.DECIMAL);
+ }
+ }
+ }
+
+ private static void writeBloomFilterIndexes(JSONWriter writer, int col,
+ OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException {
+
+ BloomFilterIO stripeLevelBF = null;
+ if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
+ int entryIx = 0;
+ writer.key("bloomFilterIndexes").array();
+ for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
+ writer.object();
+ writer.key("entryId").value(entryIx++);
+ BloomFilterIO toMerge = new BloomFilterIO(bf);
+ writeBloomFilterStats(writer, toMerge);
+ if (stripeLevelBF == null) {
+ stripeLevelBF = toMerge;
+ } else {
+ stripeLevelBF.merge(toMerge);
+ }
+ writer.endObject();
+ }
+ writer.endArray();
+ }
+ if (stripeLevelBF != null) {
+ writer.key("stripeLevelBloomFilter");
+ writer.object();
+ writeBloomFilterStats(writer, stripeLevelBF);
+ writer.endObject();
+ }
+ }
+
+ private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf)
+ throws JSONException {
+ int bitCount = bf.getBitSize();
+ int popCount = 0;
+ for (long l : bf.getBitSet()) {
+ popCount += Long.bitCount(l);
+ }
+ int k = bf.getNumHashFunctions();
+ float loadFactor = (float) popCount / (float) bitCount;
+ float expectedFpp = (float) Math.pow(loadFactor, k);
+ writer.key("numHashFunctions").value(k);
+ writer.key("bitCount").value(bitCount);
+ writer.key("popCount").value(popCount);
+ writer.key("loadFactor").value(loadFactor);
+ writer.key("expectedFpp").value(expectedFpp);
+ }
+
+ private static void writeRowGroupIndexes(JSONWriter writer, int col,
+ OrcProto.RowIndex[] rowGroupIndex)
+ throws JSONException {
+
+ OrcProto.RowIndex index;
+ if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
+ ((index = rowGroupIndex[col]) == null)) {
+ return;
+ }
+
+ writer.key("rowGroupIndexes").array();
+ for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
+ writer.object();
+ writer.key("entryId").value(entryIx);
+ OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
+ if (entry == null) {
+ continue;
+ }
+ OrcProto.ColumnStatistics colStats = entry.getStatistics();
+ writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize(colStats));
+ writer.key("positions").array();
+ for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
+ writer.value(entry.getPositions(posIx));
+ }
+ writer.endArray();
+ writer.endObject();
+ }
+ writer.endArray();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestColumnStatistics.java b/orc/src/test/org/apache/orc/TestColumnStatistics.java
new file mode 100644
index 0000000..1837dbb
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestColumnStatistics.java
@@ -0,0 +1,364 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import static junit.framework.Assert.assertEquals;
+import static org.junit.Assume.assumeTrue;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.PrintStream;
+import java.sql.Timestamp;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.orc.impl.ColumnStatisticsImpl;
+import org.apache.orc.tools.FileDump;
+import org.apache.orc.tools.TestFileDump;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+/**
+ * Test ColumnStatisticsImpl for ORC.
+ */
+public class TestColumnStatistics {
+
+ @Test
+ public void testLongMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateInteger(10, 2);
+ stats2.updateInteger(1, 1);
+ stats2.updateInteger(1000, 1);
+ stats1.merge(stats2);
+ IntegerColumnStatistics typed = (IntegerColumnStatistics) stats1;
+ assertEquals(1, typed.getMinimum());
+ assertEquals(1000, typed.getMaximum());
+ stats1.reset();
+ stats1.updateInteger(-10, 1);
+ stats1.updateInteger(10000, 1);
+ stats1.merge(stats2);
+ assertEquals(-10, typed.getMinimum());
+ assertEquals(10000, typed.getMaximum());
+ }
+
+ @Test
+ public void testDoubleMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createDouble();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateDouble(10.0);
+ stats1.updateDouble(100.0);
+ stats2.updateDouble(1.0);
+ stats2.updateDouble(1000.0);
+ stats1.merge(stats2);
+ DoubleColumnStatistics typed = (DoubleColumnStatistics) stats1;
+ assertEquals(1.0, typed.getMinimum(), 0.001);
+ assertEquals(1000.0, typed.getMaximum(), 0.001);
+ stats1.reset();
+ stats1.updateDouble(-10);
+ stats1.updateDouble(10000);
+ stats1.merge(stats2);
+ assertEquals(-10, typed.getMinimum(), 0.001);
+ assertEquals(10000, typed.getMaximum(), 0.001);
+ }
+
+
+ @Test
+ public void testStringMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateString(new Text("bob"));
+ stats1.updateString(new Text("david"));
+ stats1.updateString(new Text("charles"));
+ stats2.updateString(new Text("anne"));
+ byte[] erin = new byte[]{0, 1, 2, 3, 4, 5, 101, 114, 105, 110};
+ stats2.updateString(erin, 6, 4, 5);
+ assertEquals(24, ((StringColumnStatistics)stats2).getSum());
+ stats1.merge(stats2);
+ StringColumnStatistics typed = (StringColumnStatistics) stats1;
+ assertEquals("anne", typed.getMinimum());
+ assertEquals("erin", typed.getMaximum());
+ assertEquals(39, typed.getSum());
+ stats1.reset();
+ stats1.updateString(new Text("aaa"));
+ stats1.updateString(new Text("zzz"));
+ stats1.merge(stats2);
+ assertEquals("aaa", typed.getMinimum());
+ assertEquals("zzz", typed.getMaximum());
+ }
+
+ @Test
+ public void testDateMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createDate();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateDate(new DateWritable(1000));
+ stats1.updateDate(new DateWritable(100));
+ stats2.updateDate(new DateWritable(10));
+ stats2.updateDate(new DateWritable(2000));
+ stats1.merge(stats2);
+ DateColumnStatistics typed = (DateColumnStatistics) stats1;
+ assertEquals(new DateWritable(10).get(), typed.getMinimum());
+ assertEquals(new DateWritable(2000).get(), typed.getMaximum());
+ stats1.reset();
+ stats1.updateDate(new DateWritable(-10));
+ stats1.updateDate(new DateWritable(10000));
+ stats1.merge(stats2);
+ assertEquals(new DateWritable(-10).get(), typed.getMinimum());
+ assertEquals(new DateWritable(10000).get(), typed.getMaximum());
+ }
+
+ @Test
+ public void testTimestampMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateTimestamp(new Timestamp(10));
+ stats1.updateTimestamp(new Timestamp(100));
+ stats2.updateTimestamp(new Timestamp(1));
+ stats2.updateTimestamp(new Timestamp(1000));
+ stats1.merge(stats2);
+ TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
+ assertEquals(1, typed.getMinimum().getTime());
+ assertEquals(1000, typed.getMaximum().getTime());
+ stats1.reset();
+ stats1.updateTimestamp(new Timestamp(-10));
+ stats1.updateTimestamp(new Timestamp(10000));
+ stats1.merge(stats2);
+ assertEquals(-10, typed.getMinimum().getTime());
+ assertEquals(10000, typed.getMaximum().getTime());
+ }
+
+ @Test
+ public void testDecimalMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createDecimal()
+ .withPrecision(38).withScale(16);
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateDecimal(HiveDecimal.create(10));
+ stats1.updateDecimal(HiveDecimal.create(100));
+ stats2.updateDecimal(HiveDecimal.create(1));
+ stats2.updateDecimal(HiveDecimal.create(1000));
+ stats1.merge(stats2);
+ DecimalColumnStatistics typed = (DecimalColumnStatistics) stats1;
+ assertEquals(1, typed.getMinimum().longValue());
+ assertEquals(1000, typed.getMaximum().longValue());
+ stats1.reset();
+ stats1.updateDecimal(HiveDecimal.create(-10));
+ stats1.updateDecimal(HiveDecimal.create(10000));
+ stats1.merge(stats2);
+ assertEquals(-10, typed.getMinimum().longValue());
+ assertEquals(10000, typed.getMaximum().longValue());
+ }
+
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ fs.setWorkingDirectory(workDir);
+ testFilePath = new Path("TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ private static BytesWritable bytes(int... items) {
+ BytesWritable result = new BytesWritable();
+ result.setSize(items.length);
+ for (int i = 0; i < items.length; ++i) {
+ result.getBytes()[i] = (byte) items[i];
+ }
+ return result;
+ }
+
+ void appendRow(VectorizedRowBatch batch, BytesWritable bytes,
+ String str) {
+ int row = batch.size++;
+ if (bytes == null) {
+ batch.cols[0].noNulls = false;
+ batch.cols[0].isNull[row] = true;
+ } else {
+ ((BytesColumnVector) batch.cols[0]).setVal(row, bytes.getBytes(),
+ 0, bytes.getLength());
+ }
+ if (str == null) {
+ batch.cols[1].noNulls = false;
+ batch.cols[1].isNull[row] = true;
+ } else {
+ ((BytesColumnVector) batch.cols[1]).setVal(row, str.getBytes());
+ }
+ }
+
+ @Test
+ public void testHasNull() throws Exception {
+ TypeDescription schema =
+ TypeDescription.createStruct()
+ .addField("bytes1", TypeDescription.createBinary())
+ .addField("string1", TypeDescription.createString());
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .rowIndexStride(1000)
+ .stripeSize(10000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch(5000);
+ // STRIPE 1
+ // RG1
+ for(int i=0; i<1000; i++) {
+ appendRow(batch, bytes(1, 2, 3), "RG1");
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // RG2
+ for(int i=0; i<1000; i++) {
+ appendRow(batch, bytes(1, 2, 3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // RG3
+ for(int i=0; i<1000; i++) {
+ appendRow(batch, bytes(1, 2, 3), "RG3");
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // RG4
+ for (int i = 0; i < 1000; i++) {
+ appendRow(batch, bytes(1,2,3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // RG5
+ for(int i=0; i<1000; i++) {
+ appendRow(batch, bytes(1, 2, 3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // STRIPE 2
+ for (int i = 0; i < 5000; i++) {
+ appendRow(batch, bytes(1,2,3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // STRIPE 3
+ for (int i = 0; i < 5000; i++) {
+ appendRow(batch, bytes(1,2,3), "STRIPE-3");
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // STRIPE 4
+ for (int i = 0; i < 5000; i++) {
+ appendRow(batch, bytes(1,2,3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ // check the file level stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(20000, stats[0].getNumberOfValues());
+ assertEquals(20000, stats[1].getNumberOfValues());
+ assertEquals(7000, stats[2].getNumberOfValues());
+ assertEquals(false, stats[0].hasNull());
+ assertEquals(false, stats[1].hasNull());
+ assertEquals(true, stats[2].hasNull());
+
+ // check the stripe level stats
+ List<StripeStatistics> stripeStats = reader.getStripeStatistics();
+ // stripe 1 stats
+ StripeStatistics ss1 = stripeStats.get(0);
+ ColumnStatistics ss1_cs1 = ss1.getColumnStatistics()[0];
+ ColumnStatistics ss1_cs2 = ss1.getColumnStatistics()[1];
+ ColumnStatistics ss1_cs3 = ss1.getColumnStatistics()[2];
+ assertEquals(false, ss1_cs1.hasNull());
+ assertEquals(false, ss1_cs2.hasNull());
+ assertEquals(true, ss1_cs3.hasNull());
+
+ // stripe 2 stats
+ StripeStatistics ss2 = stripeStats.get(1);
+ ColumnStatistics ss2_cs1 = ss2.getColumnStatistics()[0];
+ ColumnStatistics ss2_cs2 = ss2.getColumnStatistics()[1];
+ ColumnStatistics ss2_cs3 = ss2.getColumnStatistics()[2];
+ assertEquals(false, ss2_cs1.hasNull());
+ assertEquals(false, ss2_cs2.hasNull());
+ assertEquals(true, ss2_cs3.hasNull());
+
+ // stripe 3 stats
+ StripeStatistics ss3 = stripeStats.get(2);
+ ColumnStatistics ss3_cs1 = ss3.getColumnStatistics()[0];
+ ColumnStatistics ss3_cs2 = ss3.getColumnStatistics()[1];
+ ColumnStatistics ss3_cs3 = ss3.getColumnStatistics()[2];
+ assertEquals(false, ss3_cs1.hasNull());
+ assertEquals(false, ss3_cs2.hasNull());
+ assertEquals(false, ss3_cs3.hasNull());
+
+ // stripe 4 stats
+ StripeStatistics ss4 = stripeStats.get(3);
+ ColumnStatistics ss4_cs1 = ss4.getColumnStatistics()[0];
+ ColumnStatistics ss4_cs2 = ss4.getColumnStatistics()[1];
+ ColumnStatistics ss4_cs3 = ss4.getColumnStatistics()[2];
+ assertEquals(false, ss4_cs1.hasNull());
+ assertEquals(false, ss4_cs2.hasNull());
+ assertEquals(true, ss4_cs3.hasNull());
+
+ // Test file dump
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-has-null.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=2"});
+ System.out.flush();
+ System.setOut(origOut);
+ // If called with an expression evaluating to false, the test will halt
+ // and be ignored.
+ assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
+ TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+}
[06/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java
deleted file mode 100644
index 8731be0..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRecordReaderImpl.java
+++ /dev/null
@@ -1,1678 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static junit.framework.Assert.assertEquals;
-import static org.hamcrest.core.Is.is;
-import static org.junit.Assert.*;
-import static org.mockito.Mockito.any;
-import static org.mockito.Mockito.atLeastOnce;
-import static org.mockito.Mockito.doThrow;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.when;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PositionedReadable;
-import org.apache.hadoop.fs.Seekable;
-import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hive.common.util.HiveTestUtils;
-import org.apache.orc.BloomFilterIO;
-import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.Location;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
-import org.apache.hadoop.hive.ql.io.sarg.TestSearchArgumentImpl;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.DataReader;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.OrcProto;
-
-import org.junit.Test;
-import org.mockito.MockSettings;
-import org.mockito.Mockito;
-
-public class TestRecordReaderImpl {
-
- // can add .verboseLogging() to cause Mockito to log invocations
- private final MockSettings settings = Mockito.withSettings().verboseLogging();
-
- static class BufferInStream
- extends InputStream implements PositionedReadable, Seekable {
- private final byte[] buffer;
- private final int length;
- private int position = 0;
-
- BufferInStream(byte[] bytes, int length) {
- this.buffer = bytes;
- this.length = length;
- }
-
- @Override
- public int read() {
- if (position < length) {
- return buffer[position++];
- }
- return -1;
- }
-
- @Override
- public int read(byte[] bytes, int offset, int length) {
- int lengthToRead = Math.min(length, this.length - this.position);
- if (lengthToRead >= 0) {
- for(int i=0; i < lengthToRead; ++i) {
- bytes[offset + i] = buffer[position++];
- }
- return lengthToRead;
- } else {
- return -1;
- }
- }
-
- @Override
- public int read(long position, byte[] bytes, int offset, int length) {
- this.position = (int) position;
- return read(bytes, offset, length);
- }
-
- @Override
- public void readFully(long position, byte[] bytes, int offset,
- int length) throws IOException {
- this.position = (int) position;
- while (length > 0) {
- int result = read(bytes, offset, length);
- offset += result;
- length -= result;
- if (result < 0) {
- throw new IOException("Read past end of buffer at " + offset);
- }
- }
- }
-
- @Override
- public void readFully(long position, byte[] bytes) throws IOException {
- readFully(position, bytes, 0, bytes.length);
- }
-
- @Override
- public void seek(long position) {
- this.position = (int) position;
- }
-
- @Override
- public long getPos() {
- return position;
- }
-
- @Override
- public boolean seekToNewSource(long position) throws IOException {
- this.position = (int) position;
- return false;
- }
- }
-
- @Test
- public void testMaxLengthToReader() throws Exception {
- Configuration conf = new Configuration();
- OrcProto.Type rowType = OrcProto.Type.newBuilder()
- .setKind(OrcProto.Type.Kind.STRUCT).build();
- OrcProto.Footer footer = OrcProto.Footer.newBuilder()
- .setHeaderLength(0).setContentLength(0).setNumberOfRows(0)
- .setRowIndexStride(0).addTypes(rowType).build();
- OrcProto.PostScript ps = OrcProto.PostScript.newBuilder()
- .setCompression(OrcProto.CompressionKind.NONE)
- .setFooterLength(footer.getSerializedSize())
- .setMagic("ORC").addVersion(0).addVersion(11).build();
- DataOutputBuffer buffer = new DataOutputBuffer();
- footer.writeTo(buffer);
- ps.writeTo(buffer);
- buffer.write(ps.getSerializedSize());
- FileSystem fs = mock(FileSystem.class, settings);
- FSDataInputStream file =
- new FSDataInputStream(new BufferInStream(buffer.getData(),
- buffer.getLength()));
- Path p = new Path("/dir/file.orc");
- when(fs.open(p)).thenReturn(file);
- OrcFile.ReaderOptions options = OrcFile.readerOptions(conf);
- options.filesystem(fs);
- options.maxLength(buffer.getLength());
- when(fs.getFileStatus(p))
- .thenReturn(new FileStatus(10, false, 3, 3000, 0, p));
- Reader reader = OrcFile.createReader(p, options);
- }
-
- @Test
- public void testCompareToRangeInt() throws Exception {
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange(19L, 20L, 40L));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange(41L, 20L, 40L));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange(20L, 20L, 40L));
- assertEquals(Location.MIDDLE,
- RecordReaderImpl.compareToRange(21L, 20L, 40L));
- assertEquals(Location.MAX,
- RecordReaderImpl.compareToRange(40L, 20L, 40L));
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange(0L, 1L, 1L));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange(1L, 1L, 1L));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange(2L, 1L, 1L));
- }
-
- @Test
- public void testCompareToRangeString() throws Exception {
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange("a", "b", "c"));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange("d", "b", "c"));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange("b", "b", "c"));
- assertEquals(Location.MIDDLE,
- RecordReaderImpl.compareToRange("bb", "b", "c"));
- assertEquals(Location.MAX,
- RecordReaderImpl.compareToRange("c", "b", "c"));
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange("a", "b", "b"));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange("b", "b", "b"));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange("c", "b", "b"));
- }
-
- @Test
- public void testCompareToCharNeedConvert() throws Exception {
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange("apple", "hello", "world"));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange("zombie", "hello", "world"));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange("hello", "hello", "world"));
- assertEquals(Location.MIDDLE,
- RecordReaderImpl.compareToRange("pilot", "hello", "world"));
- assertEquals(Location.MAX,
- RecordReaderImpl.compareToRange("world", "hello", "world"));
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange("apple", "hello", "hello"));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange("hello", "hello", "hello"));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange("zombie", "hello", "hello"));
- }
-
- @Test
- public void testGetMin() throws Exception {
- assertEquals(10L, RecordReaderImpl.getMin(
- ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
- assertEquals(10.0d, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
- .setMinimum(10.0d).setMaximum(100.0d).build()).build())));
- assertEquals(null, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
- .build())));
- assertEquals("a", RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder()
- .setMinimum("a").setMaximum("b").build()).build())));
- assertEquals("hello", RecordReaderImpl.getMin(ColumnStatisticsImpl
- .deserialize(createStringStats("hello", "world"))));
- assertEquals(HiveDecimal.create("111.1"), RecordReaderImpl.getMin(ColumnStatisticsImpl
- .deserialize(createDecimalStats("111.1", "112.1"))));
- }
-
- private static OrcProto.ColumnStatistics createIntStats(Long min,
- Long max) {
- OrcProto.IntegerStatistics.Builder intStats =
- OrcProto.IntegerStatistics.newBuilder();
- if (min != null) {
- intStats.setMinimum(min);
- }
- if (max != null) {
- intStats.setMaximum(max);
- }
- return OrcProto.ColumnStatistics.newBuilder()
- .setIntStatistics(intStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createBooleanStats(int n, int trueCount) {
- OrcProto.BucketStatistics.Builder boolStats = OrcProto.BucketStatistics.newBuilder();
- boolStats.addCount(trueCount);
- return OrcProto.ColumnStatistics.newBuilder().setNumberOfValues(n).setBucketStatistics(
- boolStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createIntStats(int min, int max) {
- OrcProto.IntegerStatistics.Builder intStats = OrcProto.IntegerStatistics.newBuilder();
- intStats.setMinimum(min);
- intStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setIntStatistics(intStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createDoubleStats(double min, double max) {
- OrcProto.DoubleStatistics.Builder dblStats = OrcProto.DoubleStatistics.newBuilder();
- dblStats.setMinimum(min);
- dblStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setDoubleStatistics(dblStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createStringStats(String min, String max,
- boolean hasNull) {
- OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
- strStats.setMinimum(min);
- strStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
- .setHasNull(hasNull).build();
- }
-
- private static OrcProto.ColumnStatistics createStringStats(String min, String max) {
- OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
- strStats.setMinimum(min);
- strStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createDateStats(int min, int max) {
- OrcProto.DateStatistics.Builder dateStats = OrcProto.DateStatistics.newBuilder();
- dateStats.setMinimum(min);
- dateStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setDateStatistics(dateStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createTimestampStats(long min, long max) {
- OrcProto.TimestampStatistics.Builder tsStats = OrcProto.TimestampStatistics.newBuilder();
- tsStats.setMinimum(min);
- tsStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setTimestampStatistics(tsStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createDecimalStats(String min, String max) {
- OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder();
- decStats.setMinimum(min);
- decStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createDecimalStats(String min, String max,
- boolean hasNull) {
- OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder();
- decStats.setMinimum(min);
- decStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build())
- .setHasNull(hasNull).build();
- }
-
- @Test
- public void testGetMax() throws Exception {
- assertEquals(100L, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
- assertEquals(100.0d, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
- .setMinimum(10.0d).setMaximum(100.0d).build()).build())));
- assertEquals(null, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
- .build())));
- assertEquals("b", RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder()
- .setMinimum("a").setMaximum("b").build()).build())));
- assertEquals("world", RecordReaderImpl.getMax(ColumnStatisticsImpl
- .deserialize(createStringStats("hello", "world"))));
- assertEquals(HiveDecimal.create("112.1"), RecordReaderImpl.getMax(ColumnStatisticsImpl
- .deserialize(createDecimalStats("111.1", "112.1"))));
- }
-
- @Test
- public void testPredEvalWithBooleanStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", false, null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
- }
-
- @Test
- public void testPredEvalWithIntStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- // Stats gets converted to column type. "15" is outside of "10" and "100"
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- // Integer stats will not be converted date because of days/seconds/millis ambiguity
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
- }
-
- @Test
- public void testPredEvalWithDoubleStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- // Stats get converted to the column type; the string "15" sorts outside "10.0".."100.0"
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- // Double is not converted to date type because of days/seconds/millis ambiguity
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15*1000L), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150*1000L), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
- }
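
The two TIMESTAMP cases only line up with the expected values if the literal is reduced to seconds before being compared against the double stats; a worked check of that inference:

    double inRange  = new Timestamp(15 * 1000L).getTime() / 1000.0;  // 15.0, inside [10.0, 100.0] -> YES_NO
    double outRange = new Timestamp(150 * 1000L).getTime() / 1000.0; // 150.0, outside -> NO
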
-
- @Test
- public void testPredEvalWithStringStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 100L, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 100.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "100", null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
-
- // Converting a string min/max to Date throws IllegalArgumentException, so no pruning: YES_NO
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(100).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("100"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(100), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
- }
-
- @Test
- public void testPredEvalWithDateStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- // Date to Integer conversion is not possible.
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- // Date to Float conversion is also not possible.
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "1970-01-11", null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15.1", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "__a15__1", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "2000-01-16", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "1970-01-16", null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(150).get(), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- // Date to Decimal conversion is also not possible.
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15L * 24L * 60L * 60L * 1000L), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
- }
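
The STRING-versus-date-stats expectations follow once the stats are rendered in their ISO form and compared lexically (an inference from the asserted values, assuming DateWritable.toString() yields "yyyy-MM-dd"):

    String min = new DateWritable(10).toString();   // "1970-01-11"
    String max = new DateWritable(100).toString();  // "1970-04-11"
    // "15" sorts before "1970-01-11" ('5' < '9' at index 1) -> NO,
    // while "1970-01-16" lands between min and max -> YES_NO.
    assert "15".compareTo(min) < 0;
    assert "1970-01-16".compareTo(min) > 0 && "1970-01-16".compareTo(max) < 0;
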
-
- @Test
- public void testPredEvalWithDecimalStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- // "15" out of range of "10.0" and "100.0"
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- // Decimal to Date not possible.
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15 * 1000L), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150 * 1000L), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
- }
-
- @Test
- public void testPredEvalWithTimestampStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", new Timestamp(15).toString(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10 * 24L * 60L * 60L * 1000L,
- 100 * 24L * 60L * 60L * 1000L), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
-
- pred = TestSearchArgumentImpl.createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
- }
-
- @Test
- public void testEquals() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG,
- "x", 15L, null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
- }
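
The six assertions trace out a simple min/max rule; a sketch of it (illustrative only — the _NULL suffix appears because plain statistics leave open whether the group also holds nulls):

    static TruthValue evalEquals(long min, long max, long literal) {
      if (literal < min || literal > max) {
        return TruthValue.NO_NULL;     // literal outside [min, max]: no row matches
      }
      if (min == literal && max == literal) {
        return TruthValue.YES_NULL;    // every non-null row equals the literal
      }
      return TruthValue.YES_NO_NULL;   // range straddles the literal: maybe
    }
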
-
- @Test
- public void testNullSafeEquals() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG,
- "x", 15L, null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
- }
-
- @Test
- public void testLessThan() throws Exception {
- PredicateLeaf lessThan = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.LONG,
- "x", 15L, null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), lessThan, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), lessThan, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), lessThan, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), lessThan, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), lessThan, null));
- }
-
- @Test
- public void testLessThanEquals() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.LONG,
- "x", 15L, null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
- }
-
- @Test
- public void testIn() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(10L);
- args.add(20L);
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
- "x", null, args);
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 20L), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
- }
-
- @Test
- public void testBetween() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(10L);
- args.add(20L);
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG,
- "x", null, args);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 5L), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 40L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 15L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 25L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 25L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 20L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
- }
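
The BETWEEN cases reduce to interval containment between the stats range and [10, 20]; the same kind of sketch, under the same caveats:

    static TruthValue evalBetween(long min, long max, long lo, long hi) {
      if (max < lo || min > hi) {
        return TruthValue.NO_NULL;     // stats disjoint from [lo, hi]
      }
      if (min >= lo && max <= hi) {
        return TruthValue.YES_NULL;    // stats entirely inside [lo, hi]
      }
      return TruthValue.YES_NO_NULL;   // partial overlap
    }
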
-
- @Test
- public void testIsNull() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.LONG,
- "x", null, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
- }
-
- @Test
- public void testEqualsWithNullInStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING,
- "x", "c", null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
- }
-
- @Test
- public void testNullSafeEqualsWithNullInStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING,
- "x", "c", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
- }
-
- @Test
- public void testLessThanWithNullInStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.STRING,
- "x", "c", null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.NO_NULL, // min, same stats
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
- }
-
- @Test
- public void testLessThanEqualsWithNullInStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.STRING,
- "x", "c", null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
- }
-
- @Test
- public void testInWithNullInStats() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add("c");
- args.add("f");
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
- "x", null, args);
- assertEquals(TruthValue.NO_NULL, // before & after
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
- }
-
- @Test
- public void testBetweenWithNullInStats() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add("c");
- args.add("f");
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.STRING,
- "x", null, args);
- assertEquals(TruthValue.YES_NULL, // before & after
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
- assertEquals(TruthValue.YES_NULL, // before & max
- RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null));
- assertEquals(TruthValue.NO_NULL, // before & before
- RecordReaderImpl.evaluatePredicateProto(createStringStats("g", "h", true), pred, null));
- assertEquals(TruthValue.YES_NO_NULL, // before & min
- RecordReaderImpl.evaluatePredicateProto(createStringStats("f", "g", true), pred, null));
- assertEquals(TruthValue.YES_NO_NULL, // before & middle
- RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "g", true), pred, null));
-
- assertEquals(TruthValue.YES_NULL, // min & after
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "e", true), pred, null));
- assertEquals(TruthValue.YES_NULL, // min & max
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "f", true), pred, null));
- assertEquals(TruthValue.YES_NO_NULL, // min & middle
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "g", true), pred, null));
-
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "c", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NULL, // min & after, same stats
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
- }
-
- @Test
- public void testIsNullWithNullInStats() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.STRING,
- "x", null, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null));
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", false), pred, null));
- }
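
IS_NULL is the one predicate that ignores min/max entirely; the pair of assertions implies this rule (sketch):

    static TruthValue evalIsNull(boolean hasNull) {
      return hasNull ? TruthValue.YES_NO  // some rows may be null, others not
                     : TruthValue.NO;     // no nulls recorded: nothing matches
    }
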
-
- @Test
- public void testOverlap() throws Exception {
- assertFalse(RecordReaderUtils.overlap(0, 10, -10, -1));
- assertTrue(RecordReaderUtils.overlap(0, 10, -1, 0));
- assertTrue(RecordReaderUtils.overlap(0, 10, -1, 1));
- assertTrue(RecordReaderUtils.overlap(0, 10, 2, 8));
- assertTrue(RecordReaderUtils.overlap(0, 10, 5, 10));
- assertTrue(RecordReaderUtils.overlap(0, 10, 10, 11));
- assertTrue(RecordReaderUtils.overlap(0, 10, 0, 10));
- assertTrue(RecordReaderUtils.overlap(0, 10, -1, 11));
- assertFalse(RecordReaderUtils.overlap(0, 10, 11, 12));
- }
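
The assertions pin down closed-interval semantics: shared endpoints count ([0,10] meets [10,11]) while strict separation does not ([0,10] vs [11,12]). A one-line equivalent:

    static boolean overlap(long aStart, long aEnd, long bStart, long bEnd) {
      // Closed intervals intersect iff neither lies strictly past the other.
      return aStart <= bEnd && bStart <= aEnd;
    }
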
-
- private static DiskRangeList diskRanges(Integer... points) {
- DiskRangeList head = null, tail = null;
- for(int i = 0; i < points.length; i += 2) {
- DiskRangeList range = new DiskRangeList(points[i], points[i+1]);
- if (tail == null) {
- head = tail = range;
- } else {
- tail = tail.insertAfter(range);
- }
- }
- return head;
- }
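
The helper flattens (offset, end) pairs into the linked-list form the planner under test consumes and produces; for example:

    // Two nodes: [0, 1000) followed by [1500, 2000).
    DiskRangeList ranges = diskRanges(0, 1000, 1500, 2000);
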
-
- @Test
- public void testGetIndexPosition() throws Exception {
- assertEquals(0, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
- OrcProto.Stream.Kind.PRESENT, true, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(3, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
- OrcProto.Stream.Kind.DATA, false, true));
- assertEquals(0, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
- OrcProto.Stream.Kind.DATA, true, false));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DICTIONARY, OrcProto.Type.Kind.STRING,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(3, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
- OrcProto.Stream.Kind.DATA, false, true));
- assertEquals(6, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
- OrcProto.Stream.Kind.LENGTH, true, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
- OrcProto.Stream.Kind.LENGTH, false, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(3, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
- OrcProto.Stream.Kind.DATA, false, true));
- assertEquals(6, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
- OrcProto.Stream.Kind.SECONDARY, true, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
- OrcProto.Stream.Kind.SECONDARY, false, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(3, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
- OrcProto.Stream.Kind.DATA, false, true));
- assertEquals(7, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
- OrcProto.Stream.Kind.SECONDARY, true, true));
- assertEquals(5, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
- OrcProto.Stream.Kind.SECONDARY, false, true));
- }
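
A worked reading of the first few assertions (the slot breakdown is inferred from the expected values, not from documented API): each stream's recorded positions sit consecutively in a row group's index entry, so a stream's index position equals the slots consumed by the streams before it. With compression and nulls, the PRESENT stream records four values (compressed-chunk offset, offset in the decompressed block, RLE run index, bit index), so INT DATA starts at slot 4:

    int pos = RecordReaderUtils.getIndexPosition(
        OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
        OrcProto.Stream.Kind.DATA, true /* compressed */, true /* has nulls */);
    // pos == 4; uncompressed PRESENT needs only 3 slots, and with no
    // PRESENT stream at all, DATA's positions start at slot 0.
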
-
- @Test
- public void testPartialPlan() throws Exception {
- DiskRangeList result;
-
- // set the streams
- List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(1).setLength(1000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(1).setLength(99000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(2).setLength(2000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(2).setLength(98000).build());
-
- boolean[] columns = new boolean[]{true, true, false};
- boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
-
- // set the index
- OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
- indexes[1] = OrcProto.RowIndex.newBuilder()
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(0).addPositions(-1).addPositions(-1)
- .addPositions(0)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(100).addPositions(-1).addPositions(-1)
- .addPositions(10000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(200).addPositions(-1).addPositions(-1)
- .addPositions(20000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(300).addPositions(-1).addPositions(-1)
- .addPositions(30000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(400).addPositions(-1).addPositions(-1)
- .addPositions(40000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(500).addPositions(-1).addPositions(-1)
- .addPositions(50000)
- .build())
- .build();
-
- // set encodings
- List<OrcProto.ColumnEncoding> encodings =
- new ArrayList<OrcProto.ColumnEncoding>();
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
-
- // set types struct{x: int, y: int}
- List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
- .addSubtypes(1).addSubtypes(2).addFieldNames("x")
- .addFieldNames("y").build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
-
- // filter by rows and groups
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(0, 1000, 100, 1000, 400, 1000,
- 1000, 11000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, true);
- assertThat(result, is(diskRanges(0, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
-
- // if we read no rows, don't read any bytes
- rowGroups = new boolean[]{false, false, false, false, false, false};
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertNull(result);
-
- // all rows, but only columns 0 and 2.
- rowGroups = null;
- columns = new boolean[]{true, false, true};
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, null, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(100000, 102000, 102000, 200000)));
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, null, false, encodings, types, 32768, true);
- assertThat(result, is(diskRanges(100000, 200000)));
-
- rowGroups = new boolean[]{false, true, false, false, false, false};
- indexes[2] = indexes[1];
- indexes[1] = null;
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(100100, 102000,
- 112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, true);
- assertThat(result, is(diskRanges(100100, 102000,
- 112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
-
- rowGroups = new boolean[]{false, false, false, false, false, true};
- indexes[1] = indexes[2];
- columns = new boolean[]{true, true, true};
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(500, 1000, 51000, 100000, 100500, 102000,
- 152000, 200000)));
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, true);
- assertThat(result, is(diskRanges(500, 1000, 51000, 100000, 100500, 102000,
- 152000, 200000)));
- }
-
- @Test
- public void testPartialPlanCompressed() throws Exception {
- DiskRangeList result;
-
- // set the streams
- List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(1).setLength(1000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(1).setLength(99000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(2).setLength(2000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(2).setLength(98000).build());
-
- boolean[] columns = new boolean[]{true, true, false};
- boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
-
- // set the index
- OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
- indexes[1] = OrcProto.RowIndex.newBuilder()
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(0).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(0)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(100).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(10000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(200).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(20000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(300).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(30000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(400).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(40000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(500).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(50000)
- .build())
- .build();
-
- // set encodings
- List<OrcProto.ColumnEncoding> encodings =
- new ArrayList<OrcProto.ColumnEncoding>();
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
-
- // set types struct{x: int, y: int}
- List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
- .addSubtypes(1).addSubtypes(2).addFieldNames("x")
- .addFieldNames("y").build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
-
- // filter by rows and groups
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, true, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(0, 1000, 100, 1000,
- 400, 1000, 1000, 11000+(2*32771),
- 11000, 21000+(2*32771), 41000, 100000)));
-
- rowGroups = new boolean[]{false, false, false, false, false, true};
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, true, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(500, 1000, 51000, 100000)));
- }
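
The 2*32771 padding in the expected ranges is worth unpacking; a sketch of the arithmetic, taking the 3-byte compressed-chunk header as an assumption:

    int bufferSize = 32768;                     // compression buffer used by the test
    int chunkHeader = 3;                        // per-chunk header (assumed)
    int slop = 2 * (bufferSize + chunkHeader);  // 65542 == 2 * 32771
    // A row group can begin inside one compressed chunk and its RLE
    // state can spill into the next, so the planner pads by two chunks.
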
-
- @Test
- public void testPartialPlanString() throws Exception {
- DiskRangeList result;
-
- // set the streams
- List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(1).setLength(1000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(1).setLength(94000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.LENGTH)
- .setColumn(1).setLength(2000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DICTIONARY_DATA)
- .setColumn(1).setLength(3000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(2).setLength(2000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(2).setLength(98000).build());
-
- boolean[] columns = new boolean[]{true, true, false};
- boolean[] rowGroups = new boolean[]{false, true, false, false, true, true};
-
- // set the index
- OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
- indexes[1] = OrcProto.RowIndex.newBuilder()
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(0).addPositions(-1).addPositions(-1)
- .addPositions(0)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(100).addPositions(-1).addPositions(-1)
- .addPositions(10000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(200).addPositions(-1).addPositions(-1)
- .addPositions(20000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(300).addPositions(-1).addPositions(-1)
- .addPositions(30000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(400).addPositions(-1).addPositions(-1)
- .addPositions(40000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(500).addPositions(-1).addPositions(-1)
- .addPositions(50000)
- .build())
- .build();
-
- // set encodings
- List<OrcProto.ColumnEncoding> encodings =
- new ArrayList<OrcProto.ColumnEncoding>();
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
-
- // set types struct{x: string, y: int}
- List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
- .addSubtypes(1).addSubtypes(2).addFieldNames("x")
- .addFieldNames("y").build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
-
- // filter by rows and groups
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(100, 1000, 400, 1000, 500, 1000,
- 11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 51000, 95000, 95000, 97000, 97000, 100000)));
- }
-
- @Test
- public void testIntNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong(15);
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
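
Every bloom-filter test below follows this shape: min/max stats alone would answer "maybe" (15 lies inside [10, 100]), and the filter either confirms absence, tightening that to NO, or reports a possible hit, leaving the "maybe" intact. A minimal sketch of the refinement, ignoring the null bookkeeping of the *_NULL variants:

    static TruthValue refineWithBloom(TruthValue fromStats, boolean mightContain) {
      // A definite miss tightens "maybe" to "no"; a possible hit never
      // upgrades it to "yes", since bloom filters give false positives.
      return mightContain ? fromStats : TruthValue.NO;
    }
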
-
- @Test
- public void testIntEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong(15);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testIntInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(15L);
- args.add(19L);
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong(19);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong(15);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDoubleNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addDouble(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addDouble(15.0);
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDoubleEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addDouble(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addDouble(15.0);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDoubleInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(15.0);
- args.add(19.0);
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.FLOAT,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addDouble(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addDouble(19.0);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addDouble(15.0);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testStringNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString("str_" + i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString("str_15");
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testStringEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString("str_" + i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString("str_15");
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testStringInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add("str_15");
- args.add("str_19");
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString("str_" + i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString("str_19");
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString("str_15");
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDateWritableNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x",
- new DateWritable(15).get(), null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new DateWritable(i)).getDays());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new DateWritable(15)).getDays());
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDateWritableEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DATE, "x",
- new DateWritable(15).get(), null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new DateWritable(i)).getDays());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new DateWritable(15)).getDays());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDateWritableInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(new DateWritable(15).get());
- args.add(new DateWritable(19).get());
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DATE,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new DateWritable(i)).getDays());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new DateWritable(19)).getDays());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new DateWritable(15)).getDays());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testTimestampNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
- new Timestamp(15),
- null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new Timestamp(i)).getTime());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new Timestamp(15)).getTime());
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testTimestampEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new Timestamp(i)).getTime());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new Timestamp(15)).getTime());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testTimestampInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(new Timestamp(15));
- args.add(new Timestamp(19));
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.TIMESTAMP,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new Timestamp(i)).getTime());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new Timestamp(19)).getTime());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new Timestamp(15)).getTime());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDecimalNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x",
- new HiveDecimalWritable("15"),
- null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString(HiveDecimal.create(i).toString());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(15).toString());
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDecimalEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DECIMAL, "x",
- new HiveDecimalWritable("15"),
- null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString(HiveDecimal.create(i).toString());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(15).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDecimalInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(new HiveDecimalWritable("15"));
- args.add(new HiveDecimalWritable("19"));
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString(HiveDecimal.create(i).toString());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(19).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(15).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
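
Across these tests, each SQL type is reduced to a canonical key before probing the filter; the tests themselves spell out the mapping:

    bf.addLong(15L);                                  // LONG: the value itself
    bf.addLong(new DateWritable(15).getDays());       // DATE: epoch days
    bf.addLong(new Timestamp(15).getTime());          // TIMESTAMP: epoch millis
    bf.addString(HiveDecimal.create(15).toString());  // DECIMAL: canonical string form
    bf.addDouble(15.0);                               // FLOAT/DOUBLE: as double
    bf.addString("str_15");                           // STRING: as-is
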
-
- @Test
- public void testNullsInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(new HiveDecimalWritable("15"));
- args.add(null);
- args.add(new HiveDecimalWritable("19"));
- PredicateLeaf pred = TestSearchArgumentImpl.createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString(HiveDecimal.create(i).toString());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", false));
- // hasNull is false, so bloom filter should return NO
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", true));
- // hasNull is true, so bloom filter should return YES_NO_NULL
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(19).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(15).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testClose() throws Exception {
- DataReader mockedDataReader = mock(DataReader.class);
- closeMockedRecordReader(mockedDataReader);
-
- verify(mockedDataReader, atLeastOnce()).close();
- }
-
- @Test
- public void testCloseWithException() throws Exception {
- DataReader mockedDataReader = mock(DataReader.class);
- doThrow(IOException.class).when(mockedDataReader).close();
-
- try {
- closeMockedRecordReader(mockedDataReader);
- fail("Exception should have been thrown when Record Reader was closed");
- } catch (IOException expected) {
- // expected: the mocked DataReader throws on close()
- }
-
- verify(mockedDataReader, atLeastOnce()).close();
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
-
- private void closeMockedRecordReader(DataReader mockedDataReader) throws IOException {
- Configuration conf = new Configuration();
- Path path = new Path(workDir, "empty.orc");
- FileSystem.get(conf).delete(path, true);
- Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf)
- .setSchema(TypeDescription.createLong()));
- writer.close();
- Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
-
- RecordReader recordReader = reader.rowsOptions(new Reader.Options()
- .dataReader(mockedDataReader));
-
- recordReader.close();
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStreamName.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStreamName.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStreamName.java
deleted file mode 100644
index dfccd9a..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStreamName.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.orc;
-
-import org.apache.orc.OrcProto;
-import org.apache.orc.impl.StreamName;
-import org.junit.Test;
-
-import static org.junit.Assert.assertEquals;
-
-public class TestStreamName {
-
- @Test
- public void test1() throws Exception {
- StreamName s1 = new StreamName(3, OrcProto.Stream.Kind.DATA);
- StreamName s2 = new StreamName(3,
- OrcProto.Stream.Kind.DICTIONARY_DATA);
- StreamName s3 = new StreamName(5, OrcProto.Stream.Kind.DATA);
- StreamName s4 = new StreamName(5,
- OrcProto.Stream.Kind.DICTIONARY_DATA);
- StreamName s1p = new StreamName(3, OrcProto.Stream.Kind.DATA);
- assertEquals(true, s1.equals(s1));
- assertEquals(false, s1.equals(s2));
- assertEquals(false, s1.equals(s3));
- assertEquals(true, s1.equals(s1p));
- assertEquals(true, s1.compareTo(null) < 0);
- assertEquals(false, s1.equals(null));
- assertEquals(true, s1.compareTo(s2) < 0);
- assertEquals(true, s2.compareTo(s3) < 0);
- assertEquals(true, s3.compareTo(s4) < 0);
- assertEquals(true, s4.compareTo(s1p) > 0);
- assertEquals(0, s1p.compareTo(s1));
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java b/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java
new file mode 100644
index 0000000..526dd81
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java
@@ -0,0 +1,1373 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.File;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import com.google.common.collect.Lists;
+import com.google.common.primitives.Longs;
+
+@RunWith(value = Parameterized.class)
+public class TestNewIntegerEncoding {
+
+ private OrcFile.EncodingStrategy encodingStrategy;
+
+  public TestNewIntegerEncoding(OrcFile.EncodingStrategy es) {
+ this.encodingStrategy = es;
+ }
+
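+  // Every test in this class runs twice, once per encoding strategy, so both
+  // the SPEED and COMPRESSION tuning paths of the integer writers are covered.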
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { OrcFile.EncodingStrategy.COMPRESSION },
+ { OrcFile.EncodingStrategy.SPEED } };
+ return Arrays.asList(data);
+ }
+
+ public static class TSRow {
+ Timestamp ts;
+
+ public TSRow(Timestamp ts) {
+ this.ts = ts;
+ }
+ }
+
+ public static TypeDescription getRowSchema() {
+ return TypeDescription.createStruct()
+ .addField("int1", TypeDescription.createInt())
+ .addField("long1", TypeDescription.createLong());
+ }
+
+ public static void appendRow(VectorizedRowBatch batch,
+ int int1, long long1) {
+ int row = batch.size++;
+ ((LongColumnVector) batch.cols[0]).vector[row] = int1;
+ ((LongColumnVector) batch.cols[1]).vector[row] = long1;
+ }
+
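+  // Appends a single long to the first column and advances batch.size by one.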
+ public static void appendLong(VectorizedRowBatch batch,
+ long long1) {
+ int row = batch.size++;
+ ((LongColumnVector) batch.cols[0]).vector[row] = long1;
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target"
+ + File.separator + "test" + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile."
+ + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testBasicRow() throws Exception {
+    TypeDescription schema = getRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ appendRow(batch, 111, 1111L);
+ appendRow(batch, 111, 1111L);
+ appendRow(batch, 111, 1111L);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
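+    // nextBatch() refills the batch and returns false at end of file;
+    // batch.size reports how many rows were actually read.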
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(111, ((LongColumnVector) batch.cols[0]).vector[r]);
+ assertEquals(1111, ((LongColumnVector) batch.cols[1]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicOld() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+ long[] inp = new long[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1,
+ 2, 5, 1, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1,
+ 9, 2, 6, 3, 7, 1, 9, 2, 6, 2000, 2, 1, 1, 1, 1, 1, 3, 7, 1, 9, 2, 6, 1,
+ 1, 1, 1, 1 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .version(OrcFile.Version.V_0_11)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ batch = reader.getSchema().createRowBatch();
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicNew() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1,
+ 2, 5, 1, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1,
+ 9, 2, 6, 3, 7, 1, 9, 2, 6, 2000, 2, 1, 1, 1, 1, 1, 3, 7, 1, 9, 2, 6, 1,
+ 1, 1, 1, 1 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ batch = reader.getSchema().createRowBatch();
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicDelta1() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { -500, -400, -350, -325, -310 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicDelta2() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { -500, -600, -650, -675, -710 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicDelta3() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 500, 400, 350, 325, 310 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicDelta4() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 500, 600, 650, 675, 710 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testDeltaOverflow() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
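+    // The gaps between these values overflow signed 64-bit subtraction, so the
+    // writer has to detect the overflow and avoid the delta encodings.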
+    long[] inp = new long[]{4513343538618202719L, 4513343538618202711L,
+        2911390882471569739L,
+        -9181829309989854913L};
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile
+ .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testDeltaOverflow2() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+    long[] inp = new long[]{Long.MAX_VALUE, 4513343538618202711L,
+        2911390882471569739L,
+ Long.MIN_VALUE};
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile
+ .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testDeltaOverflow3() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+    long[] inp = new long[]{-4513343538618202711L, -2911390882471569739L, -2,
+ Long.MAX_VALUE};
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile
+ .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testIntegerMin() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add((long) Integer.MIN_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testIntegerMax() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add((long) Integer.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testLongMin() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add(Long.MIN_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testLongMax() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add(Long.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testRandomInt() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 100000; i++) {
+ input.add((long) rand.nextInt());
+ }
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(100000);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testRandomLong() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 100000; i++) {
+ input.add(rand.nextLong());
+ }
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(100000);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseNegativeMin() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
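+    // A single negative outlier (-13) among small positive values pushes the
+    // base negative, exercising the PATCHED_BASE path of RLEv2.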
+ long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
+ 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
+ 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
+ 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
+ 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, -13, 1, 2, 3,
+ 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
+ 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
+ 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
+ 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
+ 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
+ 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
+ 2, 16 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseNegativeMin2() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
+ 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
+ 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
+ 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
+ 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, -1, 1, 2, 3,
+ 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
+ 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
+ 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
+ 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
+ 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
+ 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
+ 2, 16 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseNegativeMin3() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
+ 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
+ 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
+ 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
+ 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, 0, 1, 2, 3,
+ 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
+ 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
+ 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
+ 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
+ 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
+ 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
+ 2, 16 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseNegativeMin4() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 13, 13, 11, 8, 13, 10, 10, 11, 11, 14, 11, 7, 13,
+ 12, 12, 11, 15, 12, 12, 9, 8, 10, 13, 11, 8, 6, 5, 6, 11, 7, 15, 10, 7,
+ 6, 8, 7, 9, 9, 11, 33, 11, 3, 7, 4, 6, 10, 14, 12, 5, 14, 7, 6 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseAt0() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
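+    // 5120 small values with one 20000 outlier; putting the outlier at index 0
+    // checks that a patch at the very start of a run round-trips correctly.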
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(0, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseAt1() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(1, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseAt255() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(255, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseAt256() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(256, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBase510() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(510, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBase511() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(511, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseMax1() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for (int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(60));
+ }
+ input.set(511, Long.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseMax2() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for (int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(60));
+ }
+ input.set(128, Long.MAX_VALUE);
+ input.set(256, Long.MAX_VALUE);
+ input.set(511, Long.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseMax3() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add(371946367L);
+ input.add(11963367L);
+ input.add(68639400007L);
+ input.add(100233367L);
+ input.add(6367L);
+ input.add(10026367L);
+ input.add(3670000L);
+ input.add(3602367L);
+ input.add(4719226367L);
+ input.add(7196367L);
+ input.add(444442L);
+ input.add(210267L);
+ input.add(21033L);
+ input.add(160267L);
+ input.add(400267L);
+ input.add(23634347L);
+ input.add(16027L);
+ input.add(46026367L);
+ input.add(Long.MAX_VALUE);
+ input.add(33333L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseMax4() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ for (int i = 0; i < 25; i++) {
+ input.add(371292224226367L);
+ input.add(119622332222267L);
+ input.add(686329400222007L);
+ input.add(100233333222367L);
+ input.add(636272333322222L);
+ input.add(10202633223267L);
+ input.add(36700222022230L);
+ input.add(36023226224227L);
+ input.add(47192226364427L);
+ input.add(71963622222447L);
+ input.add(22244444222222L);
+ input.add(21220263327442L);
+ input.add(21032233332232L);
+ input.add(16026322232227L);
+ input.add(40022262272212L);
+ input.add(23634342227222L);
+ input.add(16022222222227L);
+ input.add(46026362222227L);
+ input.add(46026362222227L);
+ input.add(33322222222323L);
+ }
+ input.add(Long.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseTimestamp() throws Exception {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("ts", TypeDescription.createTimestamp());
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+
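+    // ORC stores timestamps as separate seconds and nanoseconds streams; the
+    // wide spread of years below is meant to exercise PATCHED_BASE encoding
+    // on the seconds values.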
+ List<Timestamp> tslist = Lists.newArrayList();
+ tslist.add(Timestamp.valueOf("2099-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1999-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1995-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2010-03-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1996-08-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("1998-11-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("1993-08-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2008-01-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2007-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2008-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1994-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2001-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2000-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2000-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2011-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1974-01-01 00:00:00"));
+ int idx = 0;
+ for (Timestamp ts : tslist) {
+      ((TimestampColumnVector) batch.cols[0]).set(batch.size++, ts);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(tslist.get(idx++),
+ ((TimestampColumnVector) batch.cols[0]).asScratchTimestamp(r));
+ }
+ }
+ }
+
+ @Test
+ public void testDirectLargeNegatives() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+
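+    // Values near Long.MIN_VALUE need the full 64 bits after zigzag encoding,
+    // which should steer the writer to the DIRECT integer encoding.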
+ appendLong(batch, -7486502418706614742L);
+ appendLong(batch, 0L);
+ appendLong(batch, 1L);
+ appendLong(batch, 1L);
+ appendLong(batch, -5535739865598783616L);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(5, batch.size);
+ assertEquals(-7486502418706614742L,
+ ((LongColumnVector) batch.cols[0]).vector[0]);
+ assertEquals(0L,
+ ((LongColumnVector) batch.cols[0]).vector[1]);
+ assertEquals(1L,
+ ((LongColumnVector) batch.cols[0]).vector[2]);
+ assertEquals(1L,
+ ((LongColumnVector) batch.cols[0]).vector[3]);
+ assertEquals(-5535739865598783616L,
+ ((LongColumnVector) batch.cols[0]).vector[4]);
+ assertEquals(false, rows.nextBatch(batch));
+ }
+
+ @Test
+ public void testSeek() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 100000; i++) {
+ input.add((long) rand.nextInt());
+ }
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .version(OrcFile.Version.V_0_11)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(100000);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
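+    // Jump to the middle of the file; reads should resume exactly at row 55555.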
+ int idx = 55555;
+ rows.seekToRow(idx);
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestOrcNullOptimization.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestOrcNullOptimization.java b/orc/src/test/org/apache/orc/TestOrcNullOptimization.java
new file mode 100644
index 0000000..0b605c9
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestOrcNullOptimization.java
@@ -0,0 +1,415 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.Random;
+
+import junit.framework.Assert;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import org.apache.orc.impl.RecordReaderImpl;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import com.google.common.collect.Lists;
+
+public class TestOrcNullOptimization {
+
+ TypeDescription createMyStruct() {
+ return TypeDescription.createStruct()
+ .addField("a", TypeDescription.createInt())
+ .addField("b", TypeDescription.createString())
+ .addField("c", TypeDescription.createBoolean())
+ .addField("d", TypeDescription.createList(
+ TypeDescription.createStruct()
+ .addField("z", TypeDescription.createInt())));
+ }
+
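+  // Appends one row, flushing the batch whenever it fills. A null argument
+  // marks the corresponding column value as null via the isNull/noNulls flags.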
+ void addRow(Writer writer, VectorizedRowBatch batch,
+ Integer a, String b, Boolean c,
+ Integer... d) throws IOException {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ int row = batch.size++;
+ LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
+ BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
+ LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
+ ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
+ StructColumnVector dStruct = (StructColumnVector) dColumn.child;
+ LongColumnVector dInt = (LongColumnVector) dStruct.fields[0];
+ if (a == null) {
+ aColumn.noNulls = false;
+ aColumn.isNull[row] = true;
+ } else {
+ aColumn.vector[row] = a;
+ }
+ if (b == null) {
+ bColumn.noNulls = false;
+ bColumn.isNull[row] = true;
+ } else {
+ bColumn.setVal(row, b.getBytes());
+ }
+ if (c == null) {
+ cColumn.noNulls = false;
+ cColumn.isNull[row] = true;
+ } else {
+ cColumn.vector[row] = c ? 1 : 0;
+ }
+ if (d == null) {
+ dColumn.noNulls = false;
+ dColumn.isNull[row] = true;
+ } else {
+ dColumn.offsets[row] = dColumn.childCount;
+ dColumn.lengths[row] = d.length;
+ dColumn.childCount += d.length;
+ for(int e=0; e < d.length; ++e) {
+ dInt.vector[(int) dColumn.offsets[row] + e] = d[e];
+ }
+ }
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcNullOptimization." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testMultiStripeWithNull() throws Exception {
+ TypeDescription schema = createMyStruct();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(100);
+ VectorizedRowBatch batch = schema.createRowBatch();
+ addRow(writer, batch, null, null, true, 100);
+ for (int i = 2; i < 20000; i++) {
+ addRow(writer, batch, rand.nextInt(1), "a", true, 100);
+ }
+ addRow(writer, batch, null, null, true, 100);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(20000, reader.getNumberOfRows());
+ assertEquals(20000, stats[0].getNumberOfValues());
+
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 19998 hasNull: true min: 0 max: 0 sum: 0",
+ stats[1].toString());
+
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals(19998, stats[2].getNumberOfValues());
+ assertEquals("count: 19998 hasNull: true min: a max: a sum: 19998",
+ stats[2].toString());
+
+    // check the schema
+ assertEquals("struct<a:int,b:string,c:boolean,d:array<struct<z:int>>>",
+ reader.getSchema().toString());
+
+ RecordReader rows = reader.rows();
+
+ List<Boolean> expected = Lists.newArrayList();
+ for (StripeInformation sinfo : reader.getStripes()) {
+ expected.add(false);
+ }
+    // only the first and last stripes should have a PRESENT stream
+ expected.set(0, true);
+ expected.set(expected.size() - 1, true);
+
+ List<Boolean> got = Lists.newArrayList();
+    // check whether each stripe footer contains a PRESENT stream
+ for (StripeInformation sinfo : reader.getStripes()) {
+ OrcProto.StripeFooter sf =
+ ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+ got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
+ != -1);
+ }
+ assertEquals(expected, got);
+
+ batch = reader.getSchema().createRowBatch();
+ LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
+ BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
+ LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
+ ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
+ LongColumnVector dElements =
+ (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]);
+    assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1024, batch.size);
+
+ // row 1
+ assertEquals(true, aColumn.isNull[0]);
+ assertEquals(true, bColumn.isNull[0]);
+ assertEquals(1, cColumn.vector[0]);
+ assertEquals(0, dColumn.offsets[0]);
+    assertEquals(1, dColumn.lengths[0]);
+ assertEquals(100, dElements.vector[0]);
+
+ rows.seekToRow(19998);
+ rows.nextBatch(batch);
+ assertEquals(2, batch.size);
+
+ // last-1 row
+ assertEquals(0, aColumn.vector[0]);
+ assertEquals("a", bColumn.toString(0));
+ assertEquals(1, cColumn.vector[0]);
+ assertEquals(0, dColumn.offsets[0]);
+ assertEquals(1, dColumn.lengths[0]);
+ assertEquals(100, dElements.vector[0]);
+
+ // last row
+ assertEquals(true, aColumn.isNull[1]);
+ assertEquals(true, bColumn.isNull[1]);
+ assertEquals(1, cColumn.vector[1]);
+ assertEquals(1, dColumn.offsets[1]);
+ assertEquals(1, dColumn.lengths[1]);
+ assertEquals(100, dElements.vector[1]);
+
+ assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+
+ @Test
+ public void testMultiStripeWithoutNull() throws Exception {
+ TypeDescription schema = createMyStruct();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(100);
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (int i = 1; i < 20000; i++) {
+ addRow(writer, batch, rand.nextInt(1), "a", true, 100);
+ }
+ addRow(writer, batch, 0, "b", true, 100);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(20000, reader.getNumberOfRows());
+ assertEquals(20000, stats[0].getNumberOfValues());
+
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 20000 hasNull: false min: 0 max: 0 sum: 0",
+ stats[1].toString());
+
+ assertEquals("b", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals(20000, stats[2].getNumberOfValues());
+ assertEquals("count: 20000 hasNull: false min: a max: b sum: 20000",
+ stats[2].toString());
+
+    // check the schema
+ Assert.assertEquals("struct<a:int,b:string,c:boolean,d:array<struct<z:int>>>",
+ reader.getSchema().toString());
+
+ RecordReader rows = reader.rows();
+
+    // none of the stripes should have a PRESENT stream
+ List<Boolean> expected = Lists.newArrayList();
+ for (StripeInformation sinfo : reader.getStripes()) {
+ expected.add(false);
+ }
+
+ List<Boolean> got = Lists.newArrayList();
+    // check whether each stripe footer contains a PRESENT stream
+ for (StripeInformation sinfo : reader.getStripes()) {
+ OrcProto.StripeFooter sf =
+ ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+ got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
+ != -1);
+ }
+ assertEquals(expected, got);
+
+ rows.seekToRow(19998);
+
+ batch = reader.getSchema().createRowBatch();
+ LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
+ BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
+ LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
+ ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
+ LongColumnVector dElements =
+ (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]);
+
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(2, batch.size);
+
+    // second-to-last row
+ assertEquals(0, aColumn.vector[0]);
+ assertEquals("a", bColumn.toString(0));
+ assertEquals(1, cColumn.vector[0]);
+ assertEquals(0, dColumn.offsets[0]);
+ assertEquals(1, dColumn.lengths[0]);
+ assertEquals(100, dElements.vector[0]);
+
+ // last row
+ assertEquals(0, aColumn.vector[1]);
+ assertEquals("b", bColumn.toString(1));
+ assertEquals(1, cColumn.vector[1]);
+ assertEquals(1, dColumn.offsets[1]);
+ assertEquals(1, dColumn.lengths[1]);
+ assertEquals(100, dElements.vector[1]);
+ rows.close();
+ }
+
+ @Test
+ public void testColumnsWithNullAndCompression() throws Exception {
+ TypeDescription schema = createMyStruct();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ addRow(writer, batch, 3, "a", true, 100);
+ addRow(writer, batch, null, "b", true, 100);
+ addRow(writer, batch, 3, null, false, 100);
+ addRow(writer, batch, 3, "d", true, 100);
+ addRow(writer, batch, 2, "e", true, 100);
+ addRow(writer, batch, 2, "f", true, 100);
+ addRow(writer, batch, 2, "g", true, 100);
+ addRow(writer, batch, 2, "h", true, 100);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(8, reader.getNumberOfRows());
+ assertEquals(8, stats[0].getNumberOfValues());
+
+ assertEquals(3, ((IntegerColumnStatistics) stats[1]).getMaximum());
+ assertEquals(2, ((IntegerColumnStatistics) stats[1]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+ assertEquals(17, ((IntegerColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 7 hasNull: true min: 2 max: 3 sum: 17",
+ stats[1].toString());
+
+ assertEquals("h", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals(7, stats[2].getNumberOfValues());
+ assertEquals("count: 7 hasNull: true min: a max: h sum: 7",
+ stats[2].toString());
+
+ // check the inspectors
+ batch = reader.getSchema().createRowBatch();
+ LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
+ BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
+ LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
+ ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
+ LongColumnVector dElements =
+ (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]);
+ Assert.assertEquals("struct<a:int,b:string,c:boolean,d:array<struct<z:int>>>",
+ reader.getSchema().toString());
+
+ RecordReader rows = reader.rows();
+    // only the last stripe will have a PRESENT stream
+ List<Boolean> expected = Lists.newArrayList();
+ for (StripeInformation sinfo : reader.getStripes()) {
+ expected.add(false);
+ }
+ expected.set(expected.size() - 1, true);
+
+ List<Boolean> got = Lists.newArrayList();
+    // check whether each stripe footer contains a PRESENT stream
+ for (StripeInformation sinfo : reader.getStripes()) {
+ OrcProto.StripeFooter sf =
+ ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+ got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
+ != -1);
+ }
+ assertEquals(expected, got);
+
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(8, batch.size);
+
+ // row 1
+ assertEquals(3, aColumn.vector[0]);
+ assertEquals("a", bColumn.toString(0));
+ assertEquals(1, cColumn.vector[0]);
+ assertEquals(0, dColumn.offsets[0]);
+ assertEquals(1, dColumn.lengths[0]);
+ assertEquals(100, dElements.vector[0]);
+
+ // row 2
+ assertEquals(true, aColumn.isNull[1]);
+ assertEquals("b", bColumn.toString(1));
+ assertEquals(1, cColumn.vector[1]);
+ assertEquals(1, dColumn.offsets[1]);
+ assertEquals(1, dColumn.lengths[1]);
+ assertEquals(100, dElements.vector[1]);
+
+ // row 3
+ assertEquals(3, aColumn.vector[2]);
+ assertEquals(true, bColumn.isNull[2]);
+ assertEquals(0, cColumn.vector[2]);
+ assertEquals(2, dColumn.offsets[2]);
+ assertEquals(1, dColumn.lengths[2]);
+ assertEquals(100, dElements.vector[2]);
+
+ rows.close();
+ }
+}
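
Aside: the two tests above assert, from opposite directions, that a stripe
footer only carries a PRESENT stream for columns that actually hold nulls in
that stripe. A minimal standalone sketch of the same probe, using only the
reader API the tests already exercise; the class name and the command-line
path are illustrative, and it assumes readStripeFooter stays accessible the
way the tests call it:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.OrcFile;
    import org.apache.orc.OrcProto;
    import org.apache.orc.Reader;
    import org.apache.orc.RecordReader;
    import org.apache.orc.StripeInformation;
    import org.apache.orc.impl.RecordReaderImpl;

    public class PresentStreamProbe {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Reader reader = OrcFile.createReader(new Path(args[0]),
            OrcFile.readerOptions(conf));
        RecordReader rows = reader.rows();
        int i = 0;
        for (StripeInformation stripe : reader.getStripes()) {
          // The footer lists one stream per (column, kind); PRESENT only
          // appears when the column has nulls in this stripe.
          OrcProto.StripeFooter footer =
              ((RecordReaderImpl) rows).readStripeFooter(stripe);
          boolean hasPresent = footer.toString()
              .contains(OrcProto.Stream.Kind.PRESENT.toString());
          System.out.println("stripe " + (i++) + " hasPresent=" + hasPresent);
        }
        rows.close();
      }
    }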
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestOrcTimezone1.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestOrcTimezone1.java b/orc/src/test/org/apache/orc/TestOrcTimezone1.java
new file mode 100644
index 0000000..72dc455
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestOrcTimezone1.java
@@ -0,0 +1,189 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+
+import java.io.File;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.TimeZone;
+
+import junit.framework.Assert;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Tests that ORC timestamp values survive a write/read round trip when the
+ * writer and reader JVMs run under different default time zones.
+ */
+@RunWith(Parameterized.class)
+public class TestOrcTimezone1 {
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+ String writerTimeZone;
+ String readerTimeZone;
+ static TimeZone defaultTimeZone = TimeZone.getDefault();
+
+ public TestOrcTimezone1(String writerTZ, String readerTZ) {
+ this.writerTimeZone = writerTZ;
+ this.readerTimeZone = readerTZ;
+ }
+
+ @Parameterized.Parameters
+ public static Collection<Object[]> data() {
+ List<Object[]> result = Arrays.asList(new Object[][]{
+ /* Extreme timezones */
+ {"GMT-12:00", "GMT+14:00"},
+ /* No difference in DST */
+ {"America/Los_Angeles", "America/Los_Angeles"}, /* same timezone both with DST */
+ {"Europe/Berlin", "Europe/Berlin"}, /* same as above but europe */
+ {"America/Phoenix", "Asia/Kolkata"} /* Writer no DST, Reader no DST */,
+ {"Europe/Berlin", "America/Los_Angeles"} /* Writer DST, Reader DST */,
+ {"Europe/Berlin", "America/Chicago"} /* Writer DST, Reader DST */,
+ /* With DST difference */
+ {"Europe/Berlin", "UTC"},
+ {"UTC", "Europe/Berlin"} /* Writer no DST, Reader DST */,
+ {"America/Los_Angeles", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
+ {"Europe/Berlin", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
+      /* Time zone offsets for the reader have changed historically */
+ {"Asia/Saigon", "Pacific/Enderbury"},
+ {"UTC", "Asia/Jerusalem"},
+
+      // NOTE:
+      // "1995-01-01 03:00:00.688888888" is not a valid time in the Pacific/Enderbury
+      // time zone. At 1995-01-01 00:00:00 the GMT offset jumped from -11:00 to +13:00,
+      // which makes every wall-clock value on 1995-01-01 invalid. Try it with Joda-Time:
+      // new MutableDateTime("1995-01-01", DateTimeZone.forTimeZone(readerTimeZone));
+ });
+ return result;
+ }
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @After
+ public void restoreTimeZone() {
+ TimeZone.setDefault(defaultTimeZone);
+ }
+
+ @Test
+ public void testTimestampWriter() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .bufferSize(10000));
+ assertEquals(writerTimeZone, TimeZone.getDefault().getID());
+ List<String> ts = Lists.newArrayList();
+ ts.add("2003-01-01 01:00:00.000000222");
+ ts.add("1996-08-02 09:00:00.723100809");
+ ts.add("1999-01-01 02:00:00.999999999");
+ ts.add("1995-01-02 03:00:00.688888888");
+ ts.add("2002-01-01 04:00:00.1");
+ ts.add("2010-03-02 05:00:00.000009001");
+ ts.add("2005-01-01 06:00:00.000002229");
+ ts.add("2006-01-01 07:00:00.900203003");
+ ts.add("2003-01-01 08:00:00.800000007");
+ ts.add("1998-11-02 10:00:00.857340643");
+ ts.add("2008-10-02 11:00:00.0");
+ ts.add("2037-01-01 00:00:00.000999");
+ ts.add("2014-03-28 00:00:00.0");
+ VectorizedRowBatch batch = schema.createRowBatch();
+ TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+ for (String t : ts) {
+ times.set(batch.size++, Timestamp.valueOf(t));
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(readerTimeZone, TimeZone.getDefault().getID());
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ times = (TimestampColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
+ }
+ }
+ rows.close();
+ }
+
+ @Test
+ public void testReadTimestampFormat_0_11() throws Exception {
+ TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+ Path oldFilePath = new Path(getClass().getClassLoader().
+ getSystemResource("orc-file-11-format.orc").getPath());
+ Reader reader = OrcFile.createReader(oldFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ TypeDescription schema = reader.getSchema();
+ int col = schema.getFieldNames().indexOf("ts");
+ VectorizedRowBatch batch = schema.createRowBatch(10);
+ TimestampColumnVector ts = (TimestampColumnVector) batch.cols[col];
+
+ boolean[] include = new boolean[schema.getMaximumId() + 1];
+ include[schema.getChildren().get(col).getId()] = true;
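+    // include[] is indexed by column id in the flattened type tree (hence
+    // schema.getMaximumId() + 1 slots), not by top-level field position.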
+ RecordReader rows = reader.rows
+ (new Reader.Options().include(include));
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
+ ts.asScratchTimestamp(0));
+
+    // seek to the last row (7499) and check its contents
+ rows.seekToRow(7499);
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1, batch.size);
+ assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"),
+ ts.asScratchTimestamp(0));
+
+    // confirm there are no more batches, then close the reader
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+}
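
Aside: the round trip above works because ORC stores the writer's time zone
in each stripe footer and compensates on read, while java.sql.Timestamp
parses and prints wall-clock strings in the JVM default zone. The zone
sensitivity the test controls with TimeZone.setDefault can be seen in
isolation; a self-contained sketch (WallClockDemo and the zone pair are
illustrative):

    import java.sql.Timestamp;
    import java.util.TimeZone;

    public class WallClockDemo {
      public static void main(String[] args) {
        String wall = "2003-01-01 01:00:00.000000222";
        TimeZone.setDefault(TimeZone.getTimeZone("GMT-12:00"));
        long writerMillis = Timestamp.valueOf(wall).getTime();
        TimeZone.setDefault(TimeZone.getTimeZone("GMT+14:00"));
        long readerMillis = Timestamp.valueOf(wall).getTime();
        // Same wall-clock string, instants 26 hours apart: prints 93600000.
        System.out.println(writerMillis - readerMillis);
      }
    }

Without the stripe-footer zone those 26 hours would surface as a shifted
timestamp on read; with it, the reader undoes the skew and toString()
reproduces the original wall-clock string.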
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestOrcTimezone2.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestOrcTimezone2.java b/orc/src/test/org/apache/orc/TestOrcTimezone2.java
new file mode 100644
index 0000000..4a02855
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestOrcTimezone2.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.File;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+import java.util.TimeZone;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Tests ORC timestamp round trips across 500 randomly chosen, but seeded and
+ * therefore reproducible, writer/reader time zone pairs.
+ */
+@RunWith(Parameterized.class)
+public class TestOrcTimezone2 {
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+ String writerTimeZone;
+ String readerTimeZone;
+ static TimeZone defaultTimeZone = TimeZone.getDefault();
+
+ public TestOrcTimezone2(String writerTZ, String readerTZ) {
+ this.writerTimeZone = writerTZ;
+ this.readerTimeZone = readerTZ;
+ }
+
+ @Parameterized.Parameters
+ public static Collection<Object[]> data() {
+ String[] allTimeZones = TimeZone.getAvailableIDs();
+ Random rand = new Random(123);
+ int len = allTimeZones.length;
+ int n = 500;
+ Object[][] data = new Object[n][];
+ for (int i = 0; i < n; i++) {
+ int wIdx = rand.nextInt(len);
+ int rIdx = rand.nextInt(len);
+ data[i] = new Object[2];
+ data[i][0] = allTimeZones[wIdx];
+ data[i][1] = allTimeZones[rIdx];
+ }
+ return Arrays.asList(data);
+ }
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @After
+ public void restoreTimeZone() {
+ TimeZone.setDefault(defaultTimeZone);
+ }
+
+ @Test
+ public void testTimestampWriter() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema)
+ .stripeSize(100000).bufferSize(10000));
+ assertEquals(writerTimeZone, TimeZone.getDefault().getID());
+ List<String> ts = Lists.newArrayList();
+ ts.add("2003-01-01 01:00:00.000000222");
+ ts.add("1999-01-01 02:00:00.999999999");
+ ts.add("1995-01-02 03:00:00.688888888");
+ ts.add("2002-01-01 04:00:00.1");
+ ts.add("2010-03-02 05:00:00.000009001");
+ ts.add("2005-01-01 06:00:00.000002229");
+ ts.add("2006-01-01 07:00:00.900203003");
+ ts.add("2003-01-01 08:00:00.800000007");
+ ts.add("1996-08-02 09:00:00.723100809");
+ ts.add("1998-11-02 10:00:00.857340643");
+ ts.add("2008-10-02 11:00:00.0");
+ ts.add("2037-01-01 00:00:00.000999");
+ VectorizedRowBatch batch = schema.createRowBatch();
+ TimestampColumnVector tsc = (TimestampColumnVector) batch.cols[0];
+ for (String t : ts) {
+ tsc.set(batch.size++, Timestamp.valueOf(t));
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(readerTimeZone, TimeZone.getDefault().getID());
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ batch = reader.getSchema().createRowBatch();
+ tsc = (TimestampColumnVector) batch.cols[0];
+ while (rows.nextBatch(batch)) {
+ for (int r=0; r < batch.size; ++r) {
+ assertEquals(ts.get(idx++), tsc.asScratchTimestamp(r).toString());
+ }
+ }
+ rows.close();
+ }
+}
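
Aside: because the Random above is seeded with 123, the 500 writer/reader
pairs are the same on every run against a given JRE, which keeps failures
reproducible; note that TimeZone.getAvailableIDs() itself varies with the
JRE's tzdata, so the concrete pairs can differ between environments. A sketch
of the selection (TimeZonePairs is an illustrative name):

    import java.util.Random;
    import java.util.TimeZone;

    public class TimeZonePairs {
      public static void main(String[] args) {
        String[] zones = TimeZone.getAvailableIDs();
        Random rand = new Random(123);  // fixed seed, reproducible pairs
        for (int i = 0; i < 5; i++) {
          System.out.println(zones[rand.nextInt(zones.length)]
              + " -> " + zones[rand.nextInt(zones.length)]);
        }
      }
    }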
[26/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
new file mode 100644
index 0000000..3ba56f7
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
@@ -0,0 +1,2840 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.EnumMap;
+import java.util.Map;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.orc.OrcProto;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.TypeDescription.Category;
+
+/**
+ * Factory for TreeReaders that convert from the type stored in the file to
+ * the type requested by the reader (schema evolution).
+ */
+public class ConvertTreeReaderFactory extends TreeReaderFactory {
+
+ /**
+   * Override methods like checkEncoding to pass through to the underlying convert TreeReader.
+ */
+ public static class ConvertTreeReader extends TreeReader {
+
+ private TreeReader convertTreeReader;
+
+ ConvertTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ // The ordering of types here is used to determine which numeric types
+ // are common/convertible to one another. Probably better to rely on the
+ // ordering explicitly defined here than to assume that the enum values
+ // that were arbitrarily assigned in PrimitiveCategory work for our purposes.
+ private static EnumMap<TypeDescription.Category, Integer> numericTypes =
+ new EnumMap<>(TypeDescription.Category.class);
+
+ static {
+ registerNumericType(TypeDescription.Category.BOOLEAN, 1);
+ registerNumericType(TypeDescription.Category.BYTE, 2);
+ registerNumericType(TypeDescription.Category.SHORT, 3);
+ registerNumericType(TypeDescription.Category.INT, 4);
+ registerNumericType(TypeDescription.Category.LONG, 5);
+ registerNumericType(TypeDescription.Category.FLOAT, 6);
+ registerNumericType(TypeDescription.Category.DOUBLE, 7);
+ registerNumericType(TypeDescription.Category.DECIMAL, 8);
+ }
+
+ private static void registerNumericType(TypeDescription.Category kind, int level) {
+ numericTypes.put(kind, level);
+ }
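+
+    // For example, with the levels above, reading a LONG (level 5) file
+    // column as an INT (level 4) reader column requires a down cast, while
+    // the reverse widening does not; see integerDownCastNeeded below.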
+
+ protected void setConvertTreeReader(TreeReader convertTreeReader) {
+ this.convertTreeReader = convertTreeReader;
+ }
+
+ protected TreeReader getStringGroupTreeReader(int columnId,
+ TypeDescription fileType) throws IOException {
+ switch (fileType.getCategory()) {
+ case STRING:
+ return new StringTreeReader(columnId);
+ case CHAR:
+ return new CharTreeReader(columnId, fileType.getMaxLength());
+ case VARCHAR:
+ return new VarcharTreeReader(columnId, fileType.getMaxLength());
+ default:
+ throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
+ }
+ }
+
+ protected void assignStringGroupVectorEntry(BytesColumnVector bytesColVector,
+ int elementNum, TypeDescription readerType, byte[] bytes) {
+ assignStringGroupVectorEntry(bytesColVector,
+ elementNum, readerType, bytes, 0, bytes.length);
+ }
+
+ /*
+     * Assign a BytesColumnVector entry when we have a byte array, start, and
+     * length for the string group, which can be STRING, CHAR, or VARCHAR.
+ */
+ protected void assignStringGroupVectorEntry(BytesColumnVector bytesColVector,
+ int elementNum, TypeDescription readerType, byte[] bytes, int start, int length) {
+ switch (readerType.getCategory()) {
+ case STRING:
+ bytesColVector.setVal(elementNum, bytes, start, length);
+ break;
+ case CHAR:
+ {
+ int adjustedDownLen =
+ StringExpr.rightTrimAndTruncate(bytes, start, length, readerType.getMaxLength());
+ bytesColVector.setVal(elementNum, bytes, start, adjustedDownLen);
+ }
+ break;
+ case VARCHAR:
+ {
+ int adjustedDownLen =
+ StringExpr.truncate(bytes, start, length, readerType.getMaxLength());
+ bytesColVector.setVal(elementNum, bytes, start, adjustedDownLen);
+ }
+ break;
+ default:
+ throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
+ }
+ }
+
+ protected void convertStringGroupVectorElement(BytesColumnVector bytesColVector,
+ int elementNum, TypeDescription readerType) {
+ switch (readerType.getCategory()) {
+ case STRING:
+ // No conversion needed.
+ break;
+ case CHAR:
+ {
+ int length = bytesColVector.length[elementNum];
+ int adjustedDownLen = StringExpr
+ .rightTrimAndTruncate(bytesColVector.vector[elementNum],
+ bytesColVector.start[elementNum], length,
+ readerType.getMaxLength());
+ if (adjustedDownLen < length) {
+ bytesColVector.length[elementNum] = adjustedDownLen;
+ }
+ }
+ break;
+ case VARCHAR:
+ {
+ int length = bytesColVector.length[elementNum];
+ int adjustedDownLen = StringExpr
+ .truncate(bytesColVector.vector[elementNum],
+ bytesColVector.start[elementNum], length,
+ readerType.getMaxLength());
+ if (adjustedDownLen < length) {
+ bytesColVector.length[elementNum] = adjustedDownLen;
+ }
+ }
+ break;
+ default:
+ throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
+ }
+ }
+
+ private boolean isParseError;
+
+    /*
+     * We use a flag instead of exceptions because we want the various parse
+     * methods to return a primitive.
+     *
+     * @return true if there was a parse error in the last call to
+     * parseLongFromString, etc.
+     */
+ protected boolean getIsParseError() {
+ return isParseError;
+ }
+
+ protected long parseLongFromString(String string) {
+ try {
+ long longValue = Long.parseLong(string);
+ isParseError = false;
+ return longValue;
+ } catch (NumberFormatException e) {
+ isParseError = true;
+ return 0;
+ }
+ }
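+
+    // Typical use, as in the string-group conversion readers below: call a
+    // parse method, then check getIsParseError() and null out the element
+    // on failure.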
+
+ protected float parseFloatFromString(String string) {
+ try {
+ float floatValue = Float.parseFloat(string);
+ isParseError = false;
+ return floatValue;
+ } catch (NumberFormatException e) {
+ isParseError = true;
+ return Float.NaN;
+ }
+ }
+
+ protected double parseDoubleFromString(String string) {
+ try {
+ double value = Double.parseDouble(string);
+ isParseError = false;
+ return value;
+ } catch (NumberFormatException e) {
+ isParseError = true;
+ return Double.NaN;
+ }
+ }
+
+ /**
+     * @param string the string to parse
+ * @return the HiveDecimal parsed, or null if there was a parse error.
+ */
+ protected HiveDecimal parseDecimalFromString(String string) {
+ try {
+ HiveDecimal value = HiveDecimal.create(string);
+ return value;
+ } catch (NumberFormatException e) {
+ return null;
+ }
+ }
+
+ /**
+     * @param string the string to parse
+ * @return the Timestamp parsed, or null if there was a parse error.
+ */
+ protected Timestamp parseTimestampFromString(String string) {
+ try {
+ Timestamp value = Timestamp.valueOf(string);
+ return value;
+ } catch (IllegalArgumentException e) {
+ return null;
+ }
+ }
+
+ /**
+     * @param string the string to parse
+ * @return the Date parsed, or null if there was a parse error.
+ */
+ protected Date parseDateFromString(String string) {
+ try {
+ Date value = Date.valueOf(string);
+ return value;
+ } catch (IllegalArgumentException e) {
+ return null;
+ }
+ }
+
+ protected String stringFromBytesColumnVectorEntry(
+ BytesColumnVector bytesColVector, int elementNum) {
+ String string;
+
+ string = new String(
+ bytesColVector.vector[elementNum],
+ bytesColVector.start[elementNum], bytesColVector.length[elementNum],
+ StandardCharsets.UTF_8);
+
+ return string;
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ // Pass-thru.
+ convertTreeReader.checkEncoding(encoding);
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ // Pass-thru.
+ convertTreeReader.startStripe(streams, stripeFooter);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ // Pass-thru.
+ convertTreeReader.seek(index);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ // Pass-thru.
+ convertTreeReader.seek(index);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ // Pass-thru.
+ convertTreeReader.skipRows(items);
+ }
+
+    /**
+     * Override this to convert one element; convertVector calls it for each
+     * element that needs conversion. The source and result column vectors
+     * are member variables in the subclass with the right types.
+     * @param elementNum the index of the element to convert
+     * @throws IOException
+     */
+    public void setConvertVectorElement(int elementNum) throws IOException {
+      throw new RuntimeException("Expected this method to be overridden");
+    }
+
+ // Common code used by the conversion.
+ public void convertVector(ColumnVector fromColVector,
+ ColumnVector resultColVector, final int batchSize) throws IOException {
+
+ resultColVector.reset();
+ if (fromColVector.isRepeating) {
+ resultColVector.isRepeating = true;
+ if (fromColVector.noNulls || !fromColVector.isNull[0]) {
+ setConvertVectorElement(0);
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[0] = true;
+ }
+ } else if (fromColVector.noNulls){
+ for (int i = 0; i < batchSize; i++) {
+ setConvertVectorElement(i);
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!fromColVector.isNull[i]) {
+ setConvertVectorElement(i);
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[i] = true;
+ }
+ }
+ }
+ }
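+
+    // When isRepeating is set, only slot 0 of a column vector is meaningful,
+    // which is why the repeating branch converts a single element; the
+    // noNulls branch can skip the per-element isNull checks entirely.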
+
+ public long downCastAnyInteger(long input, TypeDescription readerType) {
+ switch (readerType.getCategory()) {
+ case BOOLEAN:
+ return input == 0 ? 0 : 1;
+ case BYTE:
+ return (byte) input;
+ case SHORT:
+ return (short) input;
+ case INT:
+ return (int) input;
+ case LONG:
+ return input;
+ default:
+ throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
+ }
+ }
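+
+    // For example, downCastAnyInteger(300, BYTE) returns (byte) 300 == 44:
+    // narrowing follows Java cast semantics (values wrap rather than
+    // saturate), and BOOLEAN maps any non-zero input to 1.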
+
+ protected boolean integerDownCastNeeded(TypeDescription fileType, TypeDescription readerType) {
+ Integer fileLevel = numericTypes.get(fileType.getCategory());
+ Integer schemaLevel = numericTypes.get(readerType.getCategory());
+ return (schemaLevel.intValue() < fileLevel.intValue());
+ }
+ }
+
+ public static class AnyIntegerTreeReader extends ConvertTreeReader {
+
+ private TypeDescription.Category fileTypeCategory;
+ private TreeReader anyIntegerTreeReader;
+
+ private long longValue;
+
+ AnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.fileTypeCategory = fileType.getCategory();
+ switch (fileTypeCategory) {
+ case BOOLEAN:
+ anyIntegerTreeReader = new BooleanTreeReader(columnId);
+ break;
+ case BYTE:
+ anyIntegerTreeReader = new ByteTreeReader(columnId);
+ break;
+ case SHORT:
+ anyIntegerTreeReader = new ShortTreeReader(columnId);
+ break;
+ case INT:
+ anyIntegerTreeReader = new IntTreeReader(columnId);
+ break;
+ case LONG:
+ anyIntegerTreeReader = new LongTreeReader(columnId, skipCorrupt);
+ break;
+ default:
+ throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
+ }
+ setConvertTreeReader(anyIntegerTreeReader);
+ }
+
+ protected long getLong() throws IOException {
+ return longValue;
+ }
+
+ protected String getString(long longValue) {
+ if (fileTypeCategory == TypeDescription.Category.BOOLEAN) {
+ return longValue == 0 ? "FALSE" : "TRUE";
+ } else {
+ return Long.toString(longValue);
+ }
+ }
+
+ protected String getString() {
+ return getString(longValue);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ anyIntegerTreeReader.nextVector(previousVector, isNull, batchSize);
+ }
+ }
+
+ public static class AnyIntegerFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private final TypeDescription readerType;
+ private final boolean downCastNeeded;
+
+    AnyIntegerFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+        TypeDescription readerType, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ anyIntegerAsLongTreeReader = new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ downCastNeeded = integerDownCastNeeded(fileType, readerType);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ anyIntegerAsLongTreeReader.nextVector(previousVector, isNull, batchSize);
+ LongColumnVector resultColVector = (LongColumnVector) previousVector;
+ if (downCastNeeded) {
+ long[] resultVector = resultColVector.vector;
+ if (resultColVector.isRepeating) {
+ if (resultColVector.noNulls || !resultColVector.isNull[0]) {
+ resultVector[0] = downCastAnyInteger(resultVector[0], readerType);
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[0] = true;
+ }
+ } else if (resultColVector.noNulls){
+ for (int i = 0; i < batchSize; i++) {
+ resultVector[i] = downCastAnyInteger(resultVector[i], readerType);
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!resultColVector.isNull[i]) {
+ resultVector[i] = downCastAnyInteger(resultVector[i], readerType);
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[i] = true;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ public static class AnyIntegerFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ private final TypeDescription readerType;
+ private FloatWritable floatResult;
+ private DoubleColumnVector doubleColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromFloatTreeReader(int columnId, TypeDescription readerType)
+ throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ floatResult = new FloatWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ float floatValue = (float) doubleColVector.vector[elementNum];
+ longColVector.vector[elementNum] =
+ downCastAnyInteger(
+ (long) floatValue, readerType);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class AnyIntegerFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ private final TypeDescription readerType;
+ private DoubleColumnVector doubleColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromDoubleTreeReader(int columnId, TypeDescription readerType)
+ throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ longColVector.vector[elementNum] =
+ downCastAnyInteger(
+ (long) doubleColVector.vector[elementNum], readerType);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class AnyIntegerFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private final int precision;
+ private final int scale;
+ private final TypeDescription readerType;
+ private HiveDecimalWritable hiveDecimalResult;
+ private DecimalColumnVector decimalColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromDecimalTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ this.readerType = readerType;
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ hiveDecimalResult = new HiveDecimalWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ longColVector.vector[elementNum] =
+ downCastAnyInteger(
+ decimalColVector.vector[elementNum].getHiveDecimal().longValue(),
+ readerType);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class AnyIntegerFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription fileType;
+ private final TypeDescription readerType;
+ private BytesColumnVector bytesColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromStringGroupTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.fileType = fileType;
+ this.readerType = readerType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ long longValue = parseLongFromString(string);
+ if (!getIsParseError()) {
+ longColVector.vector[elementNum] =
+ downCastAnyInteger(longValue, readerType);
+ } else {
+ longColVector.noNulls = false;
+ longColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class AnyIntegerFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private final TypeDescription readerType;
+ private TimestampColumnVector timestampColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromTimestampTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+      // Convert milliseconds to seconds via TimestampUtils.millisToSeconds
+      // (formerly TimestampWritable's getSeconds).
+ long longValue = TimestampUtils.millisToSeconds(
+ timestampColVector.asScratchTimestamp(elementNum).getTime());
+ longColVector.vector[elementNum] =
+ downCastAnyInteger(longValue, readerType);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class FloatFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private LongColumnVector longColVector;
+ private DoubleColumnVector doubleColVector;
+
+ FloatFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ float floatValue = (float) longColVector.vector[elementNum];
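+      // A long converted to float is never NaN, so the else branch below
+      // is purely defensive.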
+ if (!Float.isNaN(floatValue)) {
+ doubleColVector.vector[elementNum] = floatValue;
+ } else {
+ doubleColVector.vector[elementNum] = Double.NaN;
+ doubleColVector.noNulls = false;
+ doubleColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class FloatFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ FloatFromDoubleTreeReader(int columnId) throws IOException {
+ super(columnId);
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ doubleTreeReader.nextVector(previousVector, isNull, batchSize);
+
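+      // Narrow each double to float precision in place: the file column is
+      // DOUBLE, the reader type is FLOAT, and ORC surfaces FLOAT values
+      // through a DoubleColumnVector.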
+ DoubleColumnVector resultColVector = (DoubleColumnVector) previousVector;
+ double[] resultVector = resultColVector.vector;
+ if (resultColVector.isRepeating) {
+ if (resultColVector.noNulls || !resultColVector.isNull[0]) {
+ resultVector[0] = (float) resultVector[0];
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[0] = true;
+ }
+ } else if (resultColVector.noNulls){
+ for (int i = 0; i < batchSize; i++) {
+ resultVector[i] = (float) resultVector[i];
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!resultColVector.isNull[i]) {
+ resultVector[i] = (float) resultVector[i];
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[i] = true;
+ }
+ }
+ }
+ }
+ }
+
+ public static class FloatFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private final int precision;
+ private final int scale;
+ private final TypeDescription readerType;
+ private HiveDecimalWritable hiveDecimalResult;
+ private DecimalColumnVector decimalColVector;
+ private DoubleColumnVector doubleColVector;
+
+ FloatFromDecimalTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ this.readerType = readerType;
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ hiveDecimalResult = new HiveDecimalWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ doubleColVector.vector[elementNum] =
+ (float) decimalColVector.vector[elementNum].getHiveDecimal().doubleValue();
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class FloatFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription fileType;
+ private BytesColumnVector bytesColVector;
+ private DoubleColumnVector doubleColVector;
+
+ FloatFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ this.fileType = fileType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ float floatValue = parseFloatFromString(string);
+ if (!getIsParseError()) {
+ doubleColVector.vector[elementNum] = floatValue;
+ } else {
+ doubleColVector.vector[elementNum] = Double.NaN;
+ doubleColVector.noNulls = false;
+ doubleColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class FloatFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private final TypeDescription readerType;
+ private TimestampColumnVector timestampColVector;
+ private DoubleColumnVector doubleColVector;
+
+ FloatFromTimestampTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ doubleColVector.vector[elementNum] = (float) TimestampUtils.getDouble(
+ timestampColVector.asScratchTimestamp(elementNum));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DoubleFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private LongColumnVector longColVector;
+ private DoubleColumnVector doubleColVector;
+
+ DoubleFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+
+ double doubleValue = (double) longColVector.vector[elementNum];
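+      // A long converted to double is never NaN, so the else branch below
+      // is purely defensive.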
+ if (!Double.isNaN(doubleValue)) {
+ doubleColVector.vector[elementNum] = doubleValue;
+ } else {
+ doubleColVector.vector[elementNum] = Double.NaN;
+ doubleColVector.noNulls = false;
+ doubleColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DoubleFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ private FloatWritable floatResult;
+
+ DoubleFromFloatTreeReader(int columnId) throws IOException {
+ super(columnId);
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ floatResult = new FloatWritable();
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ // The DoubleColumnVector produced by FloatTreeReader is what we want.
+ floatTreeReader.nextVector(previousVector, isNull, batchSize);
+ }
+ }
+
+ public static class DoubleFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private final int precision;
+ private final int scale;
+ private final TypeDescription readerType;
+ private HiveDecimalWritable hiveDecimalResult;
+ private DecimalColumnVector decimalColVector;
+ private DoubleColumnVector doubleColVector;
+
+ DoubleFromDecimalTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ this.readerType = readerType;
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ hiveDecimalResult = new HiveDecimalWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ doubleColVector.vector[elementNum] =
+ decimalColVector.vector[elementNum].getHiveDecimal().doubleValue();
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DoubleFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription fileType;
+ private BytesColumnVector bytesColVector;
+ private DoubleColumnVector doubleColVector;
+
+ DoubleFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ this.fileType = fileType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ double doubleValue = parseDoubleFromString(string);
+ if (!getIsParseError()) {
+ doubleColVector.vector[elementNum] = doubleValue;
+ } else {
+ doubleColVector.noNulls = false;
+ doubleColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DoubleFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private final TypeDescription readerType;
+ private TimestampColumnVector timestampColVector;
+ private DoubleColumnVector doubleColVector;
+
+ DoubleFromTimestampTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ doubleColVector.vector[elementNum] = TimestampUtils.getDouble(
+ timestampColVector.asScratchTimestamp(elementNum));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private int precision;
+ private int scale;
+ private LongColumnVector longColVector;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.precision = readerType.getPrecision();
+ this.scale = readerType.getScale();
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ long longValue = longColVector.vector[elementNum];
+ HiveDecimalWritable hiveDecimalWritable =
+ new HiveDecimalWritable(longValue);
+ decimalColVector.set(elementNum, hiveDecimalWritable);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ private int precision;
+ private int scale;
+ private FloatWritable floatResult;
+ private DoubleColumnVector doubleColVector;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromFloatTreeReader(int columnId, TypeDescription readerType)
+ throws IOException {
+ super(columnId);
+ this.precision = readerType.getPrecision();
+ this.scale = readerType.getScale();
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ floatResult = new FloatWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ float floatValue = (float) doubleColVector.vector[elementNum];
+ if (!Float.isNaN(floatValue)) {
+ HiveDecimal value =
+ HiveDecimal.create(Float.toString(floatValue));
+ if (value != null) {
+ decimalColVector.set(elementNum, value);
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ private DoubleColumnVector doubleColVector;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromDoubleTreeReader(int columnId, TypeDescription readerType)
+ throws IOException {
+ super(columnId);
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ HiveDecimal value =
+ HiveDecimal.create(Double.toString(doubleColVector.vector[elementNum]));
+ if (value != null) {
+ decimalColVector.set(elementNum, value);
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription fileType;
+ private BytesColumnVector bytesColVector;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromStringGroupTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.fileType = fileType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ HiveDecimal value = parseDecimalFromString(string);
+ if (value != null) {
+ decimalColVector.set(elementNum, value);
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private final TypeDescription readerType;
+ private TimestampColumnVector timestampColVector;
+ private int precision;
+ private int scale;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromTimestampTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ this.precision = readerType.getPrecision();
+ this.scale = readerType.getScale();
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ double doubleValue = TimestampUtils.getDouble(
+ timestampColVector.asScratchTimestamp(elementNum));
+ HiveDecimal value = HiveDecimal.create(Double.toString(doubleValue));
+ if (value != null) {
+ decimalColVector.set(elementNum, value);
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private final TypeDescription fileType;
+ private final TypeDescription readerType;
+ private LongColumnVector longColVector;
+ private BytesColumnVector bytesColVector;
+
+ StringGroupFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.fileType = fileType;
+ this.readerType = readerType;
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ long longValue = longColVector.vector[elementNum];
+ String string = anyIntegerAsLongTreeReader.getString(longValue);
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ private final TypeDescription readerType;
+ private FloatWritable floatResult;
+ private DoubleColumnVector doubleColVector;
+ private BytesColumnVector bytesColVector;
+
+ StringGroupFromFloatTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ floatResult = new FloatWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ float floatValue = (float) doubleColVector.vector[elementNum];
+ if (!Float.isNaN(floatValue)) {
+ String string = String.valueOf(floatValue);
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ } else {
+ bytesColVector.noNulls = false;
+ bytesColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ private final TypeDescription readerType;
+ private DoubleColumnVector doubleColVector;
+ private BytesColumnVector bytesColVector;
+
+ StringGroupFromDoubleTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ double doubleValue = doubleColVector.vector[elementNum];
+ if (!Double.isNaN(doubleValue)) {
+ String string = String.valueOf(doubleValue);
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ } else {
+ bytesColVector.noNulls = false;
+ bytesColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private int precision;
+ private int scale;
+ private final TypeDescription readerType;
+ private DecimalColumnVector decimalColVector;
+ private BytesColumnVector bytesColVector;
+
+ StringGroupFromDecimalTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ this.readerType = readerType;
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ String string = decimalColVector.vector[elementNum].getHiveDecimal().toString();
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private final TypeDescription readerType;
+ private TimestampColumnVector timestampColVector;
+ private BytesColumnVector bytesColVector;
+
+ StringGroupFromTimestampTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string =
+ timestampColVector.asScratchTimestamp(elementNum).toString();
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromDateTreeReader extends ConvertTreeReader {
+
+ private DateTreeReader dateTreeReader;
+
+ private final TypeDescription readerType;
+ private LongColumnVector longColVector;
+ private BytesColumnVector bytesColVector;
+ private DateWritable dateWritableResult;
+ private Date date;
+
+ StringGroupFromDateTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ dateTreeReader = new DateTreeReader(columnId);
+ setConvertTreeReader(dateTreeReader);
+ dateWritableResult = new DateWritable();
+ date = new Date(0);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ date.setTime(DateWritable.daysToMillis((int) longColVector.vector[elementNum]));
+ String string = date.toString();
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ dateTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, bytesColVector, batchSize);
+ }
+ }
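
For reference, the long values above are days since the epoch; DateWritable.daysToMillis turns them into milliseconds so Date.toString can render yyyy-MM-dd. A quick standalone check of a value (illustrative only; LocalDate sidesteps the time-zone adjustment that DateWritable applies, and 16941 is a hypothetical value):

    public class DaysToDateSketch {
      public static void main(String[] args) {
        System.out.println(java.time.LocalDate.ofEpochDay(16941)); // 2016-05-20
      }
    }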
+
+ public static class StringGroupFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription fileType;
+ private final TypeDescription readerType;
+
+ StringGroupFromStringGroupTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.fileType = fileType;
+ this.readerType = readerType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ stringGroupTreeReader.nextVector(previousVector, isNull, batchSize);
+
+ BytesColumnVector resultColVector = (BytesColumnVector) previousVector;
+
+ if (resultColVector.isRepeating) {
+ if (resultColVector.noNulls || !resultColVector.isNull[0]) {
+ convertStringGroupVectorElement(resultColVector, 0, readerType);
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[0] = true;
+ }
+      } else if (resultColVector.noNulls) {
+ for (int i = 0; i < batchSize; i++) {
+ convertStringGroupVectorElement(resultColVector, i, readerType);
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!resultColVector.isNull[i]) {
+ convertStringGroupVectorElement(resultColVector, i, readerType);
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[i] = true;
+ }
+ }
+ }
+ }
+ }
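
The three-way traversal above (repeating, no nulls, nullable) is the standard vectorized pattern. Factored out, it looks like this sketch (hypothetical helper, not part of the patch):

    import java.util.function.IntConsumer;

    import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;

    public class VectorTraversalSketch {
      static void convertEach(ColumnVector v, int batchSize, IntConsumer convert) {
        if (v.isRepeating) {
          if (v.noNulls || !v.isNull[0]) {
            convert.accept(0);        // one logical value covers the whole batch
          }
        } else if (v.noNulls) {
          for (int i = 0; i < batchSize; i++) {
            convert.accept(i);
          }
        } else {
          for (int i = 0; i < batchSize; i++) {
            if (!v.isNull[i]) {
              convert.accept(i);      // skip null slots
            }
          }
        }
      }
    }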
+
+ public static class StringGroupFromBinaryTreeReader extends ConvertTreeReader {
+
+ private BinaryTreeReader binaryTreeReader;
+
+ private final TypeDescription readerType;
+ private BytesWritable binaryWritableResult;
+ private BytesColumnVector inBytesColVector;
+ private BytesColumnVector outBytesColVector;
+
+ StringGroupFromBinaryTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ binaryTreeReader = new BinaryTreeReader(columnId);
+ setConvertTreeReader(binaryTreeReader);
+ binaryWritableResult = new BytesWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ byte[] bytes = inBytesColVector.vector[elementNum];
+ int start = inBytesColVector.start[elementNum];
+ int length = inBytesColVector.length[elementNum];
+ byte[] string = new byte[length == 0 ? 0 : 3 * length - 1];
+ for(int p = 0; p < string.length; p += 2) {
+ if (p != 0) {
+ string[p++] = ' ';
+ }
+ int num = 0xff & bytes[start++];
+ int digit = num / 16;
+ string[p] = (byte)((digit) + (digit < 10 ? '0' : 'a' - 10));
+ digit = num % 16;
+ string[p + 1] = (byte)((digit) + (digit < 10 ? '0' : 'a' - 10));
+ }
+ assignStringGroupVectorEntry(outBytesColVector, elementNum, readerType,
+ string, 0, string.length);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (inBytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ inBytesColVector = new BytesColumnVector();
+ outBytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ binaryTreeReader.nextVector(inBytesColVector, isNull, batchSize);
+
+ convertVector(inBytesColVector, outBytesColVector, batchSize);
+ }
+ }
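
The loop above renders each byte as two lowercase hex digits with a single space between bytes, which is why the output buffer is sized 3 * length - 1. The same formatting as a standalone sketch (hypothetical class name; expected output in the comment):

    import java.nio.charset.StandardCharsets;

    public class BinaryToHexSketch {
      static String format(byte[] bytes, int start, int length) {
        byte[] string = new byte[length == 0 ? 0 : 3 * length - 1];
        for (int p = 0; p < string.length; p += 2) {
          if (p != 0) {
            string[p++] = ' ';        // separator before every byte but the first
          }
          int num = 0xff & bytes[start++];
          int digit = num / 16;
          string[p] = (byte) (digit + (digit < 10 ? '0' : 'a' - 10));
          digit = num % 16;
          string[p + 1] = (byte) (digit + (digit < 10 ? '0' : 'a' - 10));
        }
        return new String(string, StandardCharsets.US_ASCII);
      }

      public static void main(String[] args) {
        // Prints "de ad 01".
        System.out.println(format(new byte[]{(byte) 0xde, (byte) 0xad, 0x01}, 0, 3));
      }
    }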
+
+ public static class TimestampFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private LongColumnVector longColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ long longValue = longColVector.vector[elementNum];
+ // UNDONE: What does the boolean setting need to be?
+ timestampColVector.set(elementNum, new Timestamp(longValue));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, timestampColVector, batchSize);
+ }
+ }
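
Note that new Timestamp(long) interprets its argument as epoch milliseconds; whether the integer column should instead be read as epoch seconds is presumably what the UNDONE comment above is asking. A sketch of the two readings (the value is hypothetical):

    import java.sql.Timestamp;

    public class IntegerToTimestampSketch {
      public static void main(String[] args) {
        long longValue = 1463000000L;
        // Read as milliseconds (what the reader above does): mid-January 1970.
        System.out.println(new Timestamp(longValue));
        // Read as seconds: May 2016.
        System.out.println(new Timestamp(longValue * 1000L));
      }
    }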
+
+ public static class TimestampFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ private FloatWritable floatResult;
+ private DoubleColumnVector doubleColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromFloatTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ floatResult = new FloatWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ float floatValue = (float) doubleColVector.vector[elementNum];
+ timestampColVector.set(elementNum,
+ TimestampUtils.doubleToTimestamp(floatValue));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, timestampColVector, batchSize);
+ }
+ }
+
+ public static class TimestampFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ private DoubleColumnVector doubleColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromDoubleTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ double doubleValue = doubleColVector.vector[elementNum];
+ timestampColVector.set(elementNum,
+ TimestampUtils.doubleToTimestamp(doubleValue));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, timestampColVector, batchSize);
+ }
+ }
+
+ public static class TimestampFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private final int precision;
+ private final int scale;
+ private HiveDecimalWritable hiveDecimalResult;
+ private DecimalColumnVector decimalColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromDecimalTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ hiveDecimalResult = new HiveDecimalWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ Timestamp timestampValue =
+ TimestampUtils.decimalToTimestamp(
+ decimalColVector.vector[elementNum].getHiveDecimal());
+ timestampColVector.set(elementNum, timestampValue);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, timestampColVector, batchSize);
+ }
+ }
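
TimestampUtils.decimalToTimestamp treats the decimal as seconds since the epoch, with the fractional digits carrying nanoseconds. A sketch of that split, assuming that semantics and a non-negative value:

    import java.math.BigDecimal;
    import java.sql.Timestamp;

    public class DecimalToTimestampSketch {
      public static void main(String[] args) {
        BigDecimal seconds = new BigDecimal("1.500000001");
        long nanos = seconds.remainder(BigDecimal.ONE)
            .movePointRight(9).longValue();            // 500000001
        Timestamp ts = new Timestamp(seconds.longValue() * 1000L);
        ts.setNanos((int) nanos);
        // toString renders in the local zone, e.g. 1970-01-01 00:00:01.500000001
        System.out.println(ts);
      }
    }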
+
+ public static class TimestampFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription fileType;
+ private BytesColumnVector bytesColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ this.fileType = fileType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String stringValue =
+ stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ Timestamp timestampValue = parseTimestampFromString(stringValue);
+ if (timestampValue != null) {
+ timestampColVector.set(elementNum, timestampValue);
+ } else {
+ timestampColVector.noNulls = false;
+ timestampColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, timestampColVector, batchSize);
+ }
+ }
+
+ public static class TimestampFromDateTreeReader extends ConvertTreeReader {
+
+ private DateTreeReader dateTreeReader;
+
+    private DateWritable dateWritableResult;
+ private LongColumnVector longColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromDateTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ dateTreeReader = new DateTreeReader(columnId);
+ setConvertTreeReader(dateTreeReader);
+      dateWritableResult = new DateWritable();
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ long millis =
+ DateWritable.daysToMillis((int) longColVector.vector[elementNum]);
+ timestampColVector.set(elementNum, new Timestamp(millis));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ dateTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, timestampColVector, batchSize);
+ }
+ }
+
+ public static class DateFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription fileType;
+ private BytesColumnVector bytesColVector;
+ private LongColumnVector longColVector;
+
+ DateFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ this.fileType = fileType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String stringValue =
+ stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ Date dateValue = parseDateFromString(stringValue);
+ if (dateValue != null) {
+ longColVector.vector[elementNum] = DateWritable.dateToDays(dateValue);
+ } else {
+ longColVector.noNulls = false;
+ longColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class DateFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private final TypeDescription readerType;
+ private TimestampColumnVector timestampColVector;
+ private LongColumnVector longColVector;
+
+ DateFromTimestampTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ Date dateValue =
+ DateWritable.timeToDate(TimestampUtils.millisToSeconds(
+ timestampColVector.asScratchTimestamp(elementNum).getTime()));
+ longColVector.vector[elementNum] = DateWritable.dateToDays(dateValue);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, longColVector, batchSize);
+ }
+ }
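
DateWritable.timeToDate above is fed seconds, and TimestampUtils.millisToSeconds uses floor division, so pre-epoch timestamps round toward negative infinity rather than toward zero. A sketch of the assumed rounding:

    public class MillisToSecondsSketch {
      static long millisToSeconds(long millis) {
        return Math.floorDiv(millis, 1000L);   // assumed equivalent behavior
      }

      public static void main(String[] args) {
        System.out.println(millisToSeconds(1500L));    // 1
        System.out.println(millisToSeconds(-1500L));   // -2 (floor, not truncation)
      }
    }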
+
+ public static class BinaryFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription fileType;
+
+ BinaryFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ this.fileType = fileType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ super.nextVector(previousVector, isNull, batchSize);
+ }
+ }
+
+ private static TreeReader createAnyIntegerConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from (BOOLEAN, BYTE, SHORT, INT, LONG) to schema type.
+ //
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ if (fileType.getCategory() == readerType.getCategory()) {
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+ }
+ return new AnyIntegerFromAnyIntegerTreeReader(columnId, fileType, readerType,
+ skipCorrupt);
+
+ case FLOAT:
+ return new FloatFromAnyIntegerTreeReader(columnId, fileType,
+ skipCorrupt);
+
+ case DOUBLE:
+ return new DoubleFromAnyIntegerTreeReader(columnId, fileType,
+ skipCorrupt);
+
+ case DECIMAL:
+ return new DecimalFromAnyIntegerTreeReader(columnId, fileType, readerType, skipCorrupt);
+
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringGroupFromAnyIntegerTreeReader(columnId, fileType, readerType,
+ skipCorrupt);
+
+ case TIMESTAMP:
+ return new TimestampFromAnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+
+ // Not currently supported conversion(s):
+ case BINARY:
+ case DATE:
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createFloatConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from FLOAT to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromFloatTreeReader(columnId, readerType);
+
+ case FLOAT:
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+
+ case DOUBLE:
+ return new DoubleFromFloatTreeReader(columnId);
+
+ case DECIMAL:
+ return new DecimalFromFloatTreeReader(columnId, readerType);
+
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringGroupFromFloatTreeReader(columnId, readerType, skipCorrupt);
+
+ case TIMESTAMP:
+ return new TimestampFromFloatTreeReader(columnId, readerType, skipCorrupt);
+
+ // Not currently supported conversion(s):
+ case BINARY:
+ case DATE:
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createDoubleConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from DOUBLE to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromDoubleTreeReader(columnId, readerType);
+
+ case FLOAT:
+ return new FloatFromDoubleTreeReader(columnId);
+
+ case DOUBLE:
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+
+ case DECIMAL:
+ return new DecimalFromDoubleTreeReader(columnId, readerType);
+
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringGroupFromDoubleTreeReader(columnId, readerType, skipCorrupt);
+
+ case TIMESTAMP:
+ return new TimestampFromDoubleTreeReader(columnId, readerType, skipCorrupt);
+
+ // Not currently supported conversion(s):
+ case BINARY:
+ case DATE:
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createDecimalConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from DECIMAL to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromDecimalTreeReader(columnId, fileType, readerType);
+
+ case FLOAT:
+ return new FloatFromDecimalTreeReader(columnId, fileType, readerType);
+
+ case DOUBLE:
+ return new DoubleFromDecimalTreeReader(columnId, fileType, readerType);
+
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringGroupFromDecimalTreeReader(columnId, fileType, readerType, skipCorrupt);
+
+ case TIMESTAMP:
+ return new TimestampFromDecimalTreeReader(columnId, fileType, skipCorrupt);
+
+ case DECIMAL:
+ // UNDONE: Decimal to Decimal conversion????
+
+ // Not currently supported conversion(s):
+ case BINARY:
+ case DATE:
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createStringConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from STRING to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case FLOAT:
+ return new FloatFromStringGroupTreeReader(columnId, fileType);
+
+ case DOUBLE:
+ return new DoubleFromStringGroupTreeReader(columnId, fileType);
+
+ case DECIMAL:
+ return new DecimalFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case CHAR:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case VARCHAR:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case STRING:
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+
+ case BINARY:
+ return new BinaryFromStringGroupTreeReader(columnId, fileType);
+
+ case TIMESTAMP:
+ return new TimestampFromStringGroupTreeReader(columnId, fileType);
+
+ case DATE:
+ return new DateFromStringGroupTreeReader(columnId, fileType);
+
+ // Not currently supported conversion(s):
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createCharConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from CHAR to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case FLOAT:
+ return new FloatFromStringGroupTreeReader(columnId, fileType);
+
+ case DOUBLE:
+ return new DoubleFromStringGroupTreeReader(columnId, fileType);
+
+ case DECIMAL:
+ return new DecimalFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case STRING:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case VARCHAR:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case CHAR:
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+
+ case BINARY:
+ return new BinaryFromStringGroupTreeReader(columnId, fileType);
+
+ case TIMESTAMP:
+ return new TimestampFromStringGroupTreeReader(columnId, fileType);
+
+ case DATE:
+ return new DateFromStringGroupTreeReader(columnId, fileType);
+
+ // Not currently supported conversion(s):
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createVarcharConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from VARCHAR to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case FLOAT:
+ return new FloatFromStringGroupTreeReader(columnId, fileType);
+
+ case DOUBLE:
+ return new DoubleFromStringGroupTreeReader(columnId, fileType);
+
+ case DECIMAL:
+ return new DecimalFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case STRING:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case CHAR:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case VARCHAR:
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+
+ case BINARY:
+ return new BinaryFromStringGroupTreeReader(columnId, fileType);
+
+ case TIMESTAMP:
+ return new TimestampFromStringGroupTreeReader(columnId, fileType);
+
+ case DATE:
+ return new DateFromStringGroupTreeReader(columnId, fileType);
+
+ // Not currently supported conversion(s):
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createTimestampConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from TIMESTAMP to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyInte
<TRUNCATED>
[07/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java
deleted file mode 100644
index da2c681..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-
-import org.junit.Test;
-
-public class TestOrcWideTable {
-
- @Test
- public void testBufferSizeFor1Col() throws IOException {
- assertEquals(128 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
- 1, 128*1024));
- }
-
- @Test
- public void testBufferSizeFor50Col() throws IOException {
- assertEquals(256 * 1024, WriterImpl.getEstimatedBufferSize(256 * 1024 * 1024,
- 50, 256*1024));
- }
-
- @Test
- public void testBufferSizeFor1000Col() throws IOException {
- assertEquals(32 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
- 1000, 128*1024));
- }
-
- @Test
- public void testBufferSizeFor2000Col() throws IOException {
- assertEquals(16 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
- 2000, 256*1024));
- }
-
- @Test
- public void testBufferSizeFor4000Col() throws IOException {
- assertEquals(8 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
- 4000, 256*1024));
- }
-
- @Test
- public void testBufferSizeFor25000Col() throws IOException {
- assertEquals(4 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
- 25000, 256*1024));
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRLEv2.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRLEv2.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRLEv2.java
deleted file mode 100644
index 1a3559e..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestRLEv2.java
+++ /dev/null
@@ -1,297 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.PrintStream;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-public class TestRLEv2 {
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
- Path testFilePath;
- Configuration conf;
- FileSystem fs;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem () throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestRLEv2." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testFixedDeltaZero() throws Exception {
- ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .inspector(inspector)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
-
- for (int i = 0; i < 5120; ++i) {
- w.addRow(123);
- }
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 123,
- // zigzag encoded varint) and 1 byte delta (delta = 0). In total, 5 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
- System.setOut(origOut);
- }
-
- @Test
- public void testFixedDeltaOne() throws Exception {
- ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .inspector(inspector)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
-
- for (int i = 0; i < 5120; ++i) {
- w.addRow(i % 512);
- }
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
- // and 1 byte delta (delta = 1). In total, 4 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 40"));
- System.setOut(origOut);
- }
-
- @Test
- public void testFixedDeltaOneDescending() throws Exception {
- ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .inspector(inspector)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
-
- for (int i = 0; i < 5120; ++i) {
- w.addRow(512 - (i % 512));
- }
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 2 byte base (base = 512, zigzag + varint)
- // and 1 byte delta (delta = 1). In total, 5 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
- System.setOut(origOut);
- }
-
- @Test
- public void testFixedDeltaLarge() throws Exception {
- ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .inspector(inspector)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
-
- for (int i = 0; i < 5120; ++i) {
- w.addRow(i % 512 + ((i % 512 ) * 100));
- }
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
- // and 2 bytes delta (delta = 100, zigzag encoded varint). In total, 5 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
- System.setOut(origOut);
- }
-
- @Test
- public void testFixedDeltaLargeDescending() throws Exception {
- ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .inspector(inspector)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
-
- for (int i = 0; i < 5120; ++i) {
- w.addRow((512 - i % 512) + ((i % 512 ) * 100));
- }
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 2 byte base (base = 512, zigzag + varint)
- // and 2 bytes delta (delta = 100, zigzag encoded varint). In total, 6 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 60"));
- System.setOut(origOut);
- }
-
- @Test
- public void testShortRepeat() throws Exception {
- ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .inspector(inspector)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
-
- for (int i = 0; i < 5; ++i) {
- w.addRow(10);
- }
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 1 byte header + 1 byte value
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 2"));
- System.setOut(origOut);
- }
-
- @Test
- public void testDeltaUnknownSign() throws Exception {
- ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .inspector(inspector)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
-
- w.addRow(0);
- for (int i = 0; i < 511; ++i) {
- w.addRow(i);
- }
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // monotonicity will be undetermined for this sequence 0,0,1,2,3,...510. Hence DIRECT encoding
-    // will be used. 2 bytes for header and 640 bytes for data (512 values at a fixed bit width of
-    // 10 bits each, 512 * 10 / 8 = 640). Total bytes 642
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 642"));
- System.setOut(origOut);
- }
-
- @Test
- public void testPatchedBase() throws Exception {
- ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Integer.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .inspector(inspector)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
-
- Random rand = new Random(123);
- w.addRow(10000000);
- for (int i = 0; i < 511; ++i) {
- w.addRow(rand.nextInt(i+1));
- }
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // use PATCHED_BASE encoding
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 583"));
- System.setOut(origOut);
- }
-}
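
The stream-length assertions in the deleted tests above all follow from the byte accounting spelled out in their comments: a FIXED_DELTA run costs a 2-byte header plus a zigzag-varint base plus a varint delta. A sketch reproducing the first test's 50 bytes under that assumed layout (hypothetical helper; varint and zigzag sizing only):

    public class RleV2SizeSketch {
      static int zigzagVarIntSize(long v) {
        long zz = (v << 1) ^ (v >> 63);        // zigzag encode
        int size = 1;
        while ((zz >>>= 7) != 0) {
          size++;                              // 7 payload bits per varint byte
        }
        return size;
      }

      public static void main(String[] args) {
        int runs = 5120 / 512;                           // 10 runs of 512 values
        int perRun = 2 + zigzagVarIntSize(123) + 1;      // header + base + delta(0)
        System.out.println(runs * perRun);               // 50, matching "length 50"
      }
    }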
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestReaderImpl.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestReaderImpl.java
deleted file mode 100644
index e0199d6..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestReaderImpl.java
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright 2016 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import java.io.ByteArrayInputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PositionedReadable;
-import org.apache.hadoop.fs.Seekable;
-import org.apache.hadoop.hive.ql.io.FileFormatException;
-import org.apache.hadoop.io.Text;
-import org.junit.Test;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.rules.ExpectedException;
-
-public class TestReaderImpl {
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- private final Path path = new Path("test-file.orc");
- private FSDataInputStream in;
- private int psLen;
- private ByteBuffer buffer;
-
- @Before
- public void setup() {
- in = null;
- }
-
- @Test
- public void testEnsureOrcFooterSmallTextFile() throws IOException {
- prepareTestCase("1".getBytes());
- thrown.expect(FileFormatException.class);
- ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
- }
-
- @Test
- public void testEnsureOrcFooterLargeTextFile() throws IOException {
- prepareTestCase("This is Some Text File".getBytes());
- thrown.expect(FileFormatException.class);
- ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
- }
-
- @Test
- public void testEnsureOrcFooter011ORCFile() throws IOException {
- prepareTestCase(composeContent(OrcFile.MAGIC, "FOOTER"));
- ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
- }
-
- @Test
- public void testEnsureOrcFooterCorrectORCFooter() throws IOException {
- prepareTestCase(composeContent("",OrcFile.MAGIC));
- ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
- }
-
- private void prepareTestCase(byte[] bytes) {
- buffer = ByteBuffer.wrap(bytes);
- psLen = buffer.get(bytes.length - 1) & 0xff;
- in = new FSDataInputStream(new SeekableByteArrayInputStream(bytes));
- }
-
- private byte[] composeContent(String headerStr, String footerStr) throws CharacterCodingException {
- ByteBuffer header = Text.encode(headerStr);
- ByteBuffer footer = Text.encode(footerStr);
- int headerLen = header.remaining();
- int footerLen = footer.remaining() + 1;
-
- ByteBuffer buf = ByteBuffer.allocate(headerLen + footerLen);
-
- buf.put(header);
- buf.put(footer);
- buf.put((byte) footerLen);
- return buf.array();
- }
-
- private static final class SeekableByteArrayInputStream extends ByteArrayInputStream
- implements Seekable, PositionedReadable {
-
- public SeekableByteArrayInputStream(byte[] buf) {
- super(buf);
- }
-
- @Override
- public void seek(long pos) throws IOException {
- this.reset();
- this.skip(pos);
- }
-
- @Override
- public long getPos() throws IOException {
- return pos;
- }
-
- @Override
- public boolean seekToNewSource(long targetPos) throws IOException {
- return false;
- }
-
- @Override
- public int read(long position, byte[] buffer, int offset, int length)
- throws IOException {
- long oldPos = getPos();
- int nread = -1;
- try {
- seek(position);
- nread = read(buffer, offset, length);
- } finally {
- seek(oldPos);
- }
- return nread;
- }
-
- @Override
- public void readFully(long position, byte[] buffer, int offset, int length)
- throws IOException {
- int nread = 0;
- while (nread < length) {
- int nbytes = read(position + nread, buffer, offset + nread, length - nread);
- if (nbytes < 0) {
- throw new EOFException("End of file reached before reading fully.");
- }
- nread += nbytes;
- }
- }
-
- @Override
- public void readFully(long position, byte[] buffer)
- throws IOException {
- readFully(position, buffer, 0, buffer.length);
- }
- }
-}
[05/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java
deleted file mode 100644
index 41a211b..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java
+++ /dev/null
@@ -1,261 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hadoop.io.Text;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcProto;
-
-import org.apache.orc.StripeInformation;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-public class TestStringDictionary {
-
- Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
- + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testTooManyDistinct() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
- .bufferSize(10000));
- for (int i = 0; i < 20000; i++) {
- writer.addRow(new Text(String.valueOf(i)));
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(new Text(String.valueOf(idx++)), row);
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
-      // Hacky, but it does the job: this cast will work as long as this test
-      // resides within the same package as the ORC reader.
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
- }
- }
- }
-
- @Test
- public void testHalfDistinct() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
- .bufferSize(10000));
- Random rand = new Random(123);
- int[] input = new int[20000];
- for (int i = 0; i < 20000; i++) {
- input[i] = rand.nextInt(10000);
- }
-
- for (int i = 0; i < 20000; i++) {
- writer.addRow(new Text(String.valueOf(input[i])));
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(new Text(String.valueOf(input[idx++])), row);
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
-      // Hacky, but it does the job: this cast will work as long as this test
-      // resides within the same package as the ORC reader.
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
- }
- }
- }
-
- @Test
- public void testTooManyDistinctCheckDisabled() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
- .bufferSize(10000));
- for (int i = 0; i < 20000; i++) {
- writer.addRow(new Text(String.valueOf(i)));
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(new Text(String.valueOf(idx++)), row);
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
-      // Hacky, but it does the job: this cast will work as long as this test
-      // resides within the same package as the ORC reader.
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
- }
- }
- }
-
- @Test
- public void testHalfDistinctCheckDisabled() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
- .bufferSize(10000));
- Random rand = new Random(123);
- int[] input = new int[20000];
- for (int i = 0; i < 20000; i++) {
- input[i] = rand.nextInt(10000);
- }
-
- for (int i = 0; i < 20000; i++) {
- writer.addRow(new Text(String.valueOf(input[i])));
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(new Text(String.valueOf(input[idx++])), row);
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
-      // Hacky, but it does the job: this cast will work as long as this test
-      // resides within the same package as the ORC reader.
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
- }
- }
- }
-
- @Test
- public void testTooManyDistinctV11AlwaysDictionary() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
- .version(OrcFile.Version.V_0_11).bufferSize(10000));
- for (int i = 0; i < 20000; i++) {
- writer.addRow(new Text(String.valueOf(i)));
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(new Text(String.valueOf(idx++)), row);
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
-      // Hacky, but it does the job: this cast will work as long as this test
-      // resides within the same package as the ORC reader.
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY, encoding.getKind());
- }
- }
-
- }
-
-}
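An aside on what the deleted tests above were verifying: at each row-index stride the ORC writer compares the number of distinct dictionary keys to the rows written so far and falls back to direct encoding when the ratio crosses a threshold (hive.exec.orc.dictionary.key.size.threshold, 0.8 by default), unless the stride check is disabled as in the *CheckDisabled cases. A minimal sketch of that decision, not the actual writer code:

// Illustrative dictionary-vs-direct decision exercised by the tests above.
static boolean useDictionary(int distinctKeys, int rowsWritten, double threshold) {
  if (rowsWritten == 0) {
    return true; // nothing sampled yet; keep building the dictionary
  }
  return (double) distinctKeys / rowsWritten <= threshold;
}

With 20000 unique strings the ratio is 1.0, hence DIRECT_V2 in testTooManyDistinct; about 10000 distinct values over 20000 rows gives roughly 0.5, hence DICTIONARY_V2 in testHalfDistinct.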
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestTypeDescription.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestTypeDescription.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestTypeDescription.java
deleted file mode 100644
index 96af65a..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestTypeDescription.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.orc.TypeDescription;
-import org.junit.Test;
-
-public class TestTypeDescription {
-
- @Test
- public void testJson() {
- TypeDescription bin = TypeDescription.createBinary();
- assertEquals("{\"category\": \"binary\", \"id\": 0, \"max\": 0}",
- bin.toJson());
- assertEquals("binary", bin.toString());
- TypeDescription struct = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal());
- assertEquals("struct<f1:int,f2:string,f3:decimal(38,10)>",
- struct.toString());
- assertEquals("{\"category\": \"struct\", \"id\": 0, \"max\": 3, \"fields\": [\n"
- + " \"f1\": {\"category\": \"int\", \"id\": 1, \"max\": 1},\n"
- + " \"f2\": {\"category\": \"string\", \"id\": 2, \"max\": 2},\n"
- + " \"f3\": {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 38, \"scale\": 10}]}",
- struct.toJson());
- struct = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createUnion()
- .addUnionChild(TypeDescription.createByte())
- .addUnionChild(TypeDescription.createDecimal()
- .withPrecision(20).withScale(10)))
- .addField("f2", TypeDescription.createStruct()
- .addField("f3", TypeDescription.createDate())
- .addField("f4", TypeDescription.createDouble())
- .addField("f5", TypeDescription.createBoolean()))
- .addField("f6", TypeDescription.createChar().withMaxLength(100));
- assertEquals("struct<f1:uniontype<tinyint,decimal(20,10)>,f2:struct<f3:date,f4:double,f5:boolean>,f6:char(100)>",
- struct.toString());
- assertEquals(
- "{\"category\": \"struct\", \"id\": 0, \"max\": 8, \"fields\": [\n" +
- " \"f1\": {\"category\": \"uniontype\", \"id\": 1, \"max\": 3, \"children\": [\n" +
- " {\"category\": \"tinyint\", \"id\": 2, \"max\": 2},\n" +
- " {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 20, \"scale\": 10}]},\n" +
- " \"f2\": {\"category\": \"struct\", \"id\": 4, \"max\": 7, \"fields\": [\n" +
- " \"f3\": {\"category\": \"date\", \"id\": 5, \"max\": 5},\n" +
- " \"f4\": {\"category\": \"double\", \"id\": 6, \"max\": 6},\n" +
- " \"f5\": {\"category\": \"boolean\", \"id\": 7, \"max\": 7}]},\n" +
- " \"f6\": {\"category\": \"char\", \"id\": 8, \"max\": 8, \"length\": 100}]}",
- struct.toJson());
- }
-}
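Since TestTypeDescription moves with this change, a short usage example of the org.apache.orc.TypeDescription builder it exercised: the same fluent style extends to list and map children. This is an illustration only; the comment shows the expected Hive-style type string.

import org.apache.orc.TypeDescription;

public class TypeDescriptionExample {
  public static void main(String[] args) {
    // Compose a schema with list and map children, in the same builder
    // style the test used for structs and unions.
    TypeDescription schema = TypeDescription.createStruct()
        .addField("tags", TypeDescription.createList(TypeDescription.createString()))
        .addField("counts", TypeDescription.createMap(
            TypeDescription.createString(), TypeDescription.createLong()));
    // Expected: struct<tags:array<string>,counts:map<string,bigint>>
    System.out.println(schema);
  }
}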
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestUnrolledBitPack.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestUnrolledBitPack.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestUnrolledBitPack.java
deleted file mode 100644
index 3251731..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestUnrolledBitPack.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.orc.CompressionKind;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import com.google.common.collect.Lists;
-import com.google.common.primitives.Longs;
-
-@RunWith(value = Parameterized.class)
-public class TestUnrolledBitPack {
-
- private long val;
-
- public TestUnrolledBitPack(long val) {
- this.val = val;
- }
-
- @Parameters
- public static Collection<Object[]> data() {
- Object[][] data = new Object[][] { { -1 }, { 1 }, { 7 }, { -128 }, { 32000 }, { 8300000 },
- { Integer.MAX_VALUE }, { 540000000000L }, { 140000000000000L }, { 36000000000000000L },
- { Long.MAX_VALUE } };
- return Arrays.asList(data);
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
- + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testBitPacking() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0,
- val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val,
- 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0,
- 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0,
- val, 0, val, 0, 0, val, 0, val, 0, 0, val, val };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(10000));
- for (Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
-}
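For context on the test above: "unrolled" refers to the hand-unrolled per-bit-width read and write loops in ORC's integer RLE path, and the parameterized values walk the interesting width boundaries. The core width computation can be sketched as follows (illustrative, not the ORC implementation):

// A run is packed at the maximum width any value in it requires; zero still
// takes one bit. Negative values occupy all 64 bits unless zigzag-encoded
// first (see the RLEv2 notes further down).
static int bitsRequired(long value) {
  int bits = 64 - Long.numberOfLeadingZeros(value);
  return Math.max(bits, 1);
}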
[08/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row-by-row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java
deleted file mode 100644
index f41a7ba..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java
+++ /dev/null
@@ -1,1342 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.orc.CompressionKind;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import com.google.common.collect.Lists;
-import com.google.common.primitives.Longs;
-
-@RunWith(value = Parameterized.class)
-public class TestNewIntegerEncoding {
-
- private OrcFile.EncodingStrategy encodingStrategy;
-
-  public TestNewIntegerEncoding(OrcFile.EncodingStrategy es) {
- this.encodingStrategy = es;
- }
-
- @Parameters
- public static Collection<Object[]> data() {
- Object[][] data = new Object[][] { { OrcFile.EncodingStrategy.COMPRESSION },
- { OrcFile.EncodingStrategy.SPEED } };
- return Arrays.asList(data);
- }
-
- public static class TSRow {
- Timestamp ts;
-
- public TSRow(Timestamp ts) {
- this.ts = ts;
- }
- }
-
- public static class Row {
- Integer int1;
- Long long1;
-
- public Row(int val, long l) {
- this.int1 = val;
- this.long1 = l;
- }
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir", "target"
- + File.separator + "test" + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile."
- + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testBasicRow() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Row.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- writer.addRow(new Row(111, 1111L));
- writer.addRow(new Row(111, 1111L));
- writer.addRow(new Row(111, 1111L));
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(new IntWritable(111), ((OrcStruct) row).getFieldValue(0));
- assertEquals(new LongWritable(1111), ((OrcStruct) row).getFieldValue(1));
- }
- }
-
- @Test
- public void testBasicOld() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6,
- 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1,
- 2, 5, 1, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1,
- 9, 2, 6, 3, 7, 1, 9, 2, 6, 2000, 2, 1, 1, 1, 1, 1, 3, 7, 1, 9, 2, 6, 1,
- 1, 1, 1, 1 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .compress(CompressionKind.NONE)
- .version(OrcFile.Version.V_0_11)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testBasicNew() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6,
- 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1,
- 2, 5, 1, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1,
- 9, 2, 6, 3, 7, 1, 9, 2, 6, 2000, 2, 1, 1, 1, 1, 1, 3, 7, 1, 9, 2, 6, 1,
- 1, 1, 1, 1 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testBasicDelta1() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { -500, -400, -350, -325, -310 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testBasicDelta2() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { -500, -600, -650, -675, -710 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testBasicDelta3() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { 500, 400, 350, 325, 310 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testBasicDelta4() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { 500, 600, 650, 675, 710 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testDeltaOverflow() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory
- .getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
-    long[] inp = new long[]{4513343538618202719L, 4513343538618202711L,
-        2911390882471569739L,
-        -9181829309989854913L};
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(10000));
- for (Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile
- .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testDeltaOverflow2() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory
- .getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
-    long[] inp = new long[]{Long.MAX_VALUE, 4513343538618202711L,
-        2911390882471569739L,
- Long.MIN_VALUE};
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(10000));
- for (Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile
- .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testDeltaOverflow3() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory
- .getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
-    long[] inp = new long[]{-4513343538618202711L, -2911390882471569739L, -2,
- Long.MAX_VALUE};
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(10000));
- for (Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile
- .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testIntegerMin() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- input.add((long) Integer.MIN_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testIntegerMax() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- input.add((long) Integer.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testLongMin() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- input.add(Long.MIN_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testLongMax() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- input.add(Long.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testRandomInt() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 100000; i++) {
- input.add((long) rand.nextInt());
- }
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testRandomLong() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 100000; i++) {
- input.add(rand.nextLong());
- }
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseNegativeMin() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
- 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
- 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
- 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
- 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, -13, 1, 2, 3,
- 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
- 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
- 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
- 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
- 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
- 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
- 2, 16 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseNegativeMin2() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
- 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
- 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
- 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
- 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, -1, 1, 2, 3,
- 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
- 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
- 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
- 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
- 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
- 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
- 2, 16 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseNegativeMin3() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
- 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
- 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
- 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
- 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, 0, 1, 2, 3,
- 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
- 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
- 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
- 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
- 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
- 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
- 2, 16 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseNegativeMin4() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- long[] inp = new long[] { 13, 13, 11, 8, 13, 10, 10, 11, 11, 14, 11, 7, 13,
- 12, 12, 11, 15, 12, 12, 9, 8, 10, 13, 11, 8, 6, 5, 6, 11, 7, 15, 10, 7,
- 6, 8, 7, 9, 9, 11, 33, 11, 3, 7, 4, 6, 10, 14, 12, 5, 14, 7, 6 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseAt0() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(0, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseAt1() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(1, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseAt255() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(255, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseAt256() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(256, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBase510() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(510, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBase511() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(511, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseMax1() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for (int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(60));
- }
- input.set(511, Long.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for (Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseMax2() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for (int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(60));
- }
- input.set(128, Long.MAX_VALUE);
- input.set(256, Long.MAX_VALUE);
- input.set(511, Long.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for (Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseMax3() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- input.add(371946367L);
- input.add(11963367L);
- input.add(68639400007L);
- input.add(100233367L);
- input.add(6367L);
- input.add(10026367L);
- input.add(3670000L);
- input.add(3602367L);
- input.add(4719226367L);
- input.add(7196367L);
- input.add(444442L);
- input.add(210267L);
- input.add(21033L);
- input.add(160267L);
- input.add(400267L);
- input.add(23634347L);
- input.add(16027L);
- input.add(46026367L);
- input.add(Long.MAX_VALUE);
- input.add(33333L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for (Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseMax4() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- for (int i = 0; i < 25; i++) {
- input.add(371292224226367L);
- input.add(119622332222267L);
- input.add(686329400222007L);
- input.add(100233333222367L);
- input.add(636272333322222L);
- input.add(10202633223267L);
- input.add(36700222022230L);
- input.add(36023226224227L);
- input.add(47192226364427L);
- input.add(71963622222447L);
- input.add(22244444222222L);
- input.add(21220263327442L);
- input.add(21032233332232L);
- input.add(16026322232227L);
- input.add(40022262272212L);
- input.add(23634342227222L);
- input.add(16022222222227L);
- input.add(46026362222227L);
- input.add(46026362222227L);
- input.add(33322222222323L);
- }
- input.add(Long.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- for (Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-
- @Test
- public void testPatchedBaseTimestamp() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(TSRow.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
-
- List<Timestamp> tslist = Lists.newArrayList();
- tslist.add(Timestamp.valueOf("2099-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1999-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1995-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2010-03-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1996-08-02 00:00:00"));
- tslist.add(Timestamp.valueOf("1998-11-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
- tslist.add(Timestamp.valueOf("1993-08-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2008-01-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2007-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2008-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1994-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2001-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2000-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2000-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2011-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1974-01-01 00:00:00"));
-
- for (Timestamp ts : tslist) {
- writer.addRow(new TSRow(ts));
- }
-
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(tslist.get(idx++).getNanos(),
- ((TimestampWritable) ((OrcStruct) row).getFieldValue(0)).getNanos());
- }
- }
-
- @Test
- public void testDirectLargeNegatives() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
-
- writer.addRow(-7486502418706614742L);
- writer.addRow(0L);
- writer.addRow(1L);
- writer.addRow(1L);
- writer.addRow(-5535739865598783616L);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- Object row = rows.next(null);
- assertEquals(-7486502418706614742L, ((LongWritable) row).get());
- row = rows.next(row);
- assertEquals(0L, ((LongWritable) row).get());
- row = rows.next(row);
- assertEquals(1L, ((LongWritable) row).get());
- row = rows.next(row);
- assertEquals(1L, ((LongWritable) row).get());
- row = rows.next(row);
- assertEquals(-5535739865598783616L, ((LongWritable) row).get());
- }
-
- @Test
- public void testSeek() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(
- Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 100000; i++) {
- input.add((long) rand.nextInt());
- }
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .compress(CompressionKind.NONE)
- .stripeSize(100000)
- .bufferSize(10000)
- .version(OrcFile.Version.V_0_11)
- .encodingStrategy(encodingStrategy));
- for(Long l : input) {
- writer.addRow(l);
- }
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 55555;
- rows.seekToRow(idx);
- while (rows.hasNext()) {
- Object row = rows.next(null);
- assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
index 1a97a6d..c7c2c9d 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
@@ -45,6 +45,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.llap.TypeDesc;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
@@ -537,7 +538,7 @@ public class TestOrcFile {
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows(null);
+ RecordReader rows = reader.rows();
int idx = 0;
while (rows.hasNext()) {
Object row = rows.next(null);
@@ -574,7 +575,7 @@ public class TestOrcFile {
List<? extends StructField> fields = readerInspector.getAllStructFieldRefs();
HiveDecimalObjectInspector doi = (HiveDecimalObjectInspector) readerInspector.
getStructFieldRef("dec").getFieldObjectInspector();
- RecordReader rows = reader.rows(null);
+ RecordReader rows = reader.rows();
while (rows.hasNext()) {
Object row = rows.next(null);
assertEquals(null, doi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row,
@@ -617,7 +618,7 @@ public class TestOrcFile {
List<? extends StructField> fields = readerInspector.getAllStructFieldRefs();
HiveDecimalObjectInspector doi = (HiveDecimalObjectInspector) readerInspector.
getStructFieldRef("dec").getFieldObjectInspector();
- RecordReader rows = reader.rows(null);
+ RecordReader rows = reader.rows();
int idx = 0;
while (rows.hasNext()) {
Object row = rows.next(null);
@@ -1702,6 +1703,11 @@ public class TestOrcFile {
RecordReader rows = reader.rows();
OrcStruct row = null;
for(int i=COUNT-1; i >= 0; --i) {
+ // since we are walking backwards, seek back a buffer width so that
+ // we load the previous buffer of rows
+ if (i % COUNT == COUNT - 1) {
+ rows.seekToRow(i - (COUNT - 1));
+ }
rows.seekToRow(i);
row = (OrcStruct) rows.next(row);
BigRow expected = createRandomRow(intValues, doubleValues,
@@ -1816,6 +1822,11 @@ public class TestOrcFile {
/* all tests are identical to the other seek() tests */
OrcStruct row = null;
for(int i=COUNT-1; i >= 0; --i) {
+ // since we are walking backwards, seek back a buffer width so that
+ // we load the previous buffer of rows
+ if (i % COUNT == COUNT - 1) {
+ rows.seekToRow(i - (COUNT - 1));
+ }
rows.seekToRow(i);
row = (OrcStruct) rows.next(row);
BigRow expected = createRandomRow(intValues, doubleValues,
@@ -2067,10 +2078,11 @@ public class TestOrcFile {
.range(0L, Long.MAX_VALUE)
.include(new boolean[]{true, true, true})
.searchArgument(sarg, new String[]{null, "int1", "string1"}));
- assertEquals(1000L, rows.getRowNumber());
+ assertEquals(0L, rows.getRowNumber());
OrcStruct row = null;
for(int i=1000; i < 2000; ++i) {
assertTrue(rows.hasNext());
+ assertEquals(i, rows.getRowNumber());
row = (OrcStruct) rows.next(row);
assertEquals(300 * i, ((IntWritable) row.getFieldValue(0)).get());
assertEquals(Integer.toHexString(10*i), row.getFieldValue(1).toString());
@@ -2088,7 +2100,6 @@ public class TestOrcFile {
.range(0L, Long.MAX_VALUE)
.include(new boolean[]{true, true, true})
.searchArgument(sarg, new String[]{null, "int1", "string1"}));
- assertEquals(3500L, rows.getRowNumber());
assertTrue(!rows.hasNext());
// select first 100 and last 100 rows
@@ -2154,4 +2165,53 @@ public class TestOrcFile {
Assert.assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
}
}
+
+ static class MyList {
+ List<Integer> list = new ArrayList<>();
+ }
+
+ @Test
+ public void testListExpansion() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector
+ (MyList.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).inspector(inspector));
+ MyList row = new MyList();
+ row.list.add(1);
+ row.list.add(2);
+ row.list.add(3);
+ writer.addRow(row);
+ row.list.clear();
+ writer.addRow(row);
+ row.list.add(11);
+ row.list.add(12);
+ writer.addRow(row);
+ row.list = null;
+ writer.addRow(row);
+ row.list = new ArrayList<>();
+ row.list.add(21);
+ row.list.add(22);
+ row.list.add(23);
+ row.list.add(24);
+ writer.addRow(row);
+ writer.close();
+ RecordReader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf)).rows();
+ assertEquals(true, reader.hasNext());
+ OrcStruct orcrow = (OrcStruct) reader.next(null);
+ assertEquals(3, ((List<IntWritable>) orcrow.getFieldValue(0)).size());
+ orcrow = (OrcStruct) reader.next(row);
+ assertEquals(0, ((List<IntWritable>) orcrow.getFieldValue(0)).size());
+ orcrow = (OrcStruct) reader.next(row);
+ assertEquals(2, ((List<IntWritable>) orcrow.getFieldValue(0)).size());
+ assertEquals(null, ((OrcStruct) reader.next(row)).getFieldValue(0));
+ orcrow = (OrcStruct) reader.next(row);
+ assertEquals(4, ((List<IntWritable>) orcrow.getFieldValue(0)).size());
+ assertEquals(false, reader.hasNext());
+ reader.close();
+ }
}
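
The new comments in the seek tests above explain the pattern: when walking the file backwards, first seek back a full buffer width so the reader loads the previous buffer of rows before positioning on row i. For reference, the same idea distilled into a standalone sketch that uses only the APIs these tests already exercise (Reader.getNumberOfRows, RecordReader.seekToRow/next); the class and method names are illustrative, not part of the patch:

import java.io.IOException;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;

// Sketch only: scan an ORC file from its last row to its first. Seeking
// backwards is legal, but each seekToRow() may re-read the current buffer,
// which is why the tests pre-seek a buffer width before walking in reverse.
class ReverseScanSketch {
  static void scanBackwards(Reader reader) throws IOException {
    RecordReader rows = reader.rows();
    Object row = null;
    for (long r = reader.getNumberOfRows() - 1; r >= 0; --r) {
      rows.seekToRow(r);     // reposition before every read
      row = rows.next(row);  // reuse the previous row object
      // ... consume row here ...
    }
    rows.close();
  }
}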
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java
deleted file mode 100644
index e96c809..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java
+++ /dev/null
@@ -1,400 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-
-import java.io.File;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.hadoop.io.BooleanWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.IntegerColumnStatistics;
-import org.apache.orc.OrcProto;
-
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.StripeInformation;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-import com.google.common.collect.Lists;
-
-public class TestOrcNullOptimization {
-
- public static class MyStruct {
- Integer a;
- String b;
- Boolean c;
- List<InnerStruct> list = new ArrayList<InnerStruct>();
-
- public MyStruct(Integer a, String b, Boolean c, List<InnerStruct> l) {
- this.a = a;
- this.b = b;
- this.c = c;
- this.list = l;
- }
- }
-
- public static class InnerStruct {
- Integer z;
-
- public InnerStruct(int z) {
- this.z = z;
- }
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcNullOptimization." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testMultiStripeWithNull() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcNullOptimization.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000));
- Random rand = new Random(100);
- writer.addRow(new MyStruct(null, null, true,
- Lists.newArrayList(new InnerStruct(100))));
- for (int i = 2; i < 20000; i++) {
- writer.addRow(new MyStruct(rand.nextInt(1), "a", true, Lists
- .newArrayList(new InnerStruct(100))));
- }
- writer.addRow(new MyStruct(null, null, true,
- Lists.newArrayList(new InnerStruct(100))));
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(20000, reader.getNumberOfRows());
- assertEquals(20000, stats[0].getNumberOfValues());
-
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
- assertEquals("count: 19998 hasNull: true min: 0 max: 0 sum: 0",
- stats[1].toString());
-
- assertEquals("a", ((StringColumnStatistics) stats[2]).getMaximum());
- assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
- assertEquals(19998, stats[2].getNumberOfValues());
- assertEquals("count: 19998 hasNull: true min: a max: a sum: 19998",
- stats[2].toString());
-
- // check the inspectors
- StructObjectInspector readerInspector =
- (StructObjectInspector) reader.getObjectInspector();
- assertEquals(ObjectInspector.Category.STRUCT,
- readerInspector.getCategory());
- assertEquals("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
- readerInspector.getTypeName());
-
- RecordReader rows = reader.rows();
-
- List<Boolean> expected = Lists.newArrayList();
- for (StripeInformation sinfo : reader.getStripes()) {
- expected.add(false);
- }
- // only the first and last stripes will have a PRESENT stream
- expected.set(0, true);
- expected.set(expected.size() - 1, true);
-
- List<Boolean> got = Lists.newArrayList();
- // check if the stripe footer contains a PRESENT stream
- for (StripeInformation sinfo : reader.getStripes()) {
- OrcProto.StripeFooter sf =
- ((RecordReaderImpl) rows).readStripeFooter(sinfo);
- got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
- != -1);
- }
- assertEquals(expected, got);
-
- // row 1
- OrcStruct row = (OrcStruct) rows.next(null);
- assertNotNull(row);
- assertNull(row.getFieldValue(0));
- assertNull(row.getFieldValue(1));
- assertEquals(new BooleanWritable(true), row.getFieldValue(2));
- assertEquals(new IntWritable(100),
- ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).
- getFieldValue(0));
-
- rows.seekToRow(19998);
- // last-1 row
- row = (OrcStruct) rows.next(null);
- assertNotNull(row);
- assertNotNull(row.getFieldValue(1));
- assertEquals(new IntWritable(0), row.getFieldValue(0));
- assertEquals(new BooleanWritable(true), row.getFieldValue(2));
- assertEquals(new IntWritable(100),
- ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).
- getFieldValue(0));
-
- // last row
- row = (OrcStruct) rows.next(row);
- assertNotNull(row);
- assertNull(row.getFieldValue(0));
- assertNull(row.getFieldValue(1));
- assertEquals(new BooleanWritable(true), row.getFieldValue(2));
- assertEquals(new IntWritable(100),
- ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).
- getFieldValue(0));
-
- rows.close();
- }
-
- @Test
- public void testMultiStripeWithoutNull() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcNullOptimization.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000));
- Random rand = new Random(100);
- for (int i = 1; i < 20000; i++) {
- writer.addRow(new MyStruct(rand.nextInt(1), "a", true, Lists
- .newArrayList(new InnerStruct(100))));
- }
- writer.addRow(new MyStruct(0, "b", true,
- Lists.newArrayList(new InnerStruct(100))));
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(20000, reader.getNumberOfRows());
- assertEquals(20000, stats[0].getNumberOfValues());
-
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
- assertEquals("count: 20000 hasNull: false min: 0 max: 0 sum: 0",
- stats[1].toString());
-
- assertEquals("b", ((StringColumnStatistics) stats[2]).getMaximum());
- assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
- assertEquals(20000, stats[2].getNumberOfValues());
- assertEquals("count: 20000 hasNull: false min: a max: b sum: 20000",
- stats[2].toString());
-
- // check the inspectors
- StructObjectInspector readerInspector =
- (StructObjectInspector) reader.getObjectInspector();
- assertEquals(ObjectInspector.Category.STRUCT,
- readerInspector.getCategory());
- assertEquals("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
- readerInspector.getTypeName());
-
- RecordReader rows = reader.rows();
-
- // none of the stripes will have PRESENT stream
- List<Boolean> expected = Lists.newArrayList();
- for (StripeInformation sinfo : reader.getStripes()) {
- expected.add(false);
- }
-
- List<Boolean> got = Lists.newArrayList();
- // check if the stripe footer contains a PRESENT stream
- for (StripeInformation sinfo : reader.getStripes()) {
- OrcProto.StripeFooter sf =
- ((RecordReaderImpl) rows).readStripeFooter(sinfo);
- got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
- != -1);
- }
- assertEquals(expected, got);
-
- rows.seekToRow(19998);
- // last-1 row
- OrcStruct row = (OrcStruct) rows.next(null);
- assertNotNull(row);
- assertNotNull(row.getFieldValue(1));
- assertEquals(new IntWritable(0), row.getFieldValue(0));
- assertEquals("a", row.getFieldValue(1).toString());
- assertEquals(new BooleanWritable(true), row.getFieldValue(2));
- assertEquals(new IntWritable(100),
- ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).
- getFieldValue(0));
-
- // last row
- row = (OrcStruct) rows.next(row);
- assertNotNull(row);
- assertNotNull(row.getFieldValue(0));
- assertNotNull(row.getFieldValue(1));
- assertEquals("b", row.getFieldValue(1).toString());
- assertEquals(new BooleanWritable(true), row.getFieldValue(2));
- assertEquals(new IntWritable(100),
- ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).
- getFieldValue(0));
- rows.close();
- }
-
- @Test
- public void testColumnsWithNullAndCompression() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcNullOptimization.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .stripeSize(100000)
- .bufferSize(10000));
- writer.addRow(new MyStruct(3, "a", true,
- Lists.newArrayList(new InnerStruct(100))));
- writer.addRow(new MyStruct(null, "b", true,
- Lists.newArrayList(new InnerStruct(100))));
- writer.addRow(new MyStruct(3, null, false,
- Lists.newArrayList(new InnerStruct(100))));
- writer.addRow(new MyStruct(3, "d", true,
- Lists.newArrayList(new InnerStruct(100))));
- writer.addRow(new MyStruct(2, "e", true,
- Lists.newArrayList(new InnerStruct(100))));
- writer.addRow(new MyStruct(2, "f", true,
- Lists.newArrayList(new InnerStruct(100))));
- writer.addRow(new MyStruct(2, "g", true,
- Lists.newArrayList(new InnerStruct(100))));
- writer.addRow(new MyStruct(2, "h", true,
- Lists.newArrayList(new InnerStruct(100))));
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(8, reader.getNumberOfRows());
- assertEquals(8, stats[0].getNumberOfValues());
-
- assertEquals(3, ((IntegerColumnStatistics) stats[1]).getMaximum());
- assertEquals(2, ((IntegerColumnStatistics) stats[1]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
- assertEquals(17, ((IntegerColumnStatistics) stats[1]).getSum());
- assertEquals("count: 7 hasNull: true min: 2 max: 3 sum: 17",
- stats[1].toString());
-
- assertEquals("h", ((StringColumnStatistics) stats[2]).getMaximum());
- assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
- assertEquals(7, stats[2].getNumberOfValues());
- assertEquals("count: 7 hasNull: true min: a max: h sum: 7",
- stats[2].toString());
-
- // check the inspectors
- StructObjectInspector readerInspector =
- (StructObjectInspector) reader.getObjectInspector();
- assertEquals(ObjectInspector.Category.STRUCT,
- readerInspector.getCategory());
- assertEquals("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
- readerInspector.getTypeName());
-
- RecordReader rows = reader.rows();
- // only the last stripe will have a PRESENT stream
- List<Boolean> expected = Lists.newArrayList();
- for (StripeInformation sinfo : reader.getStripes()) {
- expected.add(false);
- }
- expected.set(expected.size() - 1, true);
-
- List<Boolean> got = Lists.newArrayList();
- // check if the stripe footer contains a PRESENT stream
- for (StripeInformation sinfo : reader.getStripes()) {
- OrcProto.StripeFooter sf =
- ((RecordReaderImpl) rows).readStripeFooter(sinfo);
- got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
- != -1);
- }
- assertEquals(expected, got);
-
- // row 1
- OrcStruct row = (OrcStruct) rows.next(null);
- assertNotNull(row);
- assertEquals(new IntWritable(3), row.getFieldValue(0));
- assertEquals("a", row.getFieldValue(1).toString());
- assertEquals(new BooleanWritable(true), row.getFieldValue(2));
- assertEquals(new IntWritable(100),
- ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).
- getFieldValue(0));
-
- // row 2
- row = (OrcStruct) rows.next(row);
- assertNotNull(row);
- assertNull(row.getFieldValue(0));
- assertEquals("b", row.getFieldValue(1).toString());
- assertEquals(new BooleanWritable(true), row.getFieldValue(2));
- assertEquals(new IntWritable(100),
- ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).
- getFieldValue(0));
-
- // row 3
- row = (OrcStruct) rows.next(row);
- assertNotNull(row);
- assertNull(row.getFieldValue(1));
- assertEquals(new IntWritable(3), row.getFieldValue(0));
- assertEquals(new BooleanWritable(false), row.getFieldValue(2));
- assertEquals(new IntWritable(100),
- ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).
- getFieldValue(0));
- rows.close();
- }
-}
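
For context on what TestOrcNullOptimization covered: ORC emits a PRESENT bitstream for a column only in stripes where that column actually contains nulls, so a stripe footer with no PRESENT entry means every value in that stripe is non-null. The probe the tests perform, condensed into one sketch; it assumes placement in the org.apache.hadoop.hive.ql.io.orc package so that the package-private RecordReaderImpl.readStripeFooter used above is visible:

package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.orc.OrcProto;
import org.apache.orc.StripeInformation;

// Sketch: report, per stripe, whether the stripe footer advertises any
// PRESENT (null bitmap) stream, the same check the deleted tests made.
class PresentStreamProbe {
  static List<Boolean> stripesWithPresent(Reader reader, RecordReaderImpl rows)
      throws IOException {
    List<Boolean> result = new ArrayList<>();
    for (StripeInformation stripe : reader.getStripes()) {
      OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
      result.add(footer.toString()
          .contains(OrcProto.Stream.Kind.PRESENT.toString()));
    }
    return result;
  }
}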
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java
index 973cc40..0a61fb8 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java
@@ -40,6 +40,8 @@ import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.impl.OrcAcidUtils;
+import org.apache.orc.tools.FileDump;
import org.junit.Test;
public class TestOrcRecordUpdater {
@@ -115,7 +117,7 @@ public class TestOrcRecordUpdater {
assertEquals(5L, updater.getStats().getRowCount());
Path bucketPath = AcidUtils.createFilename(root, options);
- Path sidePath = OrcRecordUpdater.getSideFile(bucketPath);
+ Path sidePath = OrcAcidUtils.getSideFile(bucketPath);
DataInputStream side = fs.open(sidePath);
// read the stopping point for the first flush and make sure we only see
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone1.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone1.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone1.java
deleted file mode 100644
index 526c357..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone1.java
+++ /dev/null
@@ -1,194 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.assertNotNull;
-
-import java.io.File;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.TimeZone;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hadoop.hive.serde2.objectinspector.StructField;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
-import org.apache.hive.common.util.HiveTestUtils;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-
-import com.google.common.collect.Lists;
-
-/**
- *
- */
-@RunWith(Parameterized.class)
-public class TestOrcTimezone1 {
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
- String writerTimeZone;
- String readerTimeZone;
- static TimeZone defaultTimeZone = TimeZone.getDefault();
-
- public TestOrcTimezone1(String writerTZ, String readerTZ) {
- this.writerTimeZone = writerTZ;
- this.readerTimeZone = readerTZ;
- }
-
- @Parameterized.Parameters
- public static Collection<Object[]> data() {
- List<Object[]> result = Arrays.asList(new Object[][]{
- /* Extreme timezones */
- {"GMT-12:00", "GMT+14:00"},
- /* No difference in DST */
- {"America/Los_Angeles", "America/Los_Angeles"}, /* same timezone both with DST */
- {"Europe/Berlin", "Europe/Berlin"}, /* same as above but europe */
- {"America/Phoenix", "Asia/Kolkata"} /* Writer no DST, Reader no DST */,
- {"Europe/Berlin", "America/Los_Angeles"} /* Writer DST, Reader DST */,
- {"Europe/Berlin", "America/Chicago"} /* Writer DST, Reader DST */,
- /* With DST difference */
- {"Europe/Berlin", "UTC"},
- {"UTC", "Europe/Berlin"} /* Writer no DST, Reader DST */,
- {"America/Los_Angeles", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
- {"Europe/Berlin", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
- /* Timezone offsets for the reader has changed historically */
- {"Asia/Saigon", "Pacific/Enderbury"},
- {"UTC", "Asia/Jerusalem"},
-
- // NOTE:
- // "1995-01-01 03:00:00.688888888" is not a valid time in the Pacific/Enderbury
- // timezone. At 1995-01-01 00:00:00 the GMT offset moved from -11:00 to +13:00,
- // which makes every local time on 1995-01-01 invalid. Try it with Joda-Time:
- // new MutableDateTime("1995-01-01", DateTimeZone.forTimeZone(readerTimeZone));
- });
- return result;
- }
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @After
- public void restoreTimeZone() {
- TimeZone.setDefault(defaultTimeZone);
- }
-
- @Test
- public void testTimestampWriter() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Timestamp.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000));
- assertEquals(writerTimeZone, TimeZone.getDefault().getID());
- List<String> ts = Lists.newArrayList();
- ts.add("2003-01-01 01:00:00.000000222");
- ts.add("1996-08-02 09:00:00.723100809");
- ts.add("1999-01-01 02:00:00.999999999");
- ts.add("1995-01-02 03:00:00.688888888");
- ts.add("2002-01-01 04:00:00.1");
- ts.add("2010-03-02 05:00:00.000009001");
- ts.add("2005-01-01 06:00:00.000002229");
- ts.add("2006-01-01 07:00:00.900203003");
- ts.add("2003-01-01 08:00:00.800000007");
- ts.add("1998-11-02 10:00:00.857340643");
- ts.add("2008-10-02 11:00:00.0");
- ts.add("2037-01-01 00:00:00.000999");
- ts.add("2014-03-28 00:00:00.0");
- for (String t : ts) {
- writer.addRow(Timestamp.valueOf(t));
- }
- writer.close();
-
- TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(readerTimeZone, TimeZone.getDefault().getID());
- RecordReader rows = reader.rows(null);
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- Timestamp got = ((TimestampWritable) row).getTimestamp();
- assertEquals(ts.get(idx++), got.toString());
- }
- rows.close();
- }
-
- @Test
- public void testReadTimestampFormat_0_11() throws Exception {
- TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
- Path oldFilePath =
- new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc"));
- Reader reader = OrcFile.createReader(oldFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- StructObjectInspector readerInspector = (StructObjectInspector) reader
- .getObjectInspector();
- List<? extends StructField> fields = readerInspector
- .getAllStructFieldRefs();
- TimestampObjectInspector tso = (TimestampObjectInspector) readerInspector
- .getStructFieldRef("ts").getFieldObjectInspector();
-
- RecordReader rows = reader.rows();
- Object row = rows.next(null);
- assertNotNull(row);
- assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
- tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row,
- fields.get(12))));
-
- // check the contents of a second row, the last one in the file
- assertEquals(true, rows.hasNext());
- rows.seekToRow(7499);
- row = rows.next(null);
- assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"),
- tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row,
- fields.get(12))));
-
- // handle the close up
- assertEquals(false, rows.hasNext());
- rows.close();
- }
-}
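
The NOTE inside data() above deserves a concrete check: Pacific/Enderbury jumped straight from GMT-11:00 to GMT+13:00 at 1995-01-01 00:00, so the entire local day of 1995-01-01 never existed in that zone. A small self-contained sketch with java.time (the deleted comment suggests the same experiment with Joda-Time's MutableDateTime); for a skipped local time, ZoneRules reports an empty list of valid offsets:

import java.time.LocalDateTime;
import java.time.ZoneId;

// Sketch: a local time inside the Enderbury offset gap has no valid offset.
class EnderburyGapCheck {
  public static void main(String[] args) {
    ZoneId zone = ZoneId.of("Pacific/Enderbury");
    LocalDateTime t = LocalDateTime.parse("1995-01-01T03:00:00");
    boolean exists = !zone.getRules().getValidOffsets(t).isEmpty();
    System.out.println(t + " exists in " + zone + "? " + exists); // prints false
  }
}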
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone2.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone2.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone2.java
deleted file mode 100644
index 3eae4a9..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcTimezone2.java
+++ /dev/null
@@ -1,142 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.io.File;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.Random;
-import java.util.TimeZone;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-
-import com.google.common.collect.Lists;
-
-/**
- *
- */
-@RunWith(Parameterized.class)
-public class TestOrcTimezone2 {
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
- String writerTimeZone;
- String readerTimeZone;
- static TimeZone defaultTimeZone = TimeZone.getDefault();
-
- public TestOrcTimezone2(String writerTZ, String readerTZ) {
- this.writerTimeZone = writerTZ;
- this.readerTimeZone = readerTZ;
- }
-
- @Parameterized.Parameters
- public static Collection<Object[]> data() {
- String[] allTimeZones = TimeZone.getAvailableIDs();
- Random rand = new Random(123);
- int len = allTimeZones.length;
- int n = 500;
- Object[][] data = new Object[n][];
- for (int i = 0; i < n; i++) {
- int wIdx = rand.nextInt(len);
- int rIdx = rand.nextInt(len);
- data[i] = new Object[2];
- data[i][0] = allTimeZones[wIdx];
- data[i][1] = allTimeZones[rIdx];
- }
- return Arrays.asList(data);
- }
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @After
- public void restoreTimeZone() {
- TimeZone.setDefault(defaultTimeZone);
- }
-
- @Test
- public void testTimestampWriter() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Timestamp.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
-
- TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000));
- assertEquals(writerTimeZone, TimeZone.getDefault().getID());
- List<String> ts = Lists.newArrayList();
- ts.add("2003-01-01 01:00:00.000000222");
- ts.add("1999-01-01 02:00:00.999999999");
- ts.add("1995-01-02 03:00:00.688888888");
- ts.add("2002-01-01 04:00:00.1");
- ts.add("2010-03-02 05:00:00.000009001");
- ts.add("2005-01-01 06:00:00.000002229");
- ts.add("2006-01-01 07:00:00.900203003");
- ts.add("2003-01-01 08:00:00.800000007");
- ts.add("1996-08-02 09:00:00.723100809");
- ts.add("1998-11-02 10:00:00.857340643");
- ts.add("2008-10-02 11:00:00.0");
- ts.add("2037-01-01 00:00:00.000999");
- for (String t : ts) {
- writer.addRow(Timestamp.valueOf(t));
- }
- writer.close();
-
- TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(readerTimeZone, TimeZone.getDefault().getID());
- RecordReader rows = reader.rows(null);
- int idx = 0;
- while (rows.hasNext()) {
- Object row = rows.next(null);
- Timestamp got = ((TimestampWritable) row).getTimestamp();
- assertEquals(ts.get(idx++), got.toString());
- }
- rows.close();
- }
-}
[23/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
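
"Shims for the row by row reader" describes the adapter this patch introduces: the relocated core reader in the orc module produces VectorizedRowBatch, and a thin Hive-side wrapper keeps the old row-at-a-time RecordReader contract by replaying the current batch one row at a time. A rough sketch of that shape; RowByRowShim and materializeRow are illustrative names, not the patch's actual classes:

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

// Sketch of the shim shape: pull batches from the core reader, hand out rows.
class RowByRowShim {
  private final org.apache.orc.RecordReader batchReader;
  private final VectorizedRowBatch batch;
  private int rowInBatch = 0;

  RowByRowShim(org.apache.orc.RecordReader batchReader,
               VectorizedRowBatch batch) {
    this.batchReader = batchReader;
    this.batch = batch;
    this.batch.size = 0;                    // start with an empty batch
  }

  boolean hasNext() throws IOException {
    if (rowInBatch >= batch.size) {         // current batch exhausted?
      rowInBatch = 0;
      return batchReader.nextBatch(batch) && batch.size > 0;
    }
    return true;
  }

  Object next(Object previous) throws IOException {
    if (!hasNext()) {
      return null;
    }
    // The real shim rebuilds an OrcStruct/Writable tree from the column
    // vectors at this row index; that materialization is elided here.
    return materializeRow(batch, rowInBatch++, previous);
  }

  private Object materializeRow(VectorizedRowBatch b, int row, Object prev) {
    return prev;                            // placeholder materialization
  }
}

Delegating this way keeps every legacy row-by-row caller working while the batch-oriented reader becomes the single decoding path.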
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java b/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
new file mode 100644
index 0000000..6c8ecfd
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
@@ -0,0 +1,2093 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.math.BigInteger;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TimeZone;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.OrcProto;
+
+/**
+ * Factory for creating ORC tree readers.
+ */
+public class TreeReaderFactory {
+
+ public abstract static class TreeReader {
+ protected final int columnId;
+ protected BitFieldReader present = null;
+ protected boolean valuePresent = false;
+ protected int vectorColumnCount;
+
+ TreeReader(int columnId) throws IOException {
+ this(columnId, null);
+ }
+
+ protected TreeReader(int columnId, InStream in) throws IOException {
+ this.columnId = columnId;
+ if (in == null) {
+ present = null;
+ valuePresent = true;
+ } else {
+ present = new BitFieldReader(in, 1);
+ }
+ vectorColumnCount = -1;
+ }
+
+ void setVectorColumnCount(int vectorColumnCount) {
+ this.vectorColumnCount = vectorColumnCount;
+ }
+
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ static IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind,
+ InStream in,
+ boolean signed, boolean skipCorrupt) throws IOException {
+ switch (kind) {
+ case DIRECT_V2:
+ case DICTIONARY_V2:
+ return new RunLengthIntegerReaderV2(in, signed, skipCorrupt);
+ case DIRECT:
+ case DICTIONARY:
+ return new RunLengthIntegerReader(in, signed);
+ default:
+ throw new IllegalArgumentException("Unknown encoding " + kind);
+ }
+ }
+
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ checkEncoding(stripeFooter.getColumnsList().get(columnId));
+ InStream in = streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.PRESENT));
+ if (in == null) {
+ present = null;
+ valuePresent = true;
+ } else {
+ present = new BitFieldReader(in, 1);
+ }
+ }
+
+ /**
+ * Seek to the given position.
+ *
+ * @param index the indexes loaded from the file
+ * @throws IOException
+ */
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ public void seek(PositionProvider index) throws IOException {
+ if (present != null) {
+ present.seek(index);
+ }
+ }
+
+ protected long countNonNulls(long rows) throws IOException {
+ if (present != null) {
+ long result = 0;
+ for (long c = 0; c < rows; ++c) {
+ if (present.next() == 1) {
+ result += 1;
+ }
+ }
+ return result;
+ } else {
+ return rows;
+ }
+ }
+
+ abstract void skipRows(long rows) throws IOException;
+
+ /**
+ * Called at the top level to read into the given batch.
+ * @param batch the batch to read into
+ * @param batchSize the number of rows to read
+ * @throws IOException
+ */
+ public void nextBatch(VectorizedRowBatch batch,
+ int batchSize) throws IOException {
+ batch.cols[0].reset();
+ batch.cols[0].ensureSize(batchSize, false);
+ nextVector(batch.cols[0], null, batchSize);
+ }
+
+ /**
+ * Populates the isNull vector array in the previousVector object based on
+ * the present stream values. This method is called by all the child
+ * readers, which then fill in their values based on the isNull flags.
+ *
+ * @param previous The ColumnVector whose isNull array is populated
+ * @param isNull Whether each value was null at a higher level. If
+ * isNull is null, all values are non-null.
+ * @param batchSize Size of the column vector
+ * @throws IOException
+ */
+ public void nextVector(ColumnVector previous,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (present != null || isNull != null) {
+ // Set noNulls and isNull vector of the ColumnVector based on
+ // present stream
+ previous.noNulls = true;
+ boolean allNull = true;
+ for (int i = 0; i < batchSize; i++) {
+ if (isNull == null || !isNull[i]) {
+ if (present != null && present.next() != 1) {
+ previous.noNulls = false;
+ previous.isNull[i] = true;
+ } else {
+ previous.isNull[i] = false;
+ allNull = false;
+ }
+ } else {
+ previous.noNulls = false;
+ previous.isNull[i] = true;
+ }
+ }
+ previous.isRepeating = !previous.noNulls && allNull;
+ } else {
+ // There is no present stream, this means that all the values are
+ // present.
+ previous.noNulls = true;
+ for (int i = 0; i < batchSize; i++) {
+ previous.isNull[i] = false;
+ }
+ }
+ }
+
+ public BitFieldReader getPresent() {
+ return present;
+ }
+ }
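// Not part of the patch: a worked example of how the PRESENT bits read
// above turn into ColumnVector.isNull. For a batch of four rows whose
// present bits are 1 0 1 1 (1 = value exists):
//
//   present bits : 1     0     1     1
//   isNull flags : false true  false false
//   noNulls      : false  (at least one null was seen)
//   isRepeating  : false  (the rows are not all null)
//
// When a column has no PRESENT stream (and no inherited isNull mask),
// nextVector() clears every isNull entry and leaves noNulls == true,
// which is the fast path taken in the else branch above.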
+
+ public static class NullTreeReader extends TreeReader {
+
+ public NullTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ @Override
+ public void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter footer) {
+ // PASS
+ }
+
+ @Override
+ void skipRows(long rows) {
+ // PASS
+ }
+
+ @Override
+ public void seek(PositionProvider position) {
+ // PASS
+ }
+
+ @Override
+ public void seek(PositionProvider[] position) {
+ // PASS
+ }
+
+ @Override
+ public void nextVector(ColumnVector vector, boolean[] isNull, int size) {
+ vector.noNulls = false;
+ vector.isNull[0] = true;
+ vector.isRepeating = true;
+ }
+ }
+
+ public static class BooleanTreeReader extends TreeReader {
+ protected BitFieldReader reader = null;
+
+ BooleanTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected BooleanTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ if (data != null) {
+ reader = new BitFieldReader(data, 1);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ reader = new BitFieldReader(streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)), 1);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, batchSize);
+ }
+ }
+
+ public static class ByteTreeReader extends TreeReader {
+ protected RunLengthByteReader reader = null;
+
+ ByteTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected ByteTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.reader = new RunLengthByteReader(data);
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ reader = new RunLengthByteReader(streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)));
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class ShortTreeReader extends TreeReader {
+ protected IntegerReader reader = null;
+
+ ShortTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected ShortTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class IntTreeReader extends TreeReader {
+ protected IntegerReader reader = null;
+
+ IntTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected IntTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class LongTreeReader extends TreeReader {
+ protected IntegerReader reader = null;
+
+ LongTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ this(columnId, null, null, null, skipCorrupt);
+ }
+
+ protected LongTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding,
+ boolean skipCorrupt)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, skipCorrupt);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class FloatTreeReader extends TreeReader {
+ protected InStream stream;
+ private final SerializationUtils utils;
+
+ FloatTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected FloatTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.utils = new SerializationUtils();
+ this.stream = data;
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final DoubleColumnVector result = (DoubleColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ final boolean hasNulls = !result.noNulls;
+ boolean allNulls = hasNulls;
+
+ if (hasNulls) {
+ // extra loop conditions let the JIT elide the array bounds checks
+ for (int i = 0; batchSize <= result.isNull.length && i < batchSize; i++) {
+ allNulls = allNulls & result.isNull[i];
+ }
+ if (allNulls) {
+ result.vector[0] = Double.NaN;
+ result.isRepeating = true;
+ } else {
+ // some nulls
+ result.isRepeating = false;
+ // extra loop conditions let the JIT elide the array bounds checks
+ for (int i = 0; batchSize <= result.isNull.length
+ && batchSize <= result.vector.length && i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ result.vector[i] = utils.readFloat(stream);
+ } else {
+ // If the value is not present then set NaN
+ result.vector[i] = Double.NaN;
+ }
+ }
+ }
+ } else {
+ // no nulls & > 1 row (check repeating)
+ boolean repeating = (batchSize > 1);
+ final float f1 = utils.readFloat(stream);
+ result.vector[0] = f1;
+ // extra loop conditions let the JIT elide the array bounds checks
+ for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
+ final float f2 = utils.readFloat(stream);
+ repeating = repeating && (f1 == f2);
+ result.vector[i] = f2;
+ }
+ result.isRepeating = repeating;
+ }
+ }
+
+ @Override
+ protected void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ for (int i = 0; i < items; ++i) {
+ utils.readFloat(stream);
+ }
+ }
+ }
+
+ public static class DoubleTreeReader extends TreeReader {
+ protected InStream stream;
+ private final SerializationUtils utils;
+
+ DoubleTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected DoubleTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.utils = new SerializationUtils();
+ this.stream = data;
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name =
+ new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final DoubleColumnVector result = (DoubleColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ final boolean hasNulls = !result.noNulls;
+ boolean allNulls = hasNulls;
+
+ if (hasNulls) {
+ // extra loop conditions let the JIT elide the array bounds checks
+ for (int i = 0; i < batchSize && batchSize <= result.isNull.length; i++) {
+ allNulls = allNulls & result.isNull[i];
+ }
+ if (allNulls) {
+ result.vector[0] = Double.NaN;
+ result.isRepeating = true;
+ } else {
+ // some nulls
+ result.isRepeating = false;
+ // extra loop conditions let the JIT elide the array bounds checks
+ for (int i = 0; batchSize <= result.isNull.length
+ && batchSize <= result.vector.length && i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ result.vector[i] = utils.readDouble(stream);
+ } else {
+ // If the value is not present then set NaN
+ result.vector[i] = Double.NaN;
+ }
+ }
+ }
+ } else {
+ // no nulls
+ boolean repeating = (batchSize > 1);
+ final double d1 = utils.readDouble(stream);
+ result.vector[0] = d1;
+ // the extra length check lets the JIT skip per-element bounds checks
+ for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
+ final double d2 = utils.readDouble(stream);
+ repeating = repeating && (d1 == d2);
+ result.vector[i] = d2;
+ }
+ result.isRepeating = repeating;
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long len = items * 8;
+ while (len > 0) {
+ len -= stream.skip(len);
+ }
+ }
+ }
+
+ public static class BinaryTreeReader extends TreeReader {
+ protected InStream stream;
+ protected IntegerReader lengths = null;
+ protected final LongColumnVector scratchlcv;
+
+ BinaryTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null);
+ }
+
+ protected BinaryTreeReader(int columnId, InStream present, InStream data, InStream length,
+ OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ scratchlcv = new LongColumnVector();
+ this.stream = data;
+ if (length != null && encoding != null) {
+ checkEncoding(encoding);
+ this.lengths = createIntegerReader(encoding.getKind(), length, false, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), false, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ lengths.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final BytesColumnVector result = (BytesColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ BytesColumnVectorUtil.readOrcByteArrays(stream, lengths, scratchlcv, result, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
+ for (int i = 0; i < items; ++i) {
+ lengthToSkip += lengths.next();
+ }
+ while (lengthToSkip > 0) {
+ lengthToSkip -= stream.skip(lengthToSkip);
+ }
+ }
+ }
+
+ public static class TimestampTreeReader extends TreeReader {
+ protected IntegerReader data = null;
+ protected IntegerReader nanos = null;
+ private final boolean skipCorrupt;
+ private Map<String, Long> baseTimestampMap;
+ private long baseTimestamp;
+ private final TimeZone readerTimeZone;
+ private TimeZone writerTimeZone;
+ private boolean hasSameTZRules;
+
+ TimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ this(columnId, null, null, null, null, skipCorrupt);
+ }
+
+ protected TimestampTreeReader(int columnId, InStream presentStream, InStream dataStream,
+ InStream nanosStream, OrcProto.ColumnEncoding encoding, boolean skipCorrupt)
+ throws IOException {
+ super(columnId, presentStream);
+ this.skipCorrupt = skipCorrupt;
+ this.baseTimestampMap = new HashMap<>();
+ this.readerTimeZone = TimeZone.getDefault();
+ this.writerTimeZone = readerTimeZone;
+ this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
+ this.baseTimestamp = getBaseTimestamp(readerTimeZone.getID());
+ if (encoding != null) {
+ checkEncoding(encoding);
+
+ if (dataStream != null) {
+ this.data = createIntegerReader(encoding.getKind(), dataStream, true, skipCorrupt);
+ }
+
+ if (nanosStream != null) {
+ this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt);
+ }
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ data = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)), true, skipCorrupt);
+ nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt);
+ baseTimestamp = getBaseTimestamp(stripeFooter.getWriterTimezone());
+ }
+
+ private long getBaseTimestamp(String timeZoneId) throws IOException {
+ // to make sure new readers read old files in the same way
+ if (timeZoneId == null || timeZoneId.isEmpty()) {
+ timeZoneId = readerTimeZone.getID();
+ }
+
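+ // Parse the base timestamp once per writer time zone and cache it; a
+ // cache miss also updates writerTimeZone and the same-rules flag.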
+ if (!baseTimestampMap.containsKey(timeZoneId)) {
+ writerTimeZone = TimeZone.getTimeZone(timeZoneId);
+ hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ sdf.setTimeZone(writerTimeZone);
+ try {
+ long epoch =
+ sdf.parse(WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND;
+ baseTimestampMap.put(timeZoneId, epoch);
+ return epoch;
+ } catch (ParseException e) {
+ throw new IOException("Unable to create base timestamp", e);
+ } finally {
+ sdf.setTimeZone(readerTimeZone);
+ }
+ }
+
+ return baseTimestampMap.get(timeZoneId);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ data.seek(index);
+ nanos.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ TimestampColumnVector result = (TimestampColumnVector) previousVector;
+ super.nextVector(previousVector, isNull, batchSize);
+
+ for (int i = 0; i < batchSize; i++) {
+ if (result.noNulls || !result.isNull[i]) {
+ long millis = data.next() + baseTimestamp;
+ int newNanos = parseNanos(nanos.next());
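+ // nanoseconds always count forward from the start of the second, so for
+ // negative (pre-epoch) values a non-zero nanosecond part means the
+ // stored second must be rounded down by one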
+ if (millis < 0 && newNanos != 0) {
+ millis -= 1;
+ }
+ millis *= WriterImpl.MILLIS_PER_SECOND;
+ long offset = 0;
+ // If the reader and writer time zones have different rules, adjust for
+ // the offset difference between them, taking daylight saving time into account.
+ if (!hasSameTZRules) {
+ offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis);
+ }
+ long adjustedMillis = millis + offset;
+ // The reader time zone's offset may differ at the adjusted instant
+ // (e.g. when the adjustment crosses a DST transition). If it does,
+ // recompute the offset at adjustedMillis and apply that instead.
+ if (!hasSameTZRules &&
+ (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) {
+ long newOffset =
+ writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis);
+ adjustedMillis = millis + newOffset;
+ }
+ result.time[i] = adjustedMillis;
+ result.nanos[i] = newNanos;
+ if (result.isRepeating && i != 0 &&
+ (result.time[0] != result.time[i] ||
+ result.nanos[0] != result.nanos[i])) {
+ result.isRepeating = false;
+ }
+ }
+ }
+ }
+
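+ /**
+ * Decode the ORC nanosecond encoding: the low three bits hold a
+ * trailing-zero count hint and the remaining bits the significant digits.
+ * For example, a serialized value of 986 yields zeros = 986 & 7 = 2 and
+ * result = 986 >>> 3 = 123, which is then multiplied by 10 three times
+ * to recover 123000 nanoseconds.
+ */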
+ private static int parseNanos(long serialized) {
+ int zeros = 7 & (int) serialized;
+ int result = (int) (serialized >>> 3);
+ if (zeros != 0) {
+ for (int i = 0; i <= zeros; ++i) {
+ result *= 10;
+ }
+ }
+ return result;
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ data.skip(items);
+ nanos.skip(items);
+ }
+ }
+
+ public static class DateTreeReader extends TreeReader {
+ protected IntegerReader reader = null;
+
+ DateTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected DateTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class DecimalTreeReader extends TreeReader {
+ protected InStream valueStream;
+ protected IntegerReader scaleReader = null;
+ private int[] scratchScaleVector;
+
+ private final int precision;
+ private final int scale;
+
+ DecimalTreeReader(int columnId, int precision, int scale) throws IOException {
+ this(columnId, precision, scale, null, null, null, null);
+ }
+
+ protected DecimalTreeReader(int columnId, int precision, int scale, InStream present,
+ InStream valueStream, InStream scaleStream, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ this.precision = precision;
+ this.scale = scale;
+ this.scratchScaleVector = new int[VectorizedRowBatch.DEFAULT_SIZE];
+ this.valueStream = valueStream;
+ if (scaleStream != null && encoding != null) {
+ checkEncoding(encoding);
+ this.scaleReader = createIntegerReader(encoding.getKind(), scaleStream, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ valueStream = streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA));
+ scaleReader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId, OrcProto.Stream.Kind.SECONDARY)), true, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ valueStream.seek(index);
+ scaleReader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final DecimalColumnVector result = (DecimalColumnVector) previousVector;
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ if (batchSize > scratchScaleVector.length) {
+ scratchScaleVector = new int[batchSize];
+ }
+ // read the scales
+ scaleReader.nextVector(result, scratchScaleVector, batchSize);
+ // Read value entries based on isNull entries
+ if (result.noNulls) {
+ for (int r=0; r < batchSize; ++r) {
+ BigInteger bInt = SerializationUtils.readBigInteger(valueStream);
+ HiveDecimal dec = HiveDecimal.create(bInt, scratchScaleVector[r]);
+ result.set(r, dec);
+ }
+ } else if (!result.isRepeating || !result.isNull[0]) {
+ for (int r=0; r < batchSize; ++r) {
+ if (!result.isNull[r]) {
+ BigInteger bInt = SerializationUtils.readBigInteger(valueStream);
+ HiveDecimal dec = HiveDecimal.create(bInt, scratchScaleVector[r]);
+ result.set(r, dec);
+ }
+ }
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ for (int i = 0; i < items; i++) {
+ SerializationUtils.readBigInteger(valueStream);
+ }
+ scaleReader.skip(items);
+ }
+ }
+
+ /**
+ * A tree reader that will read string columns. At the start of the
+ * stripe, it creates an internal reader based on whether a direct or
+ * dictionary encoding was used.
+ */
+ public static class StringTreeReader extends TreeReader {
+ protected TreeReader reader;
+
+ StringTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ protected StringTreeReader(int columnId, InStream present, InStream data, InStream length,
+ InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ if (encoding != null) {
+ switch (encoding.getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId, present, data, length,
+ encoding.getKind());
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary,
+ encoding);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ encoding.getKind());
+ }
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ reader.checkEncoding(encoding);
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ // For each stripe, checks the encoding and initializes the appropriate
+ // reader
+ switch (stripeFooter.getColumnsList().get(columnId).getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId);
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ stripeFooter.getColumnsList().get(columnId).getKind());
+ }
+ reader.startStripe(streams, stripeFooter);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ reader.nextVector(previousVector, isNull, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skipRows(items);
+ }
+ }
+
+ // This class collects the closely related methods for reading an ORC vector
+ // of byte arrays and populating the BytesColumnVector.
+ public static class BytesColumnVectorUtil {
+
+ private static byte[] commonReadByteArrays(InStream stream, IntegerReader lengths,
+ LongColumnVector scratchlcv,
+ BytesColumnVector result, final int batchSize) throws IOException {
+ // Read lengths
+ scratchlcv.isNull = result.isNull; // Notice we are replacing the isNull vector here...
+ lengths.nextVector(scratchlcv, scratchlcv.vector, batchSize);
+ int totalLength = 0;
+ if (!scratchlcv.isRepeating) {
+ for (int i = 0; i < batchSize; i++) {
+ if (!scratchlcv.isNull[i]) {
+ totalLength += (int) scratchlcv.vector[i];
+ }
+ }
+ } else {
+ if (!scratchlcv.isNull[0]) {
+ totalLength = (int) (batchSize * scratchlcv.vector[0]);
+ }
+ }
+
+ // Read all the strings for this batch
+ byte[] allBytes = new byte[totalLength];
+ int offset = 0;
+ int len = totalLength;
+ while (len > 0) {
+ int bytesRead = stream.read(allBytes, offset, len);
+ if (bytesRead < 0) {
+ throw new EOFException("Can't finish byte read from " + stream);
+ }
+ len -= bytesRead;
+ offset += bytesRead;
+ }
+
+ return allBytes;
+ }
+
+ // This method has the common code for reading in bytes into a BytesColumnVector.
+ public static void readOrcByteArrays(InStream stream,
+ IntegerReader lengths,
+ LongColumnVector scratchlcv,
+ BytesColumnVector result,
+ final int batchSize) throws IOException {
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ byte[] allBytes = commonReadByteArrays(stream, lengths, scratchlcv,
+ result, batchSize);
+
+ // Too expensive to figure out 'repeating' by comparisons.
+ result.isRepeating = false;
+ int offset = 0;
+ if (!scratchlcv.isRepeating) {
+ for (int i = 0; i < batchSize; i++) {
+ if (!scratchlcv.isNull[i]) {
+ result.setRef(i, allBytes, offset, (int) scratchlcv.vector[i]);
+ offset += scratchlcv.vector[i];
+ } else {
+ result.setRef(i, allBytes, 0, 0);
+ }
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!scratchlcv.isNull[i]) {
+ result.setRef(i, allBytes, offset, (int) scratchlcv.vector[0]);
+ offset += scratchlcv.vector[0];
+ } else {
+ result.setRef(i, allBytes, 0, 0);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * A reader for string columns that are direct encoded in the current
+ * stripe.
+ */
+ public static class StringDirectTreeReader extends TreeReader {
+ private static final HadoopShims SHIMS = HadoopShims.Factory.get();
+ protected InStream stream;
+ protected HadoopShims.TextReaderShim data;
+ protected IntegerReader lengths;
+ private final LongColumnVector scratchlcv;
+
+ StringDirectTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null);
+ }
+
+ protected StringDirectTreeReader(int columnId, InStream present, InStream data,
+ InStream length, OrcProto.ColumnEncoding.Kind encoding) throws IOException {
+ super(columnId, present);
+ this.scratchlcv = new LongColumnVector();
+ this.stream = data;
+ if (length != null && encoding != null) {
+ this.lengths = createIntegerReader(encoding, length, false, false);
+ this.data = SHIMS.getTextReaderShim(this.stream);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT &&
+ encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ data = SHIMS.getTextReaderShim(this.stream);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)),
+ false, false);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ // 'data' wraps 'stream', so the shim itself does not need a separate seek
+ lengths.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final BytesColumnVector result = (BytesColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ BytesColumnVectorUtil.readOrcByteArrays(stream, lengths, scratchlcv,
+ result, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
+ for (int i = 0; i < items; ++i) {
+ lengthToSkip += lengths.next();
+ }
+
+ while (lengthToSkip > 0) {
+ lengthToSkip -= stream.skip(lengthToSkip);
+ }
+ }
+
+ public IntegerReader getLengths() {
+ return lengths;
+ }
+
+ public InStream getStream() {
+ return stream;
+ }
+ }
+
+ /**
+ * A reader for string columns that are dictionary encoded in the current
+ * stripe.
+ */
+ public static class StringDictionaryTreeReader extends TreeReader {
+ private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];
+ private DynamicByteArray dictionaryBuffer;
+ private int[] dictionaryOffsets;
+ protected IntegerReader reader;
+
+ private byte[] dictionaryBufferInBytesCache = null;
+ private final LongColumnVector scratchlcv;
+
+ StringDictionaryTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null, null);
+ }
+
+ protected StringDictionaryTreeReader(int columnId, InStream present, InStream data,
+ InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ scratchlcv = new LongColumnVector();
+ if (data != null && encoding != null) {
+ this.reader = createIntegerReader(encoding.getKind(), data, false, false);
+ }
+
+ if (dictionary != null && encoding != null) {
+ readDictionaryStream(dictionary);
+ }
+
+ if (length != null && encoding != null) {
+ readDictionaryLengthStream(length, encoding);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY &&
+ encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+
+ // read the dictionary blob
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ InStream in = streams.get(name);
+ readDictionaryStream(in);
+
+ // read the lengths
+ name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
+ in = streams.get(name);
+ readDictionaryLengthStream(in, stripeFooter.getColumnsList().get(columnId));
+
+ // set up the row reader
+ name = new StreamName(columnId, OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), false, false);
+ }
+
+ private void readDictionaryLengthStream(InStream in, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ int dictionarySize = encoding.getDictionarySize();
+ if (in != null) { // Guard against empty LENGTH stream.
+ IntegerReader lenReader = createIntegerReader(encoding.getKind(), in, false, false);
+ int offset = 0;
+ if (dictionaryOffsets == null ||
+ dictionaryOffsets.length < dictionarySize + 1) {
+ dictionaryOffsets = new int[dictionarySize + 1];
+ }
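+ // convert entry lengths into cumulative offsets, e.g. lengths 3, 1, 4
+ // become offsets 0, 3, 4, 8, with the last slot holding the total size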
+ for (int i = 0; i < dictionarySize; ++i) {
+ dictionaryOffsets[i] = offset;
+ offset += (int) lenReader.next();
+ }
+ dictionaryOffsets[dictionarySize] = offset;
+ in.close();
+ }
+
+ }
+
+ private void readDictionaryStream(InStream in) throws IOException {
+ if (in != null) { // Guard against empty dictionary stream.
+ if (in.available() > 0) {
+ dictionaryBuffer = new DynamicByteArray(64, in.available());
+ dictionaryBuffer.readAll(in);
+ // A new stripe is starting, so invalidate the cached dictionary bytes.
+ dictionaryBufferInBytesCache = null;
+ }
+ in.close();
+ } else {
+ dictionaryBuffer = null;
+ }
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final BytesColumnVector result = (BytesColumnVector) previousVector;
+ int offset;
+ int length;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ if (dictionaryBuffer != null) {
+
+ // Load dictionaryBuffer into cache.
+ if (dictionaryBufferInBytesCache == null) {
+ dictionaryBufferInBytesCache = dictionaryBuffer.get();
+ }
+
+ // Read string offsets
+ scratchlcv.isNull = result.isNull;
+ scratchlcv.ensureSize(batchSize, false);
+ reader.nextVector(scratchlcv, scratchlcv.vector, batchSize);
+ if (!scratchlcv.isRepeating) {
+
+ // The vector has non-repeating strings. Iterate through the batch
+ // and set the strings one by one.
+ for (int i = 0; i < batchSize; i++) {
+ if (!scratchlcv.isNull[i]) {
+ offset = dictionaryOffsets[(int) scratchlcv.vector[i]];
+ length = getDictionaryEntryLength((int) scratchlcv.vector[i], offset);
+ result.setRef(i, dictionaryBufferInBytesCache, offset, length);
+ } else {
+ // If the value is null then set offset and length to zero (null string)
+ result.setRef(i, dictionaryBufferInBytesCache, 0, 0);
+ }
+ }
+ } else {
+ // If the value is repeating, just set the first value in the vector
+ // and set the isRepeating flag to true; there is no need to iterate
+ // through and set every element to the same value.
+ offset = dictionaryOffsets[(int) scratchlcv.vector[0]];
+ length = getDictionaryEntryLength((int) scratchlcv.vector[0], offset);
+ result.setRef(0, dictionaryBufferInBytesCache, offset, length);
+ }
+ result.isRepeating = scratchlcv.isRepeating;
+ } else {
+ if (dictionaryOffsets == null) {
+ // Entire stripe contains null strings.
+ result.isRepeating = true;
+ result.noNulls = false;
+ result.isNull[0] = true;
+ result.setRef(0, EMPTY_BYTE_ARRAY, 0, 0);
+ } else {
+ // stripe contains nulls and empty strings
+ for (int i = 0; i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ result.setRef(i, EMPTY_BYTE_ARRAY, 0, 0);
+ }
+ }
+ }
+ }
+ }
+
+ int getDictionaryEntryLength(int entry, int offset) {
+ final int length;
+ // if it isn't the last entry, subtract the offsets otherwise use
+ // the buffer length.
+ if (entry < dictionaryOffsets.length - 1) {
+ length = dictionaryOffsets[entry + 1] - offset;
+ } else {
+ length = dictionaryBuffer.size() - offset;
+ }
+ return length;
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+
+ public IntegerReader getReader() {
+ return reader;
+ }
+ }
+
+ public static class CharTreeReader extends StringTreeReader {
+ int maxLength;
+
+ CharTreeReader(int columnId, int maxLength) throws IOException {
+ this(columnId, maxLength, null, null, null, null, null);
+ }
+
+ protected CharTreeReader(int columnId, int maxLength, InStream present, InStream data,
+ InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present, data, length, dictionary, encoding);
+ this.maxLength = maxLength;
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ // Get the vector of strings from StringTreeReader, then make a 2nd pass to
+ // adjust down the length (right trim and truncate) if necessary.
+ super.nextVector(previousVector, isNull, batchSize);
+ BytesColumnVector result = (BytesColumnVector) previousVector;
+ int adjustedDownLen;
+ if (result.isRepeating) {
+ if (result.noNulls || !result.isNull[0]) {
+ adjustedDownLen = StringExpr
+ .rightTrimAndTruncate(result.vector[0], result.start[0], result.length[0], maxLength);
+ if (adjustedDownLen < result.length[0]) {
+ result.setRef(0, result.vector[0], result.start[0], adjustedDownLen);
+ }
+ }
+ } else {
+ if (result.noNulls) {
+ for (int i = 0; i < batchSize; i++) {
+ adjustedDownLen = StringExpr
+ .rightTrimAndTruncate(result.vector[i], result.start[i], result.length[i],
+ maxLength);
+ if (adjustedDownLen < result.length[i]) {
+ result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
+ }
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ adjustedDownLen = StringExpr
+ .rightTrimAndTruncate(result.vector[i], result.start[i], result.length[i],
+ maxLength);
+ if (adjustedDownLen < result.length[i]) {
+ result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ public static class VarcharTreeReader extends StringTreeReader {
+ int maxLength;
+
+ VarcharTreeReader(int columnId, int maxLength) throws IOException {
+ this(columnId, maxLength, null, null, null, null, null);
+ }
+
+ protected VarcharTreeReader(int columnId, int maxLength, InStream present, InStream data,
+ InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present, data, length, dictionary, encoding);
+ this.maxLength = maxLength;
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ // Get the vector of strings from StringTreeReader, then make a 2nd pass to
+ // adjust down the length (truncate) if necessary.
+ super.nextVector(previousVector, isNull, batchSize);
+ BytesColumnVector result = (BytesColumnVector) previousVector;
+
+ int adjustedDownLen;
+ if (result.isRepeating) {
+ if (result.noNulls || !result.isNull[0]) {
+ adjustedDownLen = StringExpr
+ .truncate(result.vector[0], result.start[0], result.length[0], maxLength);
+ if (adjustedDownLen < result.length[0]) {
+ result.setRef(0, result.vector[0], result.start[0], adjustedDownLen);
+ }
+ }
+ } else {
+ if (result.noNulls) {
+ for (int i = 0; i < batchSize; i++) {
+ adjustedDownLen = StringExpr
+ .truncate(result.vector[i], result.start[i], result.length[i], maxLength);
+ if (adjustedDownLen < result.length[i]) {
+ result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
+ }
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ adjustedDownLen = StringExpr
+ .truncate(result.vector[i], result.start[i], result.length[i], maxLength);
+ if (adjustedDownLen < result.length[i]) {
+ result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ protected static class StructTreeReader extends TreeReader {
+ protected final TreeReader[] fields;
+
+ protected StructTreeReader(int columnId,
+ TypeDescription readerSchema,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+
+ List<TypeDescription> childrenTypes = readerSchema.getChildren();
+ this.fields = new TreeReader[childrenTypes.size()];
+ for (int i = 0; i < fields.length; ++i) {
+ TypeDescription subtype = childrenTypes.get(i);
+ this.fields[i] = createTreeReader(subtype, evolution, included, skipCorrupt);
+ }
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ for (TreeReader kid : fields) {
+ if (kid != null) {
+ kid.seek(index);
+ }
+ }
+ }
+
+ @Override
+ public void nextBatch(VectorizedRowBatch batch,
+ int batchSize) throws IOException {
+ for(int i=0; i < fields.length &&
+ (vectorColumnCount == -1 || i < vectorColumnCount); ++i) {
+ batch.cols[i].reset();
+ batch.cols[i].ensureSize(batchSize, false);
+ fields[i].nextVector(batch.cols[i], null, batchSize);
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ super.nextVector(previousVector, isNull, batchSize);
+ StructColumnVector result = (StructColumnVector) previousVector;
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ result.isRepeating = false;
+
+ // Read all the members of struct as column vectors
+ boolean[] mask = result.noNulls ? null : result.isNull;
+ for (int f = 0; f < fields.length; f++) {
+ if (fields[f] != null) {
+ fields[f].nextVector(result.fields[f], mask, batchSize);
+ }
+ }
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ for (TreeReader field : fields) {
+ if (field != null) {
+ field.startStripe(streams, stripeFooter);
+ }
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ for (TreeReader field : fields) {
+ if (field != null) {
+ field.skipRows(items);
+ }
+ }
+ }
+ }
+
+ public static class UnionTreeReader extends TreeReader {
+ protected final TreeReader[] fields;
+ protected RunLengthByteReader tags;
+
+ protected UnionTreeReader(int fileColumn,
+ TypeDescription readerSchema,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+ super(fileColumn);
+ List<TypeDescription> childrenTypes = readerSchema.getChildren();
+ int fieldCount = childrenTypes.size();
+ this.fields = new TreeReader[fieldCount];
+ for (int i = 0; i < fieldCount; ++i) {
+ TypeDescription subtype = childrenTypes.get(i);
+ this.fields[i] = createTreeReader(subtype, evolution, included, skipCorrupt);
+ }
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ tags.seek(index[columnId]);
+ for (TreeReader kid : fields) {
+ kid.seek(index);
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ UnionColumnVector result = (UnionColumnVector) previousVector;
+ super.nextVector(result, isNull, batchSize);
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ result.isRepeating = false;
+ tags.nextVector(result.noNulls ? null : result.isNull, result.tags,
+ batchSize);
+ boolean[] ignore = new boolean[batchSize];
+ for (int f = 0; f < result.fields.length; ++f) {
+ // build the ignore list for this tag
+ for (int r = 0; r < batchSize; ++r) {
+ ignore[r] = (!result.noNulls && result.isNull[r]) ||
+ result.tags[r] != f;
+ }
+ fields[f].nextVector(result.fields[f], ignore, batchSize);
+ }
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ tags = new RunLengthByteReader(streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)));
+ for (TreeReader field : fields) {
+ if (field != null) {
+ field.startStripe(streams, stripeFooter);
+ }
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
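+ // count how many of the skipped rows carry each tag, then skip that
+ // many rows in the corresponding child reader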
+ long[] counts = new long[fields.length];
+ for (int i = 0; i < items; ++i) {
+ counts[tags.next()] += 1;
+ }
+ for (int i = 0; i < counts.length; ++i) {
+ fields[i].skipRows(counts[i]);
+ }
+ }
+ }
+
+ public static class ListTreeReader extends TreeReader {
+ protected final TreeReader elementReader;
+ protected IntegerReader lengths = null;
+
+ protected ListTreeReader(int fileColumn,
+ TypeDescription readerSchema,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+ super(fileColumn);
+ TypeDescription elementType = readerSchema.getChildren().get(0);
+ elementReader = createTreeReader(elementType, evolution, included,
+ skipCorrupt);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ lengths.seek(index[columnId]);
+ elementReader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previous,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ ListColumnVector result = (ListColumnVector) previous;
+ super.nextVector(result, isNull, batchSize);
+ // if we have some non-null values, then read them
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ lengths.nextVector(result, result.lengths, batchSize);
+ // even with repeating lengths, the list doesn't repeat
+ result.isRepeating = false;
+ // build the offsets vector and figure out how many children to read
+ result.childCount = 0;
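+ // e.g. list lengths 2, 0, 3 produce offsets 0, 2, 2 and a childCount of 5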
+ for (int r = 0; r < batchSize; ++r) {
+ if (result.noNulls || !result.isNull[r]) {
+ result.offsets[r] = result.childCount;
+ result.childCount += result.lengths[r];
+ }
+ }
+ result.child.ensureSize(result.childCount, false);
+ elementReader.nextVector(result.child, null, result.childCount);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.LENGTH)), false, false);
+ if (elementReader != null) {
+ elementReader.startStripe(streams, stripeFooter);
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long childSkip = 0;
+ for (long i = 0; i < items; ++i) {
+ childSkip += lengths.next();
+ }
+ elementReader.skipRows(childSkip);
+ }
+ }
+
+ public static class MapTreeReader extends TreeReader {
+ protected final TreeReader keyReader;
+ protected final TreeReader valueReader;
+ protected IntegerReader lengths = null;
+
+ protected MapTreeReader(int fileColumn,
+ TypeDescription readerSchema,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+ super(fileColumn);
+ TypeDescription keyType = readerSchema.getChildren().get(0);
+ TypeDescription valueType = readerSchema.getChildren().get(1);
+ keyReader = createTreeReader(keyType, evolution, included, skipCorrupt);
+ valueReader = createTreeReader(valueType, evolution, included, skipCorrupt);
+ }
+
+ @Override
+ void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ lengths.seek(index[columnId]);
+ keyReader.seek(index);
+ valueReader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previous,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ MapColumnVector result = (MapColumnVector) previous;
+ super.nextVector(result, isNull, batchSize);
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ lengths.nextVector(result, result.lengths, batchSize);
+ // even with repeating lengths, the map doesn't repeat
+ result.isRepeating = false;
+ // build the offsets vector and figure out how many children to read
+ result.childCount = 0;
+ for (int r = 0; r < batchSize; ++r) {
+ if (result.noNulls || !result.isNull[r]) {
+ result.offsets[r] = result.childCount;
+ result.childCount += result.lengths[r];
+ }
+ }
+ result.keys.ensureSize(result.childCount, false);
+ result.values.ensureSize(result.childCount, false);
+ keyReader.nextVector(result.keys, null, result.childCount);
+ valueReader.nextVector(result.values, null, result.childCount);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.LENGTH)), false, false);
+ if (keyReader != null) {
+ keyReader.startStripe(streams, stripeFooter);
+ }
+ if (valueReader != null) {
+ valueReader.startStripe(streams, stripeFooter);
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long childSkip = 0;
+ for (long i = 0; i < items; ++i) {
+ childSkip += lengths.next();
+ }
+ keyReader.skipRows(childSkip);
+ valueReader.skipRows(childSkip);
+ }
+ }
+
+ public static TreeReader createTreeReader(TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt
+ ) throws IOException {
+ TypeDescription fileType = evolution.getFileType(readerType);
+ if (fileType == null ||
+ (included != null && !included[readerType.getId()])) {
+ return new NullTreeReader(0);
+ }
+ TypeDescription.Category readerTypeCategory = readerType.getCategory();
+ if (!fileType.getCategory().equals(readerTypeCategory) &&
+ (readerTypeCategory != TypeDescription.Category.STRUCT &&
+ readerTypeCategory != TypeDescription.Category.MAP &&
+ readerTypeCategory != TypeDescription.Category.LIST &&
+ readerTypeCategory != TypeDescription.Category.UNION)) {
+ // We only convert complex children.
+ return ConvertTreeReaderFactory.createConvertTreeReader(readerType, evolution,
+ included, skipCorrupt);
+ }
+ switch (readerTypeCategory) {
+ case BOOLEAN:
+ return new BooleanTreeReader(fileType.getId());
+ case BYTE:
+ return new ByteTreeReader(fileType.getId());
+ case DOUBLE:
+ return new DoubleTreeReader(fileType.getId());
+ case FLOAT:
+ return new FloatTreeReader(fileType.getId());
+ case SHORT:
+ return new ShortTreeReader(fileType.getId());
+ case INT:
+ return new IntTreeReader(fileType.getId());
+ case LONG:
+ return new LongTreeReader(fileType.getId(), skipCorrupt);
+ case STRING:
+ return new StringTreeReader(fileType.getId());
+ case CHAR:
+ return new CharTreeReader(fileType.getId(), readerType.getMaxLength());
+ case VARCHAR:
+ return new VarcharTreeReader(fileType.getId(), readerType.getMaxLength());
+ case BINARY:
+ return new BinaryTreeReader(fileType.getId());
+ case TIMESTAMP:
+ return new TimestampTreeReader(fileType.getId(), skipCorrupt);
+ case DATE:
+ return new DateTreeReader(fileType.getId());
+ case DECIMAL:
+ return new DecimalTreeReader(fileType.getId(), readerType.getPrecision(),
+ readerType.getScale());
+ case STRUCT:
+ return new StructTreeReader(fileType.getId(), readerType,
+ evolution, included, skipCorrupt);
+ case LIST:
+ return new ListTreeReader(fileType.getId(), readerType,
+ evolution, included, skipCorrupt);
+ case MAP:
+ return new MapTreeReader(fileType.getId(), readerType, evolution,
+ included, skipCorrupt);
+ case UNION:
+ return new UnionTreeReader(fileType.getId(), readerType,
+ evolution, included, skipCorrupt);
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerTypeCategory);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java b/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java
new file mode 100644
index 0000000..de02c8b
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.EnumSet;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.ReadOption;
+import org.apache.hadoop.io.ByteBufferPool;
+
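+/**
+ * Adapts Hadoop's zero-copy read API (FSDataInputStream#read with a
+ * ByteBufferPool) to the HadoopShims interfaces used by the ORC reader.
+ */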
+class ZeroCopyShims {
+ private static final class ByteBufferPoolAdapter implements ByteBufferPool {
+ private HadoopShims.ByteBufferPoolShim pool;
+
+ public ByteBufferPoolAdapter(HadoopShims.ByteBufferPoolShim pool) {
+ this.pool = pool;
+ }
+
+ @Override
+ public final ByteBuffer getBuffer(boolean direct, int length) {
+ return this.pool.getBuffer(direct, length);
+ }
+
+ @Override
+ public final void putBuffer(ByteBuffer buffer) {
+ this.pool.putBuffer(buffer);
+ }
+ }
+
+ private static final class ZeroCopyAdapter implements HadoopShims.ZeroCopyReaderShim {
+ private final FSDataInputStream in;
+ private final ByteBufferPoolAdapter pool;
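+ // checksums are verified by default, so the "verify" option set is empty
+ // and skipping them requires the explicit SKIP_CHECKSUMS read option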
+ private static final EnumSet<ReadOption> CHECK_SUM = EnumSet
+ .noneOf(ReadOption.class);
+ private static final EnumSet<ReadOption> NO_CHECK_SUM = EnumSet
+ .of(ReadOption.SKIP_CHECKSUMS);
+
+ public ZeroCopyAdapter(FSDataInputStream in,
+ HadoopShims.ByteBufferPoolShim poolshim) {
+ this.in = in;
+ if (poolshim != null) {
+ pool = new ByteBufferPoolAdapter(poolshim);
+ } else {
+ pool = null;
+ }
+ }
+
+ public final ByteBuffer readBuffer(int maxLength, boolean verifyChecksums)
+ throws IOException {
+ EnumSet<ReadOption> options = NO_CHECK_SUM;
+ if (verifyChecksums) {
+ options = CHECK_SUM;
+ }
+ return this.in.read(this.pool, maxLength, options);
+ }
+
+ public final void releaseBuffer(ByteBuffer buffer) {
+ this.in.releaseBuffer(buffer);
+ }
+
+ @Override
+ public final void close() throws IOException {
+ this.in.close();
+ }
+ }
+
+ public static HadoopShims.ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
+ HadoopShims.ByteBufferPoolShim pool) throws IOException {
+ return new ZeroCopyAdapter(in, pool);
+ }
+
+}
[13/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
deleted file mode 100644
index 9c2f88f..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
+++ /dev/null
@@ -1,884 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintStream;
-import java.text.DecimalFormat;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hive.ql.io.AcidUtils;
-import org.apache.orc.BloomFilterIO;
-import org.apache.hadoop.hive.serde2.io.ByteWritable;
-import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.hive.serde2.io.ShortWritable;
-import org.apache.hadoop.io.BooleanWritable;
-import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.OrcProto;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.StripeStatistics;
-import org.codehaus.jettison.json.JSONException;
-import org.codehaus.jettison.json.JSONWriter;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Strings;
-import com.google.common.collect.Lists;
-
-/**
- * A tool for printing out the file structure of ORC files.
- */
-public final class FileDump {
- public static final String UNKNOWN = "UNKNOWN";
- public static final String SEPARATOR = Strings.repeat("_", 120) + "\n";
- public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
- public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir");
- public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new PathFilter() {
- public boolean accept(Path p) {
- String name = p.getName();
- return !name.startsWith("_") && !name.startsWith(".") && !name.endsWith(
- AcidUtils.DELTA_SIDE_FILE_SUFFIX);
- }
- };
-
- // not used
- private FileDump() {
- }
-
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
-
- List<Integer> rowIndexCols = null;
- Options opts = createOptions();
- CommandLine cli = new GnuParser().parse(opts, args);
-
- if (cli.hasOption('h')) {
- HelpFormatter formatter = new HelpFormatter();
- formatter.printHelp("orcfiledump", opts);
- return;
- }
-
- boolean dumpData = cli.hasOption('d');
- boolean recover = cli.hasOption("recover");
- boolean skipDump = cli.hasOption("skip-dump");
- String backupPath = DEFAULT_BACKUP_PATH;
- if (cli.hasOption("backup-path")) {
- backupPath = cli.getOptionValue("backup-path");
- }
-
- if (cli.hasOption("r")) {
- String[] colStrs = cli.getOptionValue("r").split(",");
- rowIndexCols = new ArrayList<Integer>(colStrs.length);
- for (String colStr : colStrs) {
- rowIndexCols.add(Integer.parseInt(colStr));
- }
- }
-
- boolean printTimeZone = cli.hasOption('t');
- boolean jsonFormat = cli.hasOption('j');
- String[] files = cli.getArgs();
- if (files.length == 0) {
- System.err.println("Error : ORC files are not specified");
- return;
- }
-
- // if the specified path is directory, iterate through all files and print the file dump
- List<String> filesInPath = Lists.newArrayList();
- for (String filename : files) {
- Path path = new Path(filename);
- filesInPath.addAll(getAllFilesInPath(path, conf));
- }
-
- if (dumpData) {
- printData(filesInPath, conf);
- } else if (recover && skipDump) {
- recoverFiles(filesInPath, conf, backupPath);
- } else {
- if (jsonFormat) {
- boolean prettyPrint = cli.hasOption('p');
- JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
- } else {
- printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
- }
- }
- }
-
- /**
- * This method returns an ORC reader object if the specified file is readable. If the specified
- * file has a side file (_flush_length), then the max footer offset will be read from the side
- * file and the ORC reader will be created from that offset. Since both the data file and the
- * side file use hflush() to flush their data, the two files can be inconsistent and out of
- * sync. Null will be returned in the following cases:
- *
- * 1) If the file specified by path or its side file is still open for writes
- * 2) If *_flush_length file does not return any footer offset
- * 3) If *_flush_length returns a valid footer offset but the data file is not readable at that
- * position (incomplete data file)
- * 4) If *_flush_length file length is not a multiple of 8, then reader will be created from
- * previous valid footer. If there is no such footer (file length > 0 and < 8), then null will
- * be returned
- *
- * Also, if this method detects any file corruption (mismatch between data file and side file)
- * then it will add the corresponding file to the specified input list for corrupted files.
- *
- * In all other cases, where the file is readable this method will return a reader object.
- *
- * @param path - file to get reader for
- * @param conf - configuration object
- * @param corruptFiles - fills this list with all possible corrupted files
- * @return - reader for the specified file or null
- * @throws IOException
- */
- static Reader getReader(final Path path, final Configuration conf,
- final List<String> corruptFiles) throws IOException {
- FileSystem fs = path.getFileSystem(conf);
- long dataFileLen = fs.getFileStatus(path).getLen();
- System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]");
- Path sideFile = OrcRecordUpdater.getSideFile(path);
- final boolean sideFileExists = fs.exists(sideFile);
- boolean openDataFile = false;
- boolean openSideFile = false;
- if (fs instanceof DistributedFileSystem) {
- DistributedFileSystem dfs = (DistributedFileSystem) fs;
- openDataFile = !dfs.isFileClosed(path);
- openSideFile = sideFileExists && !dfs.isFileClosed(sideFile);
- }
-
- if (openDataFile || openSideFile) {
- if (openDataFile && openSideFile) {
- System.err.println("Unable to perform file dump as " + path + " and " + sideFile +
- " are still open for writes.");
- } else if (openSideFile) {
- System.err.println("Unable to perform file dump as " + sideFile +
- " is still open for writes.");
- } else {
- System.err.println("Unable to perform file dump as " + path +
- " is still open for writes.");
- }
-
- return null;
- }
-
- Reader reader = null;
- if (sideFileExists) {
- final long maxLen = OrcRawRecordMerger.getLastFlushLength(fs, path);
- final long sideFileLen = fs.getFileStatus(sideFile).getLen();
- System.err.println("Found flush length file " + sideFile
- + " [length: " + sideFileLen + ", maxFooterOffset: " + maxLen + "]");
- // no offsets read from side file
- if (maxLen == -1) {
-
- // if data file is larger than last flush length, then additional data could be recovered
- if (dataFileLen > maxLen) {
- System.err.println("Data file has more data than max footer offset:" + maxLen +
- ". Adding data file to recovery list.");
- if (corruptFiles != null) {
- corruptFiles.add(path.toUri().toString());
- }
- }
- return null;
- }
-
- try {
- reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen));
-
- // if data file is larger than last flush length, then additional data could be recovered
- if (dataFileLen > maxLen) {
- System.err.println("Data file has more data than max footer offset:" + maxLen +
- ". Adding data file to recovery list.");
- if (corruptFiles != null) {
- corruptFiles.add(path.toUri().toString());
- }
- }
- } catch (Exception e) {
- if (corruptFiles != null) {
- corruptFiles.add(path.toUri().toString());
- }
- System.err.println("Unable to read data from max footer offset." +
- " Adding data file to recovery list.");
- return null;
- }
- } else {
- reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
- }
-
- return reader;
- }
-
- public static Collection<String> getAllFilesInPath(final Path path,
- final Configuration conf) throws IOException {
- List<String> filesInPath = Lists.newArrayList();
- FileSystem fs = path.getFileSystem(conf);
- FileStatus fileStatus = fs.getFileStatus(path);
- if (fileStatus.isDir()) {
- FileStatus[] fileStatuses = fs.listStatus(path, HIDDEN_AND_SIDE_FILE_FILTER);
- for (FileStatus fileInPath : fileStatuses) {
- if (fileInPath.isDir()) {
- filesInPath.addAll(getAllFilesInPath(fileInPath.getPath(), conf));
- } else {
- filesInPath.add(fileInPath.getPath().toString());
- }
- }
- } else {
- filesInPath.add(path.toString());
- }
-
- return filesInPath;
- }
-
- private static void printData(List<String> files,
- Configuration conf) throws IOException,
- JSONException {
- for (String file : files) {
- try {
- Path path = new Path(file);
- Reader reader = getReader(path, conf, Lists.<String>newArrayList());
- if (reader == null) {
- continue;
- }
- printJsonData(reader);
- System.out.println(SEPARATOR);
- } catch (Exception e) {
- System.err.println("Unable to dump data for file: " + file);
- continue;
- }
- }
- }
-
- private static void printMetaData(List<String> files, Configuration conf,
- List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
- final String backupPath)
- throws IOException {
- List<String> corruptFiles = Lists.newArrayList();
- for (String filename : files) {
- printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
- System.out.println(SEPARATOR);
- }
-
- if (!corruptFiles.isEmpty()) {
- if (recover) {
- recoverFiles(corruptFiles, conf, backupPath);
- } else {
- System.err.println(corruptFiles.size() + " file(s) are corrupted." +
- " Run the following command to recover corrupted files.\n");
- String fileNames = Joiner.on(" ").skipNulls().join(corruptFiles);
- System.err.println("hive --orcfiledump --recover --skip-dump " + fileNames);
- System.out.println(SEPARATOR);
- }
- }
- }
-
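For reference, the hint printed above expands to an invocation like the following, with the bucket path being purely hypothetical:

  hive --orcfiledump --recover --skip-dump /warehouse/t/delta_0000001_0000002/bucket_00000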
- private static void printMetaDataImpl(final String filename,
- final Configuration conf, final List<Integer> rowIndexCols, final boolean printTimeZone,
- final List<String> corruptFiles) throws IOException {
- Path file = new Path(filename);
- Reader reader = getReader(file, conf, corruptFiles);
- // if we can create a reader then the footer is not corrupt and the file is readable
- if (reader == null) {
- return;
- }
-
- System.out.println("Structure for " + filename);
- System.out.println("File Version: " + reader.getFileVersion().getName() +
- " with " + reader.getWriterVersion());
- RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
- System.out.println("Rows: " + reader.getNumberOfRows());
- System.out.println("Compression: " + reader.getCompression());
- if (reader.getCompression() != CompressionKind.NONE) {
- System.out.println("Compression size: " + reader.getCompressionSize());
- }
- System.out.println("Type: " + reader.getObjectInspector().getTypeName());
- System.out.println("\nStripe Statistics:");
- List<StripeStatistics> stripeStats = reader.getStripeStatistics();
- for (int n = 0; n < stripeStats.size(); n++) {
- System.out.println(" Stripe " + (n + 1) + ":");
- StripeStatistics ss = stripeStats.get(n);
- for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
- System.out.println(" Column " + i + ": " +
- ss.getColumnStatistics()[i].toString());
- }
- }
- ColumnStatistics[] stats = reader.getStatistics();
- int colCount = stats.length;
- System.out.println("\nFile Statistics:");
- for (int i = 0; i < stats.length; ++i) {
- System.out.println(" Column " + i + ": " + stats[i].toString());
- }
- System.out.println("\nStripes:");
- int stripeIx = -1;
- for (StripeInformation stripe : reader.getStripes()) {
- ++stripeIx;
- long stripeStart = stripe.getOffset();
- OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
- if (printTimeZone) {
- String tz = footer.getWriterTimezone();
- if (tz == null || tz.isEmpty()) {
- tz = UNKNOWN;
- }
- System.out.println(" Stripe: " + stripe.toString() + " timezone: " + tz);
- } else {
- System.out.println(" Stripe: " + stripe.toString());
- }
- long sectionStart = stripeStart;
- for (OrcProto.Stream section : footer.getStreamsList()) {
- String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
- System.out.println(" Stream: column " + section.getColumn() +
- " section " + kind + " start: " + sectionStart +
- " length " + section.getLength());
- sectionStart += section.getLength();
- }
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- StringBuilder buf = new StringBuilder();
- buf.append(" Encoding column ");
- buf.append(i);
- buf.append(": ");
- buf.append(encoding.getKind());
- if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- buf.append("[");
- buf.append(encoding.getDictionarySize());
- buf.append("]");
- }
- System.out.println(buf);
- }
- if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
- // restrict the row index read to the specified columns; bloom filters are
- // read only for the columns flagged in sargColumns
- boolean[] sargColumns = new boolean[colCount];
- for (int colIdx : rowIndexCols) {
- sargColumns[colIdx] = true;
- }
- OrcIndex indices = rows
- .readRowIndex(stripeIx, null, null, null, sargColumns);
- for (int col : rowIndexCols) {
- StringBuilder buf = new StringBuilder();
- String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
- buf.append(rowIdxString);
- String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
- buf.append(bloomFilString);
- System.out.println(buf);
- }
- }
- }
-
- FileSystem fs = file.getFileSystem(conf);
- long fileLen = fs.getFileStatus(file).getLen();
- long paddedBytes = getTotalPaddingSize(reader);
- // an empty ORC file is ~45 bytes, so the file length is assumed to always be > 0
- double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
- DecimalFormat format = new DecimalFormat("##.##");
- System.out.println("\nFile length: " + fileLen + " bytes");
- System.out.println("Padding length: " + paddedBytes + " bytes");
- System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
- OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(reader);
- if (acidStats != null) {
- System.out.println("ACID stats:" + acidStats);
- }
- rows.close();
- }
-
- private static void recoverFiles(final List<String> corruptFiles, final Configuration conf,
- final String backup)
- throws IOException {
- for (String corruptFile : corruptFiles) {
- System.err.println("Recovering file " + corruptFile);
- Path corruptPath = new Path(corruptFile);
- FileSystem fs = corruptPath.getFileSystem(conf);
- FSDataInputStream fdis = fs.open(corruptPath);
- try {
- long corruptFileLen = fs.getFileStatus(corruptPath).getLen();
- long remaining = corruptFileLen;
- List<Long> footerOffsets = Lists.newArrayList();
-
- // start reading the data file from top to bottom and record the valid footers
- while (remaining > 0) {
- int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
- byte[] data = new byte[toRead];
- long startPos = corruptFileLen - remaining;
- fdis.readFully(startPos, data, 0, toRead);
-
- // find every occurrence of the MAGIC string and check whether the file is readable from there
- int index = 0;
- long nextFooterOffset;
-
- while (index != -1) {
- index = indexOf(data, OrcFile.MAGIC.getBytes(), index + 1);
- if (index != -1) {
- nextFooterOffset = startPos + index + OrcFile.MAGIC.length() + 1;
- if (isReadable(corruptPath, conf, nextFooterOffset)) {
- footerOffsets.add(nextFooterOffset);
- }
- }
- }
-
- System.err.println("Scanning for valid footers - startPos: " + startPos +
- " toRead: " + toRead + " remaining: " + remaining);
- remaining = remaining - toRead;
- }
-
- System.err.println("Readable footerOffsets: " + footerOffsets);
- recoverFile(corruptPath, fs, conf, footerOffsets, backup);
- } catch (Exception e) {
- Path recoveryFile = getRecoveryFile(corruptPath);
- if (fs.exists(recoveryFile)) {
- fs.delete(recoveryFile, false);
- }
- System.err.println("Unable to recover file " + corruptFile);
- e.printStackTrace();
- System.err.println(SEPARATOR);
- continue;
- } finally {
- fdis.close();
- }
- System.err.println(corruptFile + " recovered successfully!");
- System.err.println(SEPARATOR);
- }
- }
-
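The block scan above reduces to collecting one candidate footer offset per occurrence of the ORC magic; here is a minimal sketch of that step, reusing the indexOf helper defined later in this file (candidateOffsets is a hypothetical name). The trailing + 1 accounts for the one-byte postscript length that follows the magic:

import java.util.ArrayList;
import java.util.List;

static List<Long> candidateOffsets(byte[] block, long blockStart, byte[] magic) {
  List<Long> offsets = new ArrayList<Long>();
  int idx = 0;
  // every magic occurrence marks a possible end of a flushed footer
  while ((idx = indexOf(block, magic, idx + 1)) != -1) {
    offsets.add(blockStart + idx + magic.length + 1);
  }
  return offsets;
}

isReadable() then filters these candidates down to the offsets at which a reader can actually be opened.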
- private static void recoverFile(final Path corruptPath, final FileSystem fs,
- final Configuration conf, final List<Long> footerOffsets, final String backup)
- throws IOException {
-
- // first recover into a .recovered file, then rename it to the actual file once successful
- Path recoveredPath = getRecoveryFile(corruptPath);
-
- // make sure that a recovery file does not already exist
- if (fs.exists(recoveredPath)) {
- fs.delete(recoveredPath, false);
- }
-
- // if there are no valid footers, the file should still be readable, so create an empty ORC file
- if (footerOffsets == null || footerOffsets.isEmpty()) {
- System.err.println("No readable footers found. Creating empty orc file.");
- TypeDescription schema = TypeDescription.createStruct();
- Writer writer = OrcFile.createWriter(recoveredPath,
- OrcFile.writerOptions(conf).setSchema(schema));
- writer.close();
- } else {
- FSDataInputStream fdis = fs.open(corruptPath);
- FileStatus fileStatus = fs.getFileStatus(corruptPath);
- // read the corrupt file and copy it to the recovered file up to the last valid footer
- FSDataOutputStream fdos = fs.create(recoveredPath, true,
- conf.getInt("io.file.buffer.size", 4096),
- fileStatus.getReplication(),
- fileStatus.getBlockSize());
- try {
- long fileLen = footerOffsets.get(footerOffsets.size() - 1);
- long remaining = fileLen;
-
- while (remaining > 0) {
- int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
- byte[] data = new byte[toRead];
- long startPos = fileLen - remaining;
- fdis.readFully(startPos, data, 0, toRead);
- fdos.write(data);
- System.err.println("Copying data to recovery file - startPos: " + startPos +
- " toRead: " + toRead + " remaining: " + remaining);
- remaining = remaining - toRead;
- }
- } catch (Exception e) {
- fs.delete(recoveredPath, false);
- throw new IOException(e);
- } finally {
- fdis.close();
- fdos.close();
- }
- }
-
- // validate the recovered file once again and start moving corrupt files to backup folder
- if (isReadable(recoveredPath, conf, Long.MAX_VALUE)) {
- Path backupDataPath;
- String scheme = corruptPath.toUri().getScheme();
- String authority = corruptPath.toUri().getAuthority();
- String filePath = corruptPath.toUri().getPath();
-
- // use the same filesystem as the corrupt file if backup-path is not explicitly specified
- if (backup.equals(DEFAULT_BACKUP_PATH)) {
- backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
- } else {
- backupDataPath = Path.mergePaths(new Path(backup), corruptPath);
- }
-
- // Move data file to backup path
- moveFiles(fs, corruptPath, backupDataPath);
-
- // Move side file to backup path
- Path sideFilePath = OrcRecordUpdater.getSideFile(corruptPath);
- Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName());
- moveFiles(fs, sideFilePath, backupSideFilePath);
-
- // finally move recovered file to actual file
- moveFiles(fs, recoveredPath, corruptPath);
-
- // we are done recovering, backing up and validating
- System.err.println("Validation of recovered file successful!");
- }
- }
-
- private static void moveFiles(final FileSystem fs, final Path src, final Path dest)
- throws IOException {
- try {
- // create the destination directory if it does not exist
- if (!fs.exists(dest.getParent())) {
- fs.mkdirs(dest.getParent());
- }
-
- // if the destination file already exists for some reason, delete it
- fs.delete(dest, false);
-
- if (fs.rename(src, dest)) {
- System.err.println("Moved " + src + " to " + dest);
- } else {
- throw new IOException("Unable to move " + src + " to " + dest);
- }
-
- } catch (Exception e) {
- throw new IOException("Unable to move " + src + " to " + dest, e);
- }
- }
-
- private static Path getRecoveryFile(final Path corruptPath) {
- return new Path(corruptPath.getParent(), corruptPath.getName() + ".recovered");
- }
-
- private static boolean isReadable(final Path corruptPath, final Configuration conf,
- final long maxLen) {
- try {
- OrcFile.createReader(corruptPath, OrcFile.readerOptions(conf).maxLength(maxLen));
- return true;
- } catch (Exception e) {
- // ignore the exception; the file is simply not readable up to maxLen
- return false;
- }
- }
-
- // search for a byte pattern within another byte array, starting at the given index
- private static int indexOf(final byte[] data, final byte[] pattern, final int index) {
- if (data == null || data.length == 0 || pattern == null || pattern.length == 0 ||
- index > data.length || index < 0) {
- return -1;
- }
-
- // check every candidate start position so that overlapping partial matches
- // (e.g. finding "ORC" in "OORC") are not skipped
- for (int i = index; i <= data.length - pattern.length; i++) {
- int j = 0;
- while (j < pattern.length && data[i + j] == pattern[j]) {
- j++;
- }
- if (j == pattern.length) {
- return i;
- }
- }
-
- return -1;
- }
-
- private static String getFormattedBloomFilters(int col,
- OrcProto.BloomFilterIndex[] bloomFilterIndex) {
- StringBuilder buf = new StringBuilder();
- BloomFilterIO stripeLevelBF = null;
- if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
- int idx = 0;
- buf.append("\n Bloom filters for column ").append(col).append(":");
- for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
- BloomFilterIO toMerge = new BloomFilterIO(bf);
- buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
- if (stripeLevelBF == null) {
- stripeLevelBF = toMerge;
- } else {
- stripeLevelBF.merge(toMerge);
- }
- }
- String bloomFilterStats = getBloomFilterStats(stripeLevelBF);
- buf.append("\n Stripe level merge:").append(bloomFilterStats);
- }
- return buf.toString();
- }
-
- private static String getBloomFilterStats(BloomFilterIO bf) {
- StringBuilder sb = new StringBuilder();
- int bitCount = bf.getBitSize();
- int popCount = 0;
- for (long l : bf.getBitSet()) {
- popCount += Long.bitCount(l);
- }
- int k = bf.getNumHashFunctions();
- float loadFactor = (float) popCount / (float) bitCount;
- float expectedFpp = (float) Math.pow(loadFactor, k);
- DecimalFormat df = new DecimalFormat("###.####");
- sb.append(" numHashFunctions: ").append(k);
- sb.append(" bitCount: ").append(bitCount);
- sb.append(" popCount: ").append(popCount);
- sb.append(" loadFactor: ").append(df.format(loadFactor));
- sb.append(" expectedFpp: ").append(expectedFpp);
- return sb.toString();
- }
-
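The estimate above treats the observed load factor as the probability that a single hash probe lands on a set bit, so a lookup of an absent key passes all k probes with probability loadFactor^k. With hypothetical numbers:

int bitCount = 10000;                                    // bits in the filter
int popCount = 3500;                                     // bits actually set
int k = 4;                                               // hash functions
float loadFactor = (float) popCount / (float) bitCount;  // 0.35
float expectedFpp = (float) Math.pow(loadFactor, k);     // 0.35^4 ~= 0.015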
- private static String getFormattedRowIndices(int col,
- OrcProto.RowIndex[] rowGroupIndex) {
- StringBuilder buf = new StringBuilder();
- OrcProto.RowIndex index;
- buf.append(" Row group indices for column ").append(col).append(":");
- if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
- ((index = rowGroupIndex[col]) == null)) {
- buf.append(" not found\n");
- return buf.toString();
- }
-
- for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
- buf.append("\n Entry ").append(entryIx).append(": ");
- OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
- if (entry == null) {
- buf.append("unknown\n");
- continue;
- }
- OrcProto.ColumnStatistics colStats = entry.getStatistics();
- if (colStats == null) {
- buf.append("no stats at ");
- } else {
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
- buf.append(cs.toString());
- }
- buf.append(" positions: ");
- for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
- if (posIx != 0) {
- buf.append(",");
- }
- buf.append(entry.getPositions(posIx));
- }
- }
- return buf.toString();
- }
-
- public static long getTotalPaddingSize(Reader reader) throws IOException {
- long paddedBytes = 0;
- List<StripeInformation> stripes = reader.getStripes();
- for (int i = 1; i < stripes.size(); i++) {
- long prevStripeOffset = stripes.get(i - 1).getOffset();
- long prevStripeLen = stripes.get(i - 1).getLength();
- paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
- }
- return paddedBytes;
- }
-
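Padding is the gap between where one stripe ends and the next begins; ORC pads stripes so they do not straddle HDFS block boundaries. A worked example of the same loop over hypothetical {offset, length} pairs:

long[][] stripes = { {3, 1000}, {1103, 900}, {2200, 500} };
long paddedBytes = 0;
for (int i = 1; i < stripes.length; i++) {
  // offset of this stripe minus the end of the previous one
  paddedBytes += stripes[i][0] - (stripes[i - 1][0] + stripes[i - 1][1]);
}
// (1103 - 1003) + (2200 - 2003) = 100 + 197, so paddedBytes == 297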
- static Options createOptions() {
- Options result = new Options();
-
- // add -d and --data to print the rows
- result.addOption(OptionBuilder
- .withLongOpt("data")
- .withDescription("Should the data be printed")
- .create('d'));
-
- // printing the writer's time zone is optional so that file dump unit tests
- // do not break when run in different time zones
- result.addOption(OptionBuilder
- .withLongOpt("timezone")
- .withDescription("Print writer's time zone")
- .create('t'));
-
- result.addOption(OptionBuilder
- .withLongOpt("help")
- .withDescription("print help message")
- .create('h'));
-
- result.addOption(OptionBuilder
- .withLongOpt("rowindex")
- .withArgName("comma separated list of column ids for which row index should be printed")
- .withDescription("Dump stats for column number(s)")
- .hasArg()
- .create('r'));
-
- result.addOption(OptionBuilder
- .withLongOpt("json")
- .withDescription("Print metadata in JSON format")
- .create('j'));
-
- result.addOption(OptionBuilder
- .withLongOpt("pretty")
- .withDescription("Pretty print json metadata output")
- .create('p'));
-
- result.addOption(OptionBuilder
- .withLongOpt("recover")
- .withDescription("recover corrupted orc files generated by streaming")
- .create());
-
- result.addOption(OptionBuilder
- .withLongOpt("skip-dump")
- .withDescription("used along with --recover to directly recover files without dumping")
- .create());
-
- result.addOption(OptionBuilder
- .withLongOpt("backup-path")
- .withDescription("specify a backup path to store the corrupted files (default: /tmp)")
- .hasArg()
- .create());
- return result;
- }
-
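A minimal sketch of how these options might be consumed with the commons-cli 1.x parser that matches the OptionBuilder API above; the actual main() wiring is not part of this hunk, so the surrounding code is illustrative:

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.ParseException;

static void run(String[] args) throws ParseException {
  CommandLine cli = new GnuParser().parse(createOptions(), args);
  boolean printData = cli.hasOption('d');
  boolean printTimeZone = cli.hasOption('t');
  boolean recover = cli.hasOption("recover");
  String backupPath = cli.getOptionValue("backup-path", "/tmp");
  if (cli.hasOption('r')) {
    // e.g. "0,2,5" -> row indexes are printed for columns 0, 2 and 5
    String[] cols = cli.getOptionValue('r').split(",");
  }
}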
- private static void printMap(JSONWriter writer,
- Map<Object, Object> obj,
- List<OrcProto.Type> types,
- OrcProto.Type type
- ) throws IOException, JSONException {
- writer.array();
- int keyType = type.getSubtypes(0);
- int valueType = type.getSubtypes(1);
- for (Map.Entry<Object, Object> item : obj.entrySet()) {
- writer.object();
- writer.key("_key");
- printObject(writer, item.getKey(), types, keyType);
- writer.key("_value");
- printObject(writer, item.getValue(), types, valueType);
- writer.endObject();
- }
- writer.endArray();
- }
-
- private static void printList(JSONWriter writer,
- List<Object> obj,
- List<OrcProto.Type> types,
- OrcProto.Type type
- ) throws IOException, JSONException {
- int subtype = type.getSubtypes(0);
- writer.array();
- for (Object item : obj) {
- printObject(writer, item, types, subtype);
- }
- writer.endArray();
- }
-
- private static void printUnion(JSONWriter writer,
- OrcUnion obj,
- List<OrcProto.Type> types,
- OrcProto.Type type
- ) throws IOException, JSONException {
- int subtype = type.getSubtypes(obj.getTag());
- printObject(writer, obj.getObject(), types, subtype);
- }
-
- static void printStruct(JSONWriter writer,
- OrcStruct obj,
- List<OrcProto.Type> types,
- OrcProto.Type type) throws IOException, JSONException {
- writer.object();
- List<Integer> fieldTypes = type.getSubtypesList();
- for (int i = 0; i < fieldTypes.size(); ++i) {
- writer.key(type.getFieldNames(i));
- printObject(writer, obj.getFieldValue(i), types, fieldTypes.get(i));
- }
- writer.endObject();
- }
-
- static void printObject(JSONWriter writer,
- Object obj,
- List<OrcProto.Type> types,
- int typeId) throws IOException, JSONException {
- OrcProto.Type type = types.get(typeId);
- if (obj == null) {
- writer.value(null);
- } else {
- switch (type.getKind()) {
- case STRUCT:
- printStruct(writer, (OrcStruct) obj, types, type);
- break;
- case UNION:
- printUnion(writer, (OrcUnion) obj, types, type);
- break;
- case LIST:
- printList(writer, (List<Object>) obj, types, type);
- break;
- case MAP:
- printMap(writer, (Map<Object, Object>) obj, types, type);
- break;
- case BYTE:
- writer.value(((ByteWritable) obj).get());
- break;
- case SHORT:
- writer.value(((ShortWritable) obj).get());
- break;
- case INT:
- writer.value(((IntWritable) obj).get());
- break;
- case LONG:
- writer.value(((LongWritable) obj).get());
- break;
- case FLOAT:
- writer.value(((FloatWritable) obj).get());
- break;
- case DOUBLE:
- writer.value(((DoubleWritable) obj).get());
- break;
- case BOOLEAN:
- writer.value(((BooleanWritable) obj).get());
- break;
- default:
- writer.value(obj.toString());
- break;
- }
- }
- }
-
- static void printJsonData(final Reader reader) throws IOException, JSONException {
- PrintStream printStream = System.out;
- OutputStreamWriter out = new OutputStreamWriter(printStream, "UTF-8");
- RecordReader rows = reader.rows(null);
- Object row = null;
- try {
- List<OrcProto.Type> types = reader.getTypes();
- while (rows.hasNext()) {
- row = rows.next(row);
- JSONWriter writer = new JSONWriter(out);
- printObject(writer, row, types, 0);
- out.write("\n");
- out.flush();
- if (printStream.checkError()) {
- throw new IOException("Error encountered when writing to stdout.");
- }
- }
- } finally {
- rows.close();
- }
- }
-}
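Note that printJsonData above emits newline-delimited JSON, one object per row; for a hypothetical struct<id:int,name:string> schema the output looks like:

{"id":1,"name":"alice"}
{"id":2,"name":"bob"}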
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
deleted file mode 100644
index 00de545..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
+++ /dev/null
@@ -1,401 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.codehaus.jettison.json.JSONArray;
-import org.apache.orc.BloomFilterIO;
-import org.apache.orc.BinaryColumnStatistics;
-import org.apache.orc.BooleanColumnStatistics;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.DateColumnStatistics;
-import org.apache.orc.DecimalColumnStatistics;
-import org.apache.orc.DoubleColumnStatistics;
-import org.apache.orc.IntegerColumnStatistics;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.OrcProto;
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.StripeStatistics;
-import org.apache.orc.TimestampColumnStatistics;
-import org.codehaus.jettison.json.JSONException;
-import org.codehaus.jettison.json.JSONObject;
-import org.codehaus.jettison.json.JSONStringer;
-import org.codehaus.jettison.json.JSONWriter;
-
-/**
- * File dump tool with json formatted output.
- */
-public class JsonFileDump {
-
- public static void printJsonMetaData(List<String> files,
- Configuration conf,
- List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
- throws JSONException, IOException {
- if (files.isEmpty()) {
- return;
- }
- JSONStringer writer = new JSONStringer();
- boolean multiFile = files.size() > 1;
- if (multiFile) {
- writer.array();
- } else {
- writer.object();
- }
- for (String filename : files) {
- try {
- if (multiFile) {
- writer.object();
- }
- writer.key("fileName").value(filename);
- Path path = new Path(filename);
- Reader reader = FileDump.getReader(path, conf, null);
- if (reader == null) {
- writer.key("status").value("FAILED");
- continue;
- }
- writer.key("fileVersion").value(reader.getFileVersion().getName());
- writer.key("writerVersion").value(reader.getWriterVersion());
- RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
- writer.key("numberOfRows").value(reader.getNumberOfRows());
- writer.key("compression").value(reader.getCompression());
- if (reader.getCompression() != CompressionKind.NONE) {
- writer.key("compressionBufferSize").value(reader.getCompressionSize());
- }
- writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
- writer.key("schema").array();
- writeSchema(writer, reader.getTypes());
- writer.endArray();
-
- writer.key("stripeStatistics").array();
- List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
- for (int n = 0; n < stripeStatistics.size(); n++) {
- writer.object();
- writer.key("stripeNumber").value(n + 1);
- StripeStatistics ss = stripeStatistics.get(n);
- writer.key("columnStatistics").array();
- for (int i = 0; i < ss.getColumnStatistics().length; i++) {
- writer.object();
- writer.key("columnId").value(i);
- writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
- writer.endObject();
- }
- writer.endArray();
- writer.endObject();
- }
- writer.endArray();
-
- ColumnStatistics[] stats = reader.getStatistics();
- int colCount = stats.length;
- writer.key("fileStatistics").array();
- for (int i = 0; i < stats.length; ++i) {
- writer.object();
- writer.key("columnId").value(i);
- writeColumnStatistics(writer, stats[i]);
- writer.endObject();
- }
- writer.endArray();
-
- writer.key("stripes").array();
- int stripeIx = -1;
- for (StripeInformation stripe : reader.getStripes()) {
- ++stripeIx;
- long stripeStart = stripe.getOffset();
- OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
- writer.object(); // start of stripe information
- writer.key("stripeNumber").value(stripeIx + 1);
- writer.key("stripeInformation");
- writeStripeInformation(writer, stripe);
- if (printTimeZone) {
- writer.key("writerTimezone").value(
- footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN);
- }
- long sectionStart = stripeStart;
-
- writer.key("streams").array();
- for (OrcProto.Stream section : footer.getStreamsList()) {
- writer.object();
- String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN;
- writer.key("columnId").value(section.getColumn());
- writer.key("section").value(kind);
- writer.key("startOffset").value(sectionStart);
- writer.key("length").value(section.getLength());
- sectionStart += section.getLength();
- writer.endObject();
- }
- writer.endArray();
-
- writer.key("encodings").array();
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- writer.object();
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- writer.key("columnId").value(i);
- writer.key("kind").value(encoding.getKind());
- if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- writer.key("dictionarySize").value(encoding.getDictionarySize());
- }
- writer.endObject();
- }
- writer.endArray();
-
- if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
- // restrict the row index read to the specified columns; bloom filters are
- // read only for the columns flagged in sargColumns
- boolean[] sargColumns = new boolean[colCount];
- for (int colIdx : rowIndexCols) {
- sargColumns[colIdx] = true;
- }
- OrcIndex indices = rows.readRowIndex(stripeIx, null, sargColumns);
- writer.key("indexes").array();
- for (int col : rowIndexCols) {
- writer.object();
- writer.key("columnId").value(col);
- writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
- writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
- writer.endObject();
- }
- writer.endArray();
- }
- writer.endObject(); // end of stripe information
- }
- writer.endArray();
-
- FileSystem fs = path.getFileSystem(conf);
- long fileLen = fs.getContentSummary(path).getLength();
- long paddedBytes = FileDump.getTotalPaddingSize(reader);
- // an empty ORC file is ~45 bytes, so the file length is assumed to always be > 0
- double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
- writer.key("fileLength").value(fileLen);
- writer.key("paddingLength").value(paddedBytes);
- writer.key("paddingRatio").value(percentPadding);
- OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(reader);
- if (acidStats != null) {
- writer.key("numInserts").value(acidStats.inserts);
- writer.key("numDeletes").value(acidStats.deletes);
- writer.key("numUpdates").value(acidStats.updates);
- }
- writer.key("status").value("OK");
- rows.close();
-
- writer.endObject();
- } catch (Exception e) {
- writer.key("status").value("FAILED");
- throw e;
- }
- }
- if (multiFile) {
- writer.endArray();
- }
-
- if (prettyPrint) {
- final String prettyJson;
- if (multiFile) {
- JSONArray jsonArray = new JSONArray(writer.toString());
- prettyJson = jsonArray.toString(2);
- } else {
- JSONObject jsonObject = new JSONObject(writer.toString());
- prettyJson = jsonObject.toString(2);
- }
- System.out.println(prettyJson);
- } else {
- System.out.println(writer.toString());
- }
- }
-
- private static void writeSchema(JSONStringer writer, List<OrcProto.Type> types)
- throws JSONException {
- int i = 0;
- for(OrcProto.Type type : types) {
- writer.object();
- writer.key("columnId").value(i++);
- writer.key("columnType").value(type.getKind());
- if (type.getFieldNamesCount() > 0) {
- writer.key("childColumnNames").array();
- for (String field : type.getFieldNamesList()) {
- writer.value(field);
- }
- writer.endArray();
- writer.key("childColumnIds").array();
- for (Integer colId : type.getSubtypesList()) {
- writer.value(colId);
- }
- writer.endArray();
- }
- if (type.hasPrecision()) {
- writer.key("precision").value(type.getPrecision());
- }
-
- if (type.hasScale()) {
- writer.key("scale").value(type.getScale());
- }
-
- if (type.hasMaximumLength()) {
- writer.key("maxLength").value(type.getMaximumLength());
- }
- writer.endObject();
- }
- }
-
- private static void writeStripeInformation(JSONWriter writer, StripeInformation stripe)
- throws JSONException {
- writer.object();
- writer.key("offset").value(stripe.getOffset());
- writer.key("indexLength").value(stripe.getIndexLength());
- writer.key("dataLength").value(stripe.getDataLength());
- writer.key("footerLength").value(stripe.getFooterLength());
- writer.key("rowCount").value(stripe.getNumberOfRows());
- writer.endObject();
- }
-
- private static void writeColumnStatistics(JSONWriter writer, ColumnStatistics cs)
- throws JSONException {
- if (cs != null) {
- writer.key("count").value(cs.getNumberOfValues());
- writer.key("hasNull").value(cs.hasNull());
- if (cs instanceof BinaryColumnStatistics) {
- writer.key("totalLength").value(((BinaryColumnStatistics) cs).getSum());
- writer.key("type").value(OrcProto.Type.Kind.BINARY);
- } else if (cs instanceof BooleanColumnStatistics) {
- writer.key("trueCount").value(((BooleanColumnStatistics) cs).getTrueCount());
- writer.key("falseCount").value(((BooleanColumnStatistics) cs).getFalseCount());
- writer.key("type").value(OrcProto.Type.Kind.BOOLEAN);
- } else if (cs instanceof IntegerColumnStatistics) {
- writer.key("min").value(((IntegerColumnStatistics) cs).getMinimum());
- writer.key("max").value(((IntegerColumnStatistics) cs).getMaximum());
- if (((IntegerColumnStatistics) cs).isSumDefined()) {
- writer.key("sum").value(((IntegerColumnStatistics) cs).getSum());
- }
- writer.key("type").value(OrcProto.Type.Kind.LONG);
- } else if (cs instanceof DoubleColumnStatistics) {
- writer.key("min").value(((DoubleColumnStatistics) cs).getMinimum());
- writer.key("max").value(((DoubleColumnStatistics) cs).getMaximum());
- writer.key("sum").value(((DoubleColumnStatistics) cs).getSum());
- writer.key("type").value(OrcProto.Type.Kind.DOUBLE);
- } else if (cs instanceof StringColumnStatistics) {
- writer.key("min").value(((StringColumnStatistics) cs).getMinimum());
- writer.key("max").value(((StringColumnStatistics) cs).getMaximum());
- writer.key("totalLength").value(((StringColumnStatistics) cs).getSum());
- writer.key("type").value(OrcProto.Type.Kind.STRING);
- } else if (cs instanceof DateColumnStatistics) {
- if (((DateColumnStatistics) cs).getMaximum() != null) {
- writer.key("min").value(((DateColumnStatistics) cs).getMinimum());
- writer.key("max").value(((DateColumnStatistics) cs).getMaximum());
- }
- writer.key("type").value(OrcProto.Type.Kind.DATE);
- } else if (cs instanceof TimestampColumnStatistics) {
- if (((TimestampColumnStatistics) cs).getMaximum() != null) {
- writer.key("min").value(((TimestampColumnStatistics) cs).getMinimum());
- writer.key("max").value(((TimestampColumnStatistics) cs).getMaximum());
- }
- writer.key("type").value(OrcProto.Type.Kind.TIMESTAMP);
- } else if (cs instanceof DecimalColumnStatistics) {
- if (((DecimalColumnStatistics) cs).getMaximum() != null) {
- writer.key("min").value(((DecimalColumnStatistics) cs).getMinimum());
- writer.key("max").value(((DecimalColumnStatistics) cs).getMaximum());
- writer.key("sum").value(((DecimalColumnStatistics) cs).getSum());
- }
- writer.key("type").value(OrcProto.Type.Kind.DECIMAL);
- }
- }
- }
-
- private static void writeBloomFilterIndexes(JSONWriter writer, int col,
- OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException {
-
- BloomFilterIO stripeLevelBF = null;
- if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
- int entryIx = 0;
- writer.key("bloomFilterIndexes").array();
- for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
- writer.object();
- writer.key("entryId").value(entryIx++);
- BloomFilterIO toMerge = new BloomFilterIO(bf);
- writeBloomFilterStats(writer, toMerge);
- if (stripeLevelBF == null) {
- stripeLevelBF = toMerge;
- } else {
- stripeLevelBF.merge(toMerge);
- }
- writer.endObject();
- }
- writer.endArray();
- }
- if (stripeLevelBF != null) {
- writer.key("stripeLevelBloomFilter");
- writer.object();
- writeBloomFilterStats(writer, stripeLevelBF);
- writer.endObject();
- }
- }
-
- private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf)
- throws JSONException {
- int bitCount = bf.getBitSize();
- int popCount = 0;
- for (long l : bf.getBitSet()) {
- popCount += Long.bitCount(l);
- }
- int k = bf.getNumHashFunctions();
- float loadFactor = (float) popCount / (float) bitCount;
- float expectedFpp = (float) Math.pow(loadFactor, k);
- writer.key("numHashFunctions").value(k);
- writer.key("bitCount").value(bitCount);
- writer.key("popCount").value(popCount);
- writer.key("loadFactor").value(loadFactor);
- writer.key("expectedFpp").value(expectedFpp);
- }
-
- private static void writeRowGroupIndexes(JSONWriter writer, int col,
- OrcProto.RowIndex[] rowGroupIndex)
- throws JSONException {
-
- OrcProto.RowIndex index;
- if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
- ((index = rowGroupIndex[col]) == null)) {
- return;
- }
-
- writer.key("rowGroupIndexes").array();
- for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
- writer.object();
- writer.key("entryId").value(entryIx);
- OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
- if (entry == null) {
- continue;
- }
- OrcProto.ColumnStatistics colStats = entry.getStatistics();
- writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize(colStats));
- writer.key("positions").array();
- for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
- writer.value(entry.getPositions(posIx));
- }
- writer.endArray();
- writer.endObject();
- }
- writer.endArray();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
index 0dd58b7..b9094bf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
@@ -18,10 +18,7 @@
package org.apache.hadoop.hive.ql.io.orc;
import java.io.IOException;
-import java.util.ArrayDeque;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
@@ -29,22 +26,20 @@ import java.util.TreeMap;
import org.apache.orc.OrcUtils;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.AcidStats;
+import org.apache.orc.impl.OrcAcidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidTxnList;
-import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
@@ -494,7 +489,7 @@ public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct>{
Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
AcidUtils.ParsedDelta deltaDir = AcidUtils.parsedDelta(delta);
FileSystem fs = deltaFile.getFileSystem(conf);
- long length = getLastFlushLength(fs, deltaFile);
+ long length = OrcAcidUtils.getLastFlushLength(fs, deltaFile);
if (length != -1 && fs.exists(deltaFile)) {
Reader deltaReader = OrcFile.createReader(deltaFile,
OrcFile.readerOptions(conf).maxLength(length));
@@ -504,7 +499,7 @@ public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct>{
// it can produce wrong results (if the latest valid version of the record is filtered out by
// the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record)
// unless the delta only has insert events
- OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(deltaReader);
+ AcidStats acidStats = OrcAcidUtils.parseAcidStats(deltaReader);
if(acidStats.deletes > 0 || acidStats.updates > 0) {
deltaEventOptions = eventOptions.clone().searchArgument(null, null);
}
@@ -536,28 +531,6 @@ public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct>{
}
}
- /**
- * Read the side file to get the last flush length.
- * @param fs the file system to use
- * @param deltaFile the path of the delta file
- * @return the maximum size of the file to use
- * @throws IOException
- */
- static long getLastFlushLength(FileSystem fs,
- Path deltaFile) throws IOException {
- Path lengths = OrcRecordUpdater.getSideFile(deltaFile);
- long result = Long.MAX_VALUE;
- try (FSDataInputStream stream = fs.open(lengths)) {
- result = -1;
- while (stream.available() > 0) {
- result = stream.readLong();
- }
- return result;
- } catch (IOException ioe) {
- return result;
- }
- }
-
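The deleted helper documents the side-file format the dump tool relies on: the side file is a sequence of 8-byte longs, one appended per flush, and the last complete value is the usable length of the delta file. Stripped of the fallback handling (the original returns Long.MAX_VALUE when the side file cannot be opened at all), the read loop is:

static long lastFlushLength(FileSystem fs, Path sideFile) throws IOException {
  long result = -1;
  try (FSDataInputStream in = fs.open(sideFile)) {
    while (in.available() > 0) {
      result = in.readLong();  // keep overwriting; the last value wins
    }
  }
  return result;
}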
@VisibleForTesting
RecordIdentifier getMinKey() {
return minKey;
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java
index d085c58..4bf2403 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java
@@ -25,6 +25,8 @@ import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.List;
+import org.apache.orc.impl.AcidStats;
+import org.apache.orc.impl.OrcAcidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -55,7 +57,6 @@ public class OrcRecordUpdater implements RecordUpdater {
public static final String ACID_KEY_INDEX_NAME = "hive.acid.key.index";
public static final String ACID_FORMAT = "_orc_acid_version";
- public static final String ACID_STATS = "hive.acid.stats";
public static final int ORC_ACID_VERSION = 0;
@@ -102,46 +103,6 @@ public class OrcRecordUpdater implements RecordUpdater {
private LongObjectInspector origTxnInspector; // OI for the original txn inside the record
// identifer
- static class AcidStats {
- long inserts;
- long updates;
- long deletes;
-
- AcidStats() {
- // nothing
- }
-
- AcidStats(String serialized) {
- String[] parts = serialized.split(",");
- inserts = Long.parseLong(parts[0]);
- updates = Long.parseLong(parts[1]);
- deletes = Long.parseLong(parts[2]);
- }
-
- String serialize() {
- StringBuilder builder = new StringBuilder();
- builder.append(inserts);
- builder.append(",");
- builder.append(updates);
- builder.append(",");
- builder.append(deletes);
- return builder.toString();
- }
-
- @Override
- public String toString() {
- StringBuilder builder = new StringBuilder();
- builder.append(" inserts: ").append(inserts);
- builder.append(" updates: ").append(updates);
- builder.append(" deletes: ").append(deletes);
- return builder.toString();
- }
- }
-
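The serialized form used by the deleted class above is just three comma-separated longs, so a round trip looks like this (values hypothetical):

AcidStats stats = new AcidStats("12,3,0");
// stats.inserts == 12, stats.updates == 3, stats.deletes == 0
assert stats.serialize().equals("12,3,0");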
- public static Path getSideFile(Path main) {
- return new Path(main + AcidUtils.DELTA_SIDE_FILE_SUFFIX);
- }
-
static int getOperation(OrcStruct struct) {
return ((IntWritable) struct.getFieldValue(OPERATION)).get();
}
@@ -237,7 +198,7 @@ public class OrcRecordUpdater implements RecordUpdater {
}
if (options.getMinimumTransactionId() != options.getMaximumTransactionId()
&& !options.isWritingBase()){
- flushLengths = fs.create(getSideFile(this.path), true, 8,
+ flushLengths = fs.create(OrcAcidUtils.getSideFile(this.path), true, 8,
options.getReporter());
} else {
flushLengths = null;
@@ -297,7 +258,7 @@ public class OrcRecordUpdater implements RecordUpdater {
}
Reader reader = OrcFile.createReader(matchingBucket, OrcFile.readerOptions(options.getConfiguration()));
//no close() on Reader?!
- AcidStats acidStats = parseAcidStats(reader);
+ AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
if(acidStats.inserts > 0) {
return acidStats.inserts;
}
@@ -412,7 +373,7 @@ public class OrcRecordUpdater implements RecordUpdater {
}
if (flushLengths != null) {
flushLengths.close();
- fs.delete(getSideFile(path), false);
+ fs.delete(OrcAcidUtils.getSideFile(path), false);
}
writer = null;
}
@@ -456,26 +417,6 @@ public class OrcRecordUpdater implements RecordUpdater {
}
return result;
}
- /**
- * {@link KeyIndexBuilder} creates these
- */
- static AcidStats parseAcidStats(Reader reader) {
- if (reader.hasMetadataValue(OrcRecordUpdater.ACID_STATS)) {
- String statsSerialized;
- try {
- ByteBuffer val =
- reader.getMetadataValue(OrcRecordUpdater.ACID_STATS)
- .duplicate();
- statsSerialized = utf8Decoder.decode(val).toString();
- } catch (CharacterCodingException e) {
- throw new IllegalArgumentException("Bad string encoding for " +
- OrcRecordUpdater.ACID_STATS, e);
- }
- return new AcidStats(statsSerialized);
- } else {
- return null;
- }
- }
static class KeyIndexBuilder implements OrcFile.WriterCallback {
StringBuilder lastKey = new StringBuilder();
@@ -500,7 +441,7 @@ public class OrcRecordUpdater implements RecordUpdater {
) throws IOException {
context.getWriter().addUserMetadata(ACID_KEY_INDEX_NAME,
UTF8.encode(lastKey.toString()));
- context.getWriter().addUserMetadata(ACID_STATS,
+ context.getWriter().addUserMetadata(OrcAcidUtils.ACID_STATS,
UTF8.encode(acidStats.serialize()));
}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
index b7437be..3a2e7d8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
@@ -22,17 +22,9 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
import java.util.List;
-import java.util.Set;
-import com.google.common.collect.Lists;
-import org.apache.orc.OrcUtils;
-import org.apache.orc.TypeDescription;
import org.apache.orc.impl.BufferChunk;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.impl.ColumnStatisticsImpl;
import org.apache.orc.CompressionCodec;
import org.apache.orc.FileMetaInfo;
import org.apache.orc.FileMetadata;
@@ -41,47 +33,25 @@ import org.apache.orc.StripeInformation;
import org.apache.orc.StripeStatistics;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.io.DiskRange;
-import org.apache.hadoop.hive.ql.io.FileFormatException;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
-import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.io.Text;
import org.apache.orc.OrcProto;
+import com.google.common.collect.Lists;
import com.google.protobuf.CodedInputStream;
-public class ReaderImpl implements Reader {
+public class ReaderImpl extends org.apache.orc.impl.ReaderImpl
+ implements Reader {
private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class);
private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;
- protected final FileSystem fileSystem;
- private final long maxLength;
- protected final Path path;
- protected final org.apache.orc.CompressionKind compressionKind;
- protected final CompressionCodec codec;
- protected final int bufferSize;
- private final List<OrcProto.StripeStatistics> stripeStats;
- private final int metadataSize;
- protected final List<OrcProto.Type> types;
- private final TypeDescription schema;
- private final List<OrcProto.UserMetadataItem> userMetadata;
- private final List<OrcProto.ColumnStatistics> fileStats;
- private final List<StripeInformation> stripes;
- protected final int rowIndexStride;
- private final long contentLength, numberOfRows;
-
private final ObjectInspector inspector;
- private long deserializedSize = -1;
- protected final Configuration conf;
- private final List<Integer> versionList;
- private final OrcFile.WriterVersion writerVersion;
//serialized footer - Keeping this around for use by getFileMetaInfo()
// will help avoid cpu cycles spend in deserializing at cost of increased
@@ -91,83 +61,9 @@ public class ReaderImpl implements Reader {
// This will only be set if the file footer/metadata was read from disk.
private final ByteBuffer footerMetaAndPsBuffer;
- public static class StripeInformationImpl
- implements StripeInformation {
- private final OrcProto.StripeInformation stripe;
-
- public StripeInformationImpl(OrcProto.StripeInformation stripe) {
- this.stripe = stripe;
- }
-
- @Override
- public long getOffset() {
- return stripe.getOffset();
- }
-
- @Override
- public long getLength() {
- return stripe.getDataLength() + getIndexLength() + getFooterLength();
- }
-
- @Override
- public long getDataLength() {
- return stripe.getDataLength();
- }
-
- @Override
- public long getFooterLength() {
- return stripe.getFooterLength();
- }
-
- @Override
- public long getIndexLength() {
- return stripe.getIndexLength();
- }
-
- @Override
- public long getNumberOfRows() {
- return stripe.getNumberOfRows();
- }
-
- @Override
- public String toString() {
- return "offset: " + getOffset() + " data: " + getDataLength() +
- " rows: " + getNumberOfRows() + " tail: " + getFooterLength() +
- " index: " + getIndexLength();
- }
- }
-
@Override
- public long getNumberOfRows() {
- return numberOfRows;
- }
-
- @Override
- public List<String> getMetadataKeys() {
- List<String> result = new ArrayList<String>();
- for(OrcProto.UserMetadataItem item: userMetadata) {
- result.add(item.getName());
- }
- return result;
- }
-
- @Override
- public ByteBuffer getMetadataValue(String key) {
- for(OrcProto.UserMetadataItem item: userMetadata) {
- if (item.hasName() && item.getName().equals(key)) {
- return item.getValue().asReadOnlyByteBuffer();
- }
- }
- throw new IllegalArgumentException("Can't find user metadata " + key);
- }
-
- public boolean hasMetadataValue(String key) {
- for(OrcProto.UserMetadataItem item: userMetadata) {
- if (item.hasName() && item.getName().equals(key)) {
- return true;
- }
- }
- return false;
+ public ObjectInspector getObjectInspector() {
+ return inspector;
}
@Override
@@ -181,181 +77,19 @@ public class ReaderImpl implements Reader {
compressionKind);
}
- @Override
- public org.apache.orc.CompressionKind getCompressionKind() {
- return compressionKind;
- }
-
- @Override
- public int getCompressionSize() {
- return bufferSize;
- }
-
- @Override
- public List<StripeInformation> getStripes() {
- return stripes;
- }
-
- @Override
- public ObjectInspector getObjectInspector() {
- return inspector;
- }
-
- @Override
- public long getContentLength() {
- return contentLength;
- }
-
- @Override
- public List<OrcProto.Type> getTypes() {
- return types;
- }
-
- @Override
- public OrcFile.Version getFileVersion() {
- for (OrcFile.Version version: OrcFile.Version.values()) {
- if ((versionList != null && !versionList.isEmpty()) &&
- version.getMajor() == versionList.get(0) &&
- version.getMinor() == versionList.get(1)) {
- return version;
- }
- }
- return OrcFile.Version.V_0_11;
- }
-
- @Override
- public OrcFile.WriterVersion getWriterVersion() {
- return writerVersion;
- }
-
- @Override
- public int getRowIndexStride() {
- return rowIndexStride;
- }
-
- @Override
- public ColumnStatistics[] getStatistics() {
- ColumnStatistics[] result = new ColumnStatistics[types.size()];
- for(int i=0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(fileStats.get(i));
- }
- return result;
- }
-
- @Override
- public TypeDescription getSchema() {
- return schema;
- }
-
- /**
- * Ensure this is an ORC file to prevent users from trying to read text
- * files or RC files as ORC files.
- * @param in the file being read
- * @param path the filename for error messages
- * @param psLen the postscript length
- * @param buffer the tail of the file
- * @throws IOException
- */
- static void ensureOrcFooter(FSDataInputStream in,
- Path path,
- int psLen,
- ByteBuffer buffer) throws IOException {
- int magicLength = OrcFile.MAGIC.length();
- int fullLength = magicLength + 1;
- if (psLen < fullLength || buffer.remaining() < fullLength) {
- throw new FileFormatException("Malformed ORC file " + path +
- ". Invalid postscript length " + psLen);
- }
- int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
- byte[] array = buffer.array();
- // now look for the magic string at the end of the postscript.
- if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
- // If it isn't there, this may be the 0.11.0 version of ORC.
- // Read the first 3 bytes of the file to check for the header
- byte[] header = new byte[magicLength];
- in.readFully(0, header, 0, magicLength);
- // if it isn't there, this isn't an ORC file
- if (!Text.decode(header, 0 , magicLength).equals(OrcFile.MAGIC)) {
- throw new FileFormatException("Malformed ORC file " + path +
- ". Invalid postscript.");
- }
- }
- }
-
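The deleted check encodes the ORC tail layout: a file ends with the postscript, whose last bytes are the magic "ORC", followed by a single byte holding the postscript length. A minimal sketch of the same test over a byte[] holding the file's tail (the original additionally accepts pre-0.12 files whose magic appears only at offset 0):

import java.nio.charset.StandardCharsets;

static boolean endsWithOrcMagic(byte[] tail) {
  byte[] magic = "ORC".getBytes(StandardCharsets.US_ASCII);
  if (tail.length < magic.length + 1) {
    return false;  // too short to hold the magic plus the psLen byte
  }
  int offset = tail.length - magic.length - 1;  // magic sits just before psLen
  for (int i = 0; i < magic.length; i++) {
    if (tail[offset + i] != magic[i]) {
      return false;
    }
  }
  return true;
}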
- /**
- * Build a version string out of an array.
- * @param version the version number as a list
- * @return the human readable form of the version string
- */
- private static String versionString(List<Integer> version) {
- StringBuilder buffer = new StringBuilder();
- for(int i=0; i < version.size(); ++i) {
- if (i != 0) {
- buffer.append('.');
- }
- buffer.append(version.get(i));
- }
- return buffer.toString();
- }
-
- /**
- * Check to see if this ORC file is from a future version and if so,
- * warn the user that we may not be able to read all of the column encodings.
- * @param log the logger to write any error message to
- * @param path the data source path for error messages
- * @param version the version of hive that wrote the file.
- */
- static void checkOrcVersion(Logger log, Path path, List<Integer> version) {
- if (version.size() >= 1) {
- int major = version.get(0);
- int minor = 0;
- if (version.size() >= 2) {
- minor = version.get(1);
- }
- if (major > OrcFile.Version.CURRENT.getMajor() ||
- (major == OrcFile.Version.CURRENT.getMajor() &&
- minor > OrcFile.Version.CURRENT.getMinor())) {
- log.warn(path + " was written by a future Hive version " +
- versionString(version) +
- ". This file may not be readable by this version of Hive.");
- }
- }
- }
-
/**
* Constructor that lets the user specify additional options.
* @param path pathname for file
* @param options options for reading
* @throws IOException
*/
- public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
- FileSystem fs = options.getFilesystem();
- if (fs == null) {
- fs = path.getFileSystem(options.getConfiguration());
- }
- this.fileSystem = fs;
- this.path = path;
- this.conf = options.getConfiguration();
- this.maxLength = options.getMaxLength();
-
+ public ReaderImpl(Path path,
+ OrcFile.ReaderOptions options) throws IOException {
+ super(path, options);
FileMetadata fileMetadata = options.getFileMetadata();
if (fileMetadata != null) {
- this.compressionKind = fileMetadata.getCompressionKind();
- this.bufferSize = fileMetadata.getCompressionBufferSize();
- this.codec = WriterImpl.createCodec(compressionKind);
- this.metadataSize = fileMetadata.getMetadataSize();
- this.stripeStats = fileMetadata.getStripeStats();
- this.versionList = fileMetadata.getVersionList();
- this.writerVersion = OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum());
- this.types = fileMetadata.getTypes();
- this.rowIndexStride = fileMetadata.getRowIndexStride();
- this.contentLength = fileMetadata.getContentLength();
- this.numberOfRows = fileMetadata.getNumberOfRows();
- this.fileStats = fileMetadata.getFileStats();
- this.stripes = fileMetadata.getStripes();
this.inspector = OrcStruct.createObjectInspector(0, fileMetadata.getTypes());
this.footerByteBuffer = null; // not cached and not needed here
- this.userMetadata = null; // not cached and not needed here
this.footerMetaAndPsBuffer = null;
} else {
FileMetaInfo footerMetaData;
@@ -363,7 +97,7 @@ public class ReaderImpl implements Reader {
footerMetaData = options.getFileMetaInfo();
this.footerMetaAndPsBuffer = null;
} else {
- footerMetaData = extractMetaInfoFromFooter(fs, path,
+ footerMetaData = extractMetaInfoFromFooter(fileSystem, path,
options.getMaxLength());
this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer;
}
@@ -374,37 +108,8 @@ public class ReaderImpl implements Reader {
footerMetaData.footerBuffer
);
this.footerByteBuffer = footerMetaData.footerBuffer;
- this.compressionKind = rInfo.compressionKind;
- this.codec = rInfo.codec;
- this.bufferSize = rInfo.bufferSize;
- this.metadataSize = rInfo.metadataSize;
- this.stripeStats = rInfo.metadata.getStripeStatsList();
- this.types = rInfo.footer.getTypesList();
- this.rowIndexStride = rInfo.footer.getRowIndexStride();
- this.contentLength = rInfo.footer.getContentLength();
- this.numberOfRows = rInfo.footer.getNumberOfRows();
- this.userMetadata = rInfo.footer.getMetadataList();
- this.fileStats = rInfo.footer.getStatisticsList();
this.inspector = rInfo.inspector;
- this.versionList = footerMetaData.versionList;
- this.writerVersion = footerMetaData.writerVersion;
- this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList());
}
- this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0);
- }
-
- /**
- * Get the WriterVersion based on the ORC file postscript.
- * @param writerVersion the integer writer version
- * @return the writer version of the file
- */
- static OrcFile.WriterVersion getWriterVersion(int writerVersion) {
- for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) {
- if (version.getId() == writerVersion) {
- return version;
- }
- }
- return OrcFile.WriterVersion.FUTURE;
}
/** Extracts the necessary metadata from an externally store buffer (fullFooterBuffer). */
@@ -565,20 +270,6 @@ public class ReaderImpl implements Reader {
);
}
- private static OrcFile.WriterVersion extractWriterVersion(OrcProto.PostScript ps) {
- return (ps.hasWriterVersion()
- ? getWriterVersion(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL);
- }
-
- private static List<StripeInformation> convertProtoStripesToStripes(
- List<OrcProto.StripeInformation> stripes) {
- List<StripeInformation> result = new ArrayList<StripeInformation>(stripes.size());
- for (OrcProto.StripeInformation info : stripes) {
- result.add(new StripeInformationImpl(info));
- }
- return result;
- }
-
/**
* MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl
* from serialized fields.
@@ -617,7 +308,8 @@ public class ReaderImpl implements Reader {
public FileMetaInfo getFileMetaInfo() {
return new FileMetaInfo(compressionKind.toString(), bufferSize,
- metadataSize, footerByteBuffer, versionList, writerVersion, footerMetaAndPsBuffer);
+ getMetadataSize(), footerByteBuffer, getVersionList(),
+ getWriterVersion(), footerMetaAndPsBuffer);
}
/** Same as FileMetaInfo, but with extra fields. FileMetaInfo is serialized for splits
@@ -697,184 +389,7 @@ public class ReaderImpl implements Reader {
}
@Override
- public long getRawDataSize() {
- // if the deserializedSize is not computed, then compute it, else
- // return the already computed size. since we are reading from the footer
- // we don't have to compute deserialized size repeatedly
- if (deserializedSize == -1) {
- List<Integer> indices = Lists.newArrayList();
- for (int i = 0; i < fileStats.size(); ++i) {
- indices.add(i);
- }
- deserializedSize = getRawDataSizeFromColIndices(indices);
- }
- return deserializedSize;
- }
-
- @Override
- public long getRawDataSizeFromColIndices(List<Integer> colIndices) {
- return getRawDataSizeFromColIndices(colIndices, types, fileStats);
- }
-
- public static long getRawDataSizeFromColIndices(
- List<Integer> colIndices, List<OrcProto.Type> types,
- List<OrcProto.ColumnStatistics> stats) {
- long result = 0;
- for (int colIdx : colIndices) {
- result += getRawDataSizeOfColumn(colIdx, types, stats);
- }
- return result;
- }
-
- private static long getRawDataSizeOfColumn(int colIdx, List<OrcProto.Type> types,
- List<OrcProto.ColumnStatistics> stats) {
- OrcProto.ColumnStatistics colStat = stats.get(colIdx);
- long numVals = colStat.getNumberOfValues();
- OrcProto.Type type = types.get(colIdx);
-
- switch (type.getKind()) {
- case BINARY:
- // old orc format doesn't support binary statistics. checking for binary
- // statistics is not required as protocol buffers takes care of it.
- return colStat.getBinaryStatistics().getSum();
- case STRING:
- case CHAR:
- case VARCHAR:
- // old orc format doesn't support sum for string statistics. checking for
- // existence is not required as protocol buffers takes care of it.
-
- // ORC strings are deserialized to java strings. so use java data model's
- // string size
- numVals = numVals == 0 ? 1 : numVals;
- int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals);
- return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen);
- case TIMESTAMP:
- return numVals * JavaDataModel.get().lengthOfTimestamp();
- case DATE:
- return numVals * JavaDataModel.get().lengthOfDate();
- case DECIMAL:
- return numVals * JavaDataModel.get().lengthOfDecimal();
- case DOUBLE:
- case LONG:
- return numVals * JavaDataModel.get().primitive2();
- case FLOAT:
- case INT:
- case SHORT:
- case BOOLEAN:
- case BYTE:
- return numVals * JavaDataModel.get().primitive1();
- default:
- LOG.debug("Unknown primitive category: " + type.getKind());
- break;
- }
-
- return 0;
- }
-
- @Override
- public long getRawDataSizeOfColumns(List<String> colNames) {
- List<Integer> colIndices = getColumnIndicesFromNames(colNames);
- return getRawDataSizeFromColIndices(colIndices);
- }
-
- private List<Integer> getColumnIndicesFromNames(List<String> colNames) {
- // top level struct
- OrcProto.Type type = types.get(0);
- List<Integer> colIndices = Lists.newArrayList();
- List<String> fieldNames = type.getFieldNamesList();
- int fieldIdx = 0;
- for (String colName : colNames) {
- if (fieldNames.contains(colName)) {
- fieldIdx = fieldNames.indexOf(colName);
- } else {
- String s = "Cannot find field for: " + colName + " in ";
- for (String fn : fieldNames) {
- s += fn + ", ";
- }
- LOG.warn(s);
- continue;
- }
-
- // a single field may span multiple columns. find start and end column
- // index for the requested field
- int idxStart = type.getSubtypes(fieldIdx);
-
- int idxEnd;
-
- // if the specified is the last field and then end index will be last
- // column index
- if (fieldIdx + 1 > fieldNames.size() - 1) {
- idxEnd = getLastIdx() + 1;
- } else {
- idxEnd = type.getSubtypes(fieldIdx + 1);
- }
-
- // if start index and end index are same then the field is a primitive
- // field else complex field (like map, list, struct, union)
- if (idxStart == idxEnd) {
- // simple field
- colIndices.add(idxStart);
- } else {
- // complex fields spans multiple columns
- for (int i = idxStart; i < idxEnd; i++) {
- colIndices.add(i);
- }
- }
- }
- return colIndices;
- }
-
- private int getLastIdx() {
- Set<Integer> indices = new HashSet<>();
- for (OrcProto.Type type : types) {
- indices.addAll(type.getSubtypesList());
- }
- return Collections.max(indices);
- }
-
- @Override
- public List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics() {
- return stripeStats;
- }
-
- @Override
- public List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics() {
- return fileStats;
- }
-
- @Override
- public List<StripeStatistics> getStripeStatistics() {
- List<StripeStatistics> result = new ArrayList<>();
- for (OrcProto.StripeStatistics ss : stripeStats) {
- result.add(new StripeStatistics(ss.getColStatsList()));
- }
- return result;
- }
-
- public List<OrcProto.UserMetadataItem> getOrcProtoUserMetadata() {
- return userMetadata;
- }
-
- @Override
- public List<Integer> getVersionList() {
- return versionList;
- }
-
- @Override
- public int getMetadataSize() {
- return metadataSize;
- }
-
- @Override
public String toString() {
- StringBuilder buffer = new StringBuilder();
- buffer.append("ORC Reader(");
- buffer.append(path);
- if (maxLength != -1) {
- buffer.append(", ");
- buffer.append(maxLength);
- }
- buffer.append(")");
- return buffer.toString();
+ return "Hive " + super.toString();
}
}
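
Everything deleted above the toString() override (the raw-data-size accounting, stripe statistics, and version and metadata accessors) now lives in org.apache.orc.impl.ReaderImpl and is inherited, so existing callers are unaffected; note how getFileMetaInfo() above now calls the inherited getters. A minimal usage sketch, assuming a placeholder path and column names (this is illustrative, not code from this commit):

    import java.io.IOException;
    import java.util.Arrays;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.io.orc.OrcFile;
    import org.apache.hadoop.hive.ql.io.orc.Reader;

    static long rawSize(Configuration conf) throws IOException {
      Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
          OrcFile.readerOptions(conf));
      // Answered from footer statistics, now computed by the inherited base class.
      return reader.getRawDataSizeOfColumns(Arrays.asList("col1", "col2"));
    }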
[27/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Fixes #72.
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/ffb79509
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/ffb79509
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/ffb79509
Branch: refs/heads/master
Commit: ffb79509bcaefb9e7f916930edb022371b9d810f
Parents: fd06601
Author: Owen O'Malley <om...@apache.org>
Authored: Fri May 20 14:20:00 2016 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Fri May 20 14:21:39 2016 -0700
----------------------------------------------------------------------
bin/ext/orcfiledump.cmd | 2 +-
bin/ext/orcfiledump.sh | 2 +-
.../hive/hcatalog/streaming/TestStreaming.java | 9 +-
.../llap/io/decode/OrcEncodedDataConsumer.java | 2 +-
.../llap/io/encoded/OrcEncodedDataReader.java | 15 +-
.../hive/llap/io/metadata/OrcFileMetadata.java | 4 +-
orc/pom.xml | 27 +
.../org/apache/orc/FileFormatException.java | 30 +
orc/src/java/org/apache/orc/OrcFile.java | 6 +
orc/src/java/org/apache/orc/Reader.java | 2 +-
.../java/org/apache/orc/TypeDescription.java | 18 +-
orc/src/java/org/apache/orc/impl/AcidStats.java | 60 +
.../orc/impl/ConvertTreeReaderFactory.java | 2840 +++++++++++++
.../java/org/apache/orc/impl/HadoopShims.java | 79 +
.../org/apache/orc/impl/HadoopShimsCurrent.java | 30 +
.../org/apache/orc/impl/HadoopShims_2_2.java | 71 +-
.../java/org/apache/orc/impl/IntegerReader.java | 3 +-
.../java/org/apache/orc/impl/OrcAcidUtils.java | 85 +
.../java/org/apache/orc/impl/ReaderImpl.java | 758 ++++
.../org/apache/orc/impl/RecordReaderImpl.java | 1215 ++++++
.../org/apache/orc/impl/RecordReaderUtils.java | 578 +++
.../org/apache/orc/impl/SchemaEvolution.java | 190 +
.../org/apache/orc/impl/TreeReaderFactory.java | 2093 ++++++++++
.../java/org/apache/orc/impl/ZeroCopyShims.java | 89 +
orc/src/java/org/apache/orc/tools/FileDump.java | 934 +++++
.../java/org/apache/orc/tools/JsonFileDump.java | 406 ++
.../org/apache/orc/TestColumnStatistics.java | 364 ++
.../org/apache/orc/TestNewIntegerEncoding.java | 1373 +++++++
.../org/apache/orc/TestOrcNullOptimization.java | 415 ++
.../test/org/apache/orc/TestOrcTimezone1.java | 189 +
.../test/org/apache/orc/TestOrcTimezone2.java | 143 +
.../org/apache/orc/TestStringDictionary.java | 290 ++
.../org/apache/orc/TestTypeDescription.java | 68 +
.../org/apache/orc/TestUnrolledBitPack.java | 114 +
.../test/org/apache/orc/TestVectorOrcFile.java | 2782 +++++++++++++
.../org/apache/orc/impl/TestOrcWideTable.java | 64 +
orc/src/test/org/apache/orc/impl/TestRLEv2.java | 307 ++
.../org/apache/orc/impl/TestReaderImpl.java | 152 +
.../apache/orc/impl/TestRecordReaderImpl.java | 1691 ++++++++
.../org/apache/orc/impl/TestStreamName.java | 49 +
.../test/org/apache/orc/tools/TestFileDump.java | 486 +++
.../org/apache/orc/tools/TestJsonFileDump.java | 150 +
orc/src/test/resources/orc-file-11-format.orc | Bin 0 -> 373336 bytes
.../resources/orc-file-dump-bloomfilter.out | 179 +
.../resources/orc-file-dump-bloomfilter2.out | 179 +
.../orc-file-dump-dictionary-threshold.out | 190 +
orc/src/test/resources/orc-file-dump.json | 1355 +++++++
orc/src/test/resources/orc-file-dump.out | 195 +
orc/src/test/resources/orc-file-has-null.out | 112 +
.../expressions/CastDecimalToTimestamp.java | 8 +-
.../expressions/CastDoubleToTimestamp.java | 13 +-
.../vector/expressions/CastLongToTimestamp.java | 4 +-
.../CastMillisecondsLongToTimestamp.java | 7 +-
.../ql/exec/vector/expressions/StringExpr.java | 354 --
.../hive/ql/hooks/PostExecOrcFileDump.java | 4 +-
.../hadoop/hive/ql/io/FileFormatException.java | 30 -
.../ql/io/orc/ConvertTreeReaderFactory.java | 3750 ------------------
.../apache/hadoop/hive/ql/io/orc/FileDump.java | 884 -----
.../hadoop/hive/ql/io/orc/JsonFileDump.java | 401 --
.../hive/ql/io/orc/OrcRawRecordMerger.java | 35 +-
.../hadoop/hive/ql/io/orc/OrcRecordUpdater.java | 71 +-
.../hadoop/hive/ql/io/orc/ReaderImpl.java | 509 +--
.../hadoop/hive/ql/io/orc/RecordReaderImpl.java | 1823 ++++-----
.../hive/ql/io/orc/RecordReaderUtils.java | 586 ---
.../hadoop/hive/ql/io/orc/SchemaEvolution.java | 190 -
.../hive/ql/io/orc/TreeReaderFactory.java | 2525 ------------
.../ql/io/orc/encoded/EncodedReaderImpl.java | 2 +-
.../orc/encoded/EncodedTreeReaderFactory.java | 2 +-
.../apache/hadoop/hive/ql/TestTxnCommands.java | 2 -
.../TestTimestampWritableAndColumnVector.java | 7 +-
.../vector/expressions/TestVectorTypeCasts.java | 10 +-
.../exec/vector/udf/TestVectorUDFAdaptor.java | 2 -
.../hive/ql/io/orc/TestColumnStatistics.java | 352 --
.../hadoop/hive/ql/io/orc/TestFileDump.java | 418 --
.../hadoop/hive/ql/io/orc/TestJsonFileDump.java | 139 -
.../hive/ql/io/orc/TestNewIntegerEncoding.java | 1342 -------
.../hadoop/hive/ql/io/orc/TestOrcFile.java | 70 +-
.../hive/ql/io/orc/TestOrcNullOptimization.java | 400 --
.../hive/ql/io/orc/TestOrcRecordUpdater.java | 4 +-
.../hadoop/hive/ql/io/orc/TestOrcTimezone1.java | 194 -
.../hadoop/hive/ql/io/orc/TestOrcTimezone2.java | 142 -
.../hadoop/hive/ql/io/orc/TestOrcWideTable.java | 64 -
.../apache/hadoop/hive/ql/io/orc/TestRLEv2.java | 297 --
.../hadoop/hive/ql/io/orc/TestReaderImpl.java | 151 -
.../hive/ql/io/orc/TestRecordReaderImpl.java | 1678 --------
.../hadoop/hive/ql/io/orc/TestStreamName.java | 50 -
.../hive/ql/io/orc/TestStringDictionary.java | 261 --
.../hive/ql/io/orc/TestTypeDescription.java | 68 -
.../hive/ql/io/orc/TestUnrolledBitPack.java | 114 -
.../hive/ql/io/orc/TestVectorOrcFile.java | 2791 -------------
.../resources/orc-file-dump-bloomfilter.out | 179 -
.../resources/orc-file-dump-bloomfilter2.out | 179 -
.../orc-file-dump-dictionary-threshold.out | 190 -
ql/src/test/resources/orc-file-dump.json | 1355 -------
ql/src/test/resources/orc-file-dump.out | 195 -
ql/src/test/resources/orc-file-has-null.out | 112 -
.../results/clientpositive/orc_create.q.out | 12 +-
.../clientpositive/orc_int_type_promotion.q.out | 12 +-
...vol_orc_vec_mapwork_part_all_primitive.q.out | 40 +-
...vol_orc_vec_mapwork_part_all_primitive.q.out | 40 +-
.../clientpositive/vector_complex_all.q.out | 6 +-
.../hive/serde2/io/TimestampWritable.java | 114 +-
.../PrimitiveObjectInspectorUtils.java | 7 +-
.../hive/serde2/io/TestTimestampWritable.java | 41 +-
.../apache/hadoop/hive/shims/Hadoop23Shims.java | 63 +-
.../apache/hadoop/hive/shims/ZeroCopyShims.java | 86 -
.../apache/hadoop/hive/shims/HadoopShims.java | 70 -
.../hadoop/hive/shims/HadoopShimsSecure.java | 29 -
.../ql/exec/vector/TimestampColumnVector.java | 9 +-
.../ql/exec/vector/expressions/StringExpr.java | 354 ++
.../hive/ql/io/sarg/SearchArgumentImpl.java | 16 +-
.../hadoop/hive/ql/util/TimestampUtils.java | 94 +
112 files changed, 21796 insertions(+), 21556 deletions(-)
----------------------------------------------------------------------
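
In outline, the series moves the batch-oriented core reader into org.apache.orc.impl and leaves a thin shim in ql that rebuilds the old row-by-row API on top of it (see the RecordReaderImpl hunks later in this thread). A minimal sketch of that shim pattern, with simplified names; it illustrates the idea and is not the committed code:

    import java.io.IOException;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.Reader;
    import org.apache.orc.RecordReader;

    /** Sketch of the shim idea: row-by-row access layered on a batch reader. */
    class RowByRowShim {
      private final RecordReader core;         // batch reader from the orc module
      private final VectorizedRowBatch batch;  // current batch of decoded rows
      private int rowInBatch = 0;

      RowByRowShim(Reader reader) throws IOException {
        this.core = reader.rows(new Reader.Options());
        this.batch = reader.getSchema().createRowBatch();
        this.batch.size = 0;                   // force a fetch on first use
      }

      /** Refill the batch when it is exhausted; true if any rows remain. */
      boolean hasNext() throws IOException {
        if (rowInBatch >= batch.size) {
          rowInBatch = 0;
          return core.nextBatch(batch);
        }
        return true;
      }
    }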
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/bin/ext/orcfiledump.cmd
----------------------------------------------------------------------
diff --git a/bin/ext/orcfiledump.cmd b/bin/ext/orcfiledump.cmd
index f78ed7f..ff4b410 100644
--- a/bin/ext/orcfiledump.cmd
+++ b/bin/ext/orcfiledump.cmd
@@ -14,7 +14,7 @@
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
-set CLASS=org.apache.hadoop.hive.ql.io.orc.FileDump
+set CLASS=org.apache.orc.tools.FileDump
set HIVE_OPTS=
set HADOOP_CLASSPATH=
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/bin/ext/orcfiledump.sh
----------------------------------------------------------------------
diff --git a/bin/ext/orcfiledump.sh b/bin/ext/orcfiledump.sh
index 74f1a1e..c84e61c 100644
--- a/bin/ext/orcfiledump.sh
+++ b/bin/ext/orcfiledump.sh
@@ -17,7 +17,7 @@ THISSERVICE=orcfiledump
export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
orcfiledump () {
- CLASS=org.apache.hadoop.hive.ql.io.orc.FileDump
+ CLASS=org.apache.orc.tools.FileDump
HIVE_OPTS=''
execHiveCmd $CLASS "$@"
}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
----------------------------------------------------------------------
diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
index 6016425..4d2a2ee 100644
--- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
+++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
@@ -57,16 +57,15 @@ import org.apache.hadoop.hive.metastore.api.ShowLocksResponseElement;
import org.apache.hadoop.hive.metastore.api.TxnAbortedException;
import org.apache.hadoop.hive.metastore.api.TxnInfo;
import org.apache.hadoop.hive.metastore.api.TxnState;
-import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.metastore.txn.TxnDbUtil;
import org.apache.hadoop.hive.ql.CommandNeedRetryException;
import org.apache.hadoop.hive.ql.Driver;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.IOConstants;
-import org.apache.hadoop.hive.ql.io.orc.FileDump;
+import org.apache.orc.impl.OrcAcidUtils;
+import org.apache.orc.tools.FileDump;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
-import org.apache.hadoop.hive.ql.io.orc.OrcRecordUpdater;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
@@ -1089,7 +1088,7 @@ public class TestStreaming {
Reader reader = OrcFile.createReader(orcFile,
OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows(null);
+ RecordReader rows = reader.rows();
StructObjectInspector inspector = (StructObjectInspector) reader
.getObjectInspector();
@@ -1561,7 +1560,7 @@ public class TestStreaming {
final Map<String, List<Long>> offsetMap, final String key, final int numEntries)
throws IOException {
Path dataPath = new Path(file);
- Path sideFilePath = OrcRecordUpdater.getSideFile(dataPath);
+ Path sideFilePath = OrcAcidUtils.getSideFile(dataPath);
Path cPath = new Path(sideFilePath.getParent(), sideFilePath.getName() + ".corrupt");
FileSystem fs = sideFilePath.getFileSystem(conf);
List<Long> offsets = offsetMap.get(key);
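
The side-file helper used here moved from OrcRecordUpdater into the ORC module as OrcAcidUtils. A short sketch of resolving a delta bucket's flush-length side file, with a placeholder path:

    import org.apache.hadoop.fs.Path;
    import org.apache.orc.impl.OrcAcidUtils;

    Path bucket = new Path("/warehouse/t/delta_0000001_0000001/bucket_00000");
    // Returns the bucket path with the "_flush_length" suffix appended.
    Path sideFile = OrcAcidUtils.getSideFile(bucket);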
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java
index a689f10..619d1a4 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java
@@ -42,7 +42,7 @@ import org.apache.hadoop.hive.ql.io.orc.encoded.EncodedTreeReaderFactory.Settabl
import org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey;
import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch;
import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl;
-import org.apache.hadoop.hive.ql.io.orc.TreeReaderFactory;
+import org.apache.orc.impl.TreeReaderFactory;
import org.apache.hadoop.hive.ql.io.orc.WriterImpl;
import org.apache.orc.OrcProto;
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java
index 7effe69..69c0647 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java
@@ -67,13 +67,12 @@ import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSplit;
import org.apache.hadoop.hive.ql.io.orc.encoded.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl;
-import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.SargApplier;
import org.apache.hadoop.hive.ql.io.orc.encoded.EncodedOrcFile;
import org.apache.hadoop.hive.ql.io.orc.encoded.EncodedReader;
import org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey;
import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch;
import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.PoolFactory;
-import org.apache.hadoop.hive.ql.io.orc.RecordReaderUtils;
+import org.apache.orc.impl.RecordReaderUtils;
import org.apache.orc.StripeInformation;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.mapred.FileSplit;
@@ -343,7 +342,8 @@ public class OrcEncodedDataReader extends CallableWithNdc<Void>
// intermediate changes for individual columns will unset values in the array.
// Skip this case for 0-column read. We could probably special-case it just like we do
// in EncodedReaderImpl, but for now it's not that important.
- if (colRgs.length > 0 && colRgs[0] == SargApplier.READ_NO_RGS) continue;
+ if (colRgs.length > 0 && colRgs[0] ==
+ RecordReaderImpl.SargApplier.READ_NO_RGS) continue;
// 6.1. Determine the columns to read (usually the same as requested).
if (cols == null || cols.size() == colRgs.length) {
@@ -691,12 +691,13 @@ public class OrcEncodedDataReader extends CallableWithNdc<Void>
*/
private boolean determineRgsToRead(boolean[] globalIncludes, int rowIndexStride,
ArrayList<OrcStripeMetadata> metadata) throws IOException {
- SargApplier sargApp = null;
+ RecordReaderImpl.SargApplier sargApp = null;
if (sarg != null && rowIndexStride != 0) {
List<OrcProto.Type> types = fileMetadata.getTypes();
String[] colNamesForSarg = OrcInputFormat.getSargColumnNames(
columnNames, types, globalIncludes, fileMetadata.isOriginalFormat());
- sargApp = new SargApplier(sarg, colNamesForSarg, rowIndexStride, types, globalIncludes.length);
+ sargApp = new RecordReaderImpl.SargApplier(sarg, colNamesForSarg,
+ rowIndexStride, types, globalIncludes.length);
}
boolean hasAnyData = false;
// readState should have been initialized by this time with an empty array.
@@ -710,8 +711,8 @@ public class OrcEncodedDataReader extends CallableWithNdc<Void>
rgsToRead = sargApp.pickRowGroups(stripe, stripeMetadata.getRowIndexes(),
stripeMetadata.getBloomFilterIndexes(), true);
}
- boolean isNone = rgsToRead == SargApplier.READ_NO_RGS,
- isAll = rgsToRead == SargApplier.READ_ALL_RGS;
+ boolean isNone = rgsToRead == RecordReaderImpl.SargApplier.READ_NO_RGS,
+ isAll = rgsToRead == RecordReaderImpl.SargApplier.READ_ALL_RGS;
hasAnyData = hasAnyData || !isNone;
if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
if (isNone) {
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/llap-server/src/java/org/apache/hadoop/hive/llap/io/metadata/OrcFileMetadata.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/metadata/OrcFileMetadata.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/metadata/OrcFileMetadata.java
index 4e42a0f..c9b0a4d 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/metadata/OrcFileMetadata.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/metadata/OrcFileMetadata.java
@@ -29,11 +29,11 @@ import org.apache.hadoop.hive.llap.cache.LlapCacheableBuffer;
import org.apache.hadoop.hive.ql.io.SyntheticFileId;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.Reader;
-import org.apache.hadoop.hive.ql.io.orc.ReaderImpl.StripeInformationImpl;
import org.apache.orc.CompressionKind;
import org.apache.orc.FileMetadata;
import org.apache.orc.OrcProto;
import org.apache.orc.StripeInformation;
+import org.apache.orc.impl.ReaderImpl;
/** ORC file metadata. Currently contains some duplicate info due to how different parts
* of ORC use different info. Ideally we would get rid of protobuf structs in code beyond reading,
@@ -72,7 +72,7 @@ public final class OrcFileMetadata extends LlapCacheableBuffer implements FileMe
@VisibleForTesting
public static OrcFileMetadata createDummy(Object fileKey) {
OrcFileMetadata ofm = new OrcFileMetadata(fileKey);
- ofm.stripes.add(new StripeInformationImpl(
+ ofm.stripes.add(new ReaderImpl.StripeInformationImpl(
OrcProto.StripeInformation.getDefaultInstance()));
ofm.fileStats.add(OrcProto.ColumnStatistics.getDefaultInstance());
ofm.stripeStats.add(OrcProto.StripeStatistics.newBuilder().addColStats(createStatsDummy()).build());
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/pom.xml
----------------------------------------------------------------------
diff --git a/orc/pom.xml b/orc/pom.xml
index 2d80c97..cc27077 100644
--- a/orc/pom.xml
+++ b/orc/pom.xml
@@ -72,6 +72,33 @@
</exclusions>
</dependency>
<dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ <version>${hadoop.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>javax.servlet</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>javax.servlet.jsp</groupId>
+ <artifactId>jsp-api</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
<groupId>org.iq80.snappy</groupId>
<artifactId>snappy</artifactId>
<version>${snappy.version}</version>
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/FileFormatException.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/FileFormatException.java b/orc/src/java/org/apache/orc/FileFormatException.java
new file mode 100644
index 0000000..2cebea7
--- /dev/null
+++ b/orc/src/java/org/apache/orc/FileFormatException.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import java.io.IOException;
+
+/**
+ * Thrown when an invalid file format is encountered.
+ */
+public class FileFormatException extends IOException {
+
+ public FileFormatException(String errMsg) {
+ super(errMsg);
+ }
+}
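
FileFormatException extends IOException, so existing catch blocks keep working while new code can match malformed-file errors precisely. A hedged sketch of the kind of check a reader might perform; the real ReaderImpl validation is more involved, and this helper is illustrative only:

    import org.apache.orc.FileFormatException;

    static void checkOrcMagic(byte[] header) throws FileFormatException {
      // ORC files begin with the 3-byte magic string "ORC".
      if (header.length < 3
          || header[0] != 'O' || header[1] != 'R' || header[2] != 'C') {
        throw new FileFormatException("Malformed ORC file: missing ORC magic");
      }
    }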
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/OrcFile.java b/orc/src/java/org/apache/orc/OrcFile.java
index 85506ff..7dd7333 100644
--- a/orc/src/java/org/apache/orc/OrcFile.java
+++ b/orc/src/java/org/apache/orc/OrcFile.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.orc.impl.MemoryManager;
+import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.WriterImpl;
/**
@@ -212,6 +213,11 @@ public class OrcFile {
return new ReaderOptions(conf);
}
+ public static Reader createReader(Path path,
+ ReaderOptions options) throws IOException {
+ return new ReaderImpl(path, options);
+ }
+
public interface WriterContext {
Writer getWriter();
}
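
The new factory lets callers open ORC files directly against the module, without going through Hive's ql layer. A minimal usage sketch, assuming a placeholder path:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.OrcFile;
    import org.apache.orc.Reader;

    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(new Configuration()));
    System.out.println("rows=" + reader.getNumberOfRows()
        + " schema=" + reader.getSchema());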
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/Reader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/Reader.java b/orc/src/java/org/apache/orc/Reader.java
index 39de763..87f3293 100644
--- a/orc/src/java/org/apache/orc/Reader.java
+++ b/orc/src/java/org/apache/orc/Reader.java
@@ -334,7 +334,7 @@ public interface Reader {
* @return a new RecordReader
* @throws IOException
*/
- RecordReader rowsOptions(Options options) throws IOException;
+ RecordReader rows(Options options) throws IOException;
/**
* @return List of integers representing version of the file, in order from major to minor.
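
The rename makes the Options form read like an overload of the plain rows() call rather than a separate entry point. A hedged sketch of batch reading through the renamed method (a default Options, which reads all columns, is assumed here):

    import java.io.IOException;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.Reader;
    import org.apache.orc.RecordReader;

    static long countRows(Reader reader) throws IOException {
      RecordReader rows = reader.rows(new Reader.Options());
      VectorizedRowBatch batch = reader.getSchema().createRowBatch();
      long count = 0;
      while (rows.nextBatch(batch)) {
        count += batch.size;   // rows delivered in this batch
      }
      rows.close();
      return count;
    }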
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/TypeDescription.java b/orc/src/java/org/apache/orc/TypeDescription.java
index b8e057e..ffe3c1f 100644
--- a/orc/src/java/org/apache/orc/TypeDescription.java
+++ b/orc/src/java/org/apache/orc/TypeDescription.java
@@ -344,25 +344,25 @@ public class TypeDescription {
case INT:
case LONG:
case DATE:
- return new LongColumnVector();
+ return new LongColumnVector(maxSize);
case TIMESTAMP:
- return new TimestampColumnVector();
+ return new TimestampColumnVector(maxSize);
case FLOAT:
case DOUBLE:
- return new DoubleColumnVector();
+ return new DoubleColumnVector(maxSize);
case DECIMAL:
- return new DecimalColumnVector(precision, scale);
+ return new DecimalColumnVector(maxSize, precision, scale);
case STRING:
case BINARY:
case CHAR:
case VARCHAR:
- return new BytesColumnVector();
+ return new BytesColumnVector(maxSize);
case STRUCT: {
ColumnVector[] fieldVector = new ColumnVector[children.size()];
for(int i=0; i < fieldVector.length; ++i) {
fieldVector[i] = children.get(i).createColumn(maxSize);
}
- return new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
+ return new StructColumnVector(maxSize,
fieldVector);
}
case UNION: {
@@ -370,14 +370,14 @@ public class TypeDescription {
for(int i=0; i < fieldVector.length; ++i) {
fieldVector[i] = children.get(i).createColumn(maxSize);
}
- return new UnionColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
+ return new UnionColumnVector(maxSize,
fieldVector);
}
case LIST:
- return new ListColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
+ return new ListColumnVector(maxSize,
children.get(0).createColumn(maxSize));
case MAP:
- return new MapColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
+ return new MapColumnVector(maxSize,
children.get(0).createColumn(maxSize),
children.get(1).createColumn(maxSize));
default:
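
Previously every branch ignored the caller's maxSize, allocating default-capacity vectors (VectorizedRowBatch.DEFAULT_SIZE for the nested types); now the requested size flows through each one. A short sketch, assuming a hand-built schema:

    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.TypeDescription;

    TypeDescription schema = TypeDescription.createStruct()
        .addField("x", TypeDescription.createInt())
        .addField("y", TypeDescription.createString());
    // Every column vector, including nested ones, is allocated 4096 slots.
    VectorizedRowBatch batch = schema.createRowBatch(4096);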
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/AcidStats.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/AcidStats.java b/orc/src/java/org/apache/orc/impl/AcidStats.java
new file mode 100644
index 0000000..6657fe9
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/AcidStats.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+/**
+ * Statistics about the ACID operations in an ORC file
+ */
+public class AcidStats {
+ public long inserts;
+ public long updates;
+ public long deletes;
+
+ public AcidStats() {
+ inserts = 0;
+ updates = 0;
+ deletes = 0;
+ }
+
+ public AcidStats(String serialized) {
+ String[] parts = serialized.split(",");
+ inserts = Long.parseLong(parts[0]);
+ updates = Long.parseLong(parts[1]);
+ deletes = Long.parseLong(parts[2]);
+ }
+
+ public String serialize() {
+ StringBuilder builder = new StringBuilder();
+ builder.append(inserts);
+ builder.append(",");
+ builder.append(updates);
+ builder.append(",");
+ builder.append(deletes);
+ return builder.toString();
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append(" inserts: ").append(inserts);
+ builder.append(" updates: ").append(updates);
+ builder.append(" deletes: ").append(deletes);
+ return builder.toString();
+ }
+}
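
AcidStats round-trips through a plain comma-separated string of the three counters. A short sketch:

    import org.apache.orc.impl.AcidStats;

    AcidStats stats = new AcidStats();
    stats.inserts = 100;
    stats.updates = 5;
    stats.deletes = 2;
    String wire = stats.serialize();       // "100,5,2"
    AcidStats copy = new AcidStats(wire);  // parses the three counters back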
[12/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
index 2199b11..e46ca51 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
@@ -18,1218 +18,923 @@
package org.apache.hadoop.hive.ql.io.orc;
import java.io.IOException;
-import java.math.BigDecimal;
-import java.sql.Date;
-import java.sql.Timestamp;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
-import java.util.Map;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.orc.BooleanColumnStatistics;
-import org.apache.orc.impl.BufferChunk;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.CompressionCodec;
-import org.apache.orc.DataReader;
-import org.apache.orc.DateColumnStatistics;
-import org.apache.orc.DecimalColumnStatistics;
-import org.apache.orc.DoubleColumnStatistics;
-import org.apache.orc.impl.DataReaderProperties;
-import org.apache.orc.impl.InStream;
-import org.apache.orc.IntegerColumnStatistics;
-import org.apache.orc.OrcConf;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.impl.PositionProvider;
-import org.apache.orc.impl.StreamName;
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.TimestampColumnStatistics;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.io.DiskRange;
-import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.BloomFilterIO;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
+import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
-import org.apache.orc.OrcProto;
+import org.apache.orc.TypeDescription;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-public class RecordReaderImpl implements RecordReader {
+public class RecordReaderImpl extends org.apache.orc.impl.RecordReaderImpl
+ implements RecordReader {
static final Logger LOG = LoggerFactory.getLogger(RecordReaderImpl.class);
- private static final boolean isLogDebugEnabled = LOG.isDebugEnabled();
- private static final Object UNKNOWN_VALUE = new Object();
- private final Path path;
- private final long firstRow;
- private final List<StripeInformation> stripes =
- new ArrayList<StripeInformation>();
- private OrcProto.StripeFooter stripeFooter;
- private final long totalRowCount;
- private final CompressionCodec codec;
- private final List<OrcProto.Type> types;
- private final int bufferSize;
- private final boolean[] included;
- private final long rowIndexStride;
- private long rowInStripe = 0;
- private int currentStripe = -1;
- private long rowBaseInStripe = 0;
- private long rowCountInStripe = 0;
- private final Map<StreamName, InStream> streams =
- new HashMap<StreamName, InStream>();
- DiskRangeList bufferChunks = null;
- private final TreeReaderFactory.TreeReader reader;
- private final OrcProto.RowIndex[] indexes;
- private final OrcProto.BloomFilterIndex[] bloomFilterIndices;
- private final SargApplier sargApp;
- // an array about which row groups aren't skipped
- private boolean[] includedRowGroups = null;
- private final DataReader dataReader;
+ private final VectorizedRowBatch batch;
+ private int rowInBatch;
+ private long baseRow;
- /**
- * Given a list of column names, find the given column and return the index.
- *
- * @param columnNames the list of potential column names
- * @param columnName the column name to look for
- * @param rootColumn offset the result with the rootColumn
- * @return the column number or -1 if the column wasn't found
- */
- static int findColumns(String[] columnNames,
- String columnName,
- int rootColumn) {
- for(int i=0; i < columnNames.length; ++i) {
- if (columnName.equals(columnNames[i])) {
- return i + rootColumn;
- }
- }
- return -1;
+ protected RecordReaderImpl(ReaderImpl fileReader,
+ Reader.Options options) throws IOException {
+ super(fileReader, options);
+ batch = this.schema.createRowBatch();
+ rowInBatch = 0;
}
/**
- * Find the mapping from predicate leaves to columns.
- * @param sargLeaves the search argument that we need to map
- * @param columnNames the names of the columns
- * @param rootColumn the offset of the top level row, which offsets the
- * result
- * @return an array mapping the sarg leaves to concrete column numbers
+ * If the current batch is empty, get a new one.
+ * @return true if we have rows available.
+ * @throws IOException
*/
- public static int[] mapSargColumnsToOrcInternalColIdx(List<PredicateLeaf> sargLeaves,
- String[] columnNames,
- int rootColumn) {
- int[] result = new int[sargLeaves.size()];
- Arrays.fill(result, -1);
- for(int i=0; i < result.length; ++i) {
- String colName = sargLeaves.get(i).getColumnName();
- result[i] = findColumns(columnNames, colName, rootColumn);
+ boolean ensureBatch() throws IOException {
+ if (rowInBatch >= batch.size) {
+ baseRow = super.getRowNumber();
+ rowInBatch = 0;
+ return super.nextBatch(batch);
}
- return result;
+ return true;
}
- protected RecordReaderImpl(ReaderImpl fileReader,
- Reader.Options options) throws IOException {
- SchemaEvolution treeReaderSchema;
- this.included = options.getInclude();
- included[0] = true;
- if (options.getSchema() == null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Schema on read not provided -- using file schema " +
- fileReader.getSchema());
- }
- treeReaderSchema = new SchemaEvolution(fileReader.getSchema(), included);
- } else {
+ @Override
+ public long getRowNumber() {
+ return baseRow + rowInBatch;
+ }
- // Now that we are creating a record reader for a file, validate that the schema to read
- // is compatible with the file schema.
- //
- treeReaderSchema = new SchemaEvolution(fileReader.getSchema(),
- options.getSchema(),
- included);
- }
- this.path = fileReader.path;
- this.codec = fileReader.codec;
- this.types = fileReader.types;
- this.bufferSize = fileReader.bufferSize;
- this.rowIndexStride = fileReader.rowIndexStride;
- FileSystem fileSystem = fileReader.fileSystem;
- SearchArgument sarg = options.getSearchArgument();
- if (sarg != null && rowIndexStride != 0) {
- sargApp = new SargApplier(
- sarg, options.getColumnNames(), rowIndexStride, types, included.length);
- } else {
- sargApp = null;
- }
- long rows = 0;
- long skippedRows = 0;
- long offset = options.getOffset();
- long maxOffset = options.getMaxOffset();
- for(StripeInformation stripe: fileReader.getStripes()) {
- long stripeStart = stripe.getOffset();
- if (offset > stripeStart) {
- skippedRows += stripe.getNumberOfRows();
- } else if (stripeStart < maxOffset) {
- this.stripes.add(stripe);
- rows += stripe.getNumberOfRows();
- }
- }
+ @Override
+ public boolean hasNext() throws IOException {
+ return ensureBatch();
+ }
- Boolean zeroCopy = options.getUseZeroCopy();
- if (zeroCopy == null) {
- zeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(fileReader.conf);
- }
- if (options.getDataReader() == null) {
- dataReader = RecordReaderUtils.createDefaultDataReader(
- DataReaderProperties.builder()
- .withBufferSize(bufferSize)
- .withCompression(fileReader.compressionKind)
- .withFileSystem(fileSystem)
- .withPath(path)
- .withTypeCount(types.size())
- .withZeroCopy(zeroCopy)
- .build());
+ @Override
+ public void seekToRow(long row) throws IOException {
+ if (row >= baseRow && row < baseRow + batch.size) {
+ rowInBatch = (int) (row - baseRow);
} else {
- dataReader = options.getDataReader();
+ super.seekToRow(row);
+ batch.size = 0;
+ ensureBatch();
}
- firstRow = skippedRows;
- totalRowCount = rows;
- Boolean skipCorrupt = options.getSkipCorruptRecords();
- if (skipCorrupt == null) {
- skipCorrupt = OrcConf.SKIP_CORRUPT_DATA.getBoolean(fileReader.conf);
- }
-
- reader = TreeReaderFactory.createTreeReader(treeReaderSchema.getReaderSchema(),
- treeReaderSchema, included, skipCorrupt);
- indexes = new OrcProto.RowIndex[types.size()];
- bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
- advanceToNextRow(reader, 0L, true);
}
- public static final class PositionProviderImpl implements PositionProvider {
- private final OrcProto.RowIndexEntry entry;
- private int index;
-
- public PositionProviderImpl(OrcProto.RowIndexEntry entry) {
- this(entry, 0);
+ @Override
+ public Object next(Object previous) throws IOException {
+ if (!ensureBatch()) {
+ return null;
}
-
- public PositionProviderImpl(OrcProto.RowIndexEntry entry, int startPos) {
- this.entry = entry;
- this.index = startPos;
+ if (schema.getCategory() == TypeDescription.Category.STRUCT) {
+ OrcStruct result;
+ List<TypeDescription> children = schema.getChildren();
+ int numberOfChildren = children.size();
+ if (previous == null || previous.getClass() != OrcStruct.class) {
+ result = new OrcStruct(numberOfChildren);
+ previous = result;
+ } else {
+ result = (OrcStruct) previous;
+ if (result.getNumFields() != numberOfChildren) {
+ result.setNumFields(numberOfChildren);
+ }
+ }
+ for(int i=0; i < numberOfChildren; ++i) {
+ result.setFieldValue(i, nextValue(batch.cols[i], rowInBatch,
+ children.get(i), result.getFieldValue(i)));
+ }
+ } else {
+ previous = nextValue(batch.cols[0], rowInBatch, schema, previous);
}
+ rowInBatch += 1;
+ return previous;
+ }
- @Override
- public long getNext() {
- return entry.getPositions(index++);
+ public boolean nextBatch(VectorizedRowBatch theirBatch) throws IOException {
+ // If the user hasn't been reading by row, use the fast path.
+ if (rowInBatch >= batch.size) {
+ return super.nextBatch(theirBatch);
}
+ copyIntoBatch(theirBatch, batch, rowInBatch);
+ rowInBatch += theirBatch.size;
+ return theirBatch.size > 0;
}
- OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException {
- return dataReader.readStripeFooter(stripe);
+ @Override
+ public void close() throws IOException {
+ super.close();
+ // free the memory for the column vectors
+ batch.cols = null;
}
- enum Location {
- BEFORE, MIN, MIDDLE, MAX, AFTER
- }
+ /* Routines for stubbing into Writables */
- /**
- * Given a point and min and max, determine if the point is before, at the
- * min, in the middle, at the max, or after the range.
- * @param point the point to test
- * @param min the minimum point
- * @param max the maximum point
- * @param <T> the type of the comparision
- * @return the location of the point
- */
- static <T> Location compareToRange(Comparable<T> point, T min, T max) {
- int minCompare = point.compareTo(min);
- if (minCompare < 0) {
- return Location.BEFORE;
- } else if (minCompare == 0) {
- return Location.MIN;
+ static BooleanWritable nextBoolean(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- int maxCompare = point.compareTo(max);
- if (maxCompare > 0) {
- return Location.AFTER;
- } else if (maxCompare == 0) {
- return Location.MAX;
+ if (vector.noNulls || !vector.isNull[row]) {
+ BooleanWritable result;
+ if (previous == null || previous.getClass() != BooleanWritable.class) {
+ result = new BooleanWritable();
+ } else {
+ result = (BooleanWritable) previous;
+ }
+ result.set(((LongColumnVector) vector).vector[row] != 0);
+ return result;
+ } else {
+ return null;
}
- return Location.MIDDLE;
}
- /**
- * Get the maximum value out of an index entry.
- * @param index
- * the index entry
- * @return the object for the maximum value or null if there isn't one
- */
- static Object getMax(ColumnStatistics index) {
- if (index instanceof IntegerColumnStatistics) {
- return ((IntegerColumnStatistics) index).getMaximum();
- } else if (index instanceof DoubleColumnStatistics) {
- return ((DoubleColumnStatistics) index).getMaximum();
- } else if (index instanceof StringColumnStatistics) {
- return ((StringColumnStatistics) index).getMaximum();
- } else if (index instanceof DateColumnStatistics) {
- return ((DateColumnStatistics) index).getMaximum();
- } else if (index instanceof DecimalColumnStatistics) {
- return ((DecimalColumnStatistics) index).getMaximum();
- } else if (index instanceof TimestampColumnStatistics) {
- return ((TimestampColumnStatistics) index).getMaximum();
- } else if (index instanceof BooleanColumnStatistics) {
- if (((BooleanColumnStatistics)index).getTrueCount()!=0) {
- return Boolean.TRUE;
+ static ByteWritable nextByte(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
+ }
+ if (vector.noNulls || !vector.isNull[row]) {
+ ByteWritable result;
+ if (previous == null || previous.getClass() != ByteWritable.class) {
+ result = new ByteWritable();
} else {
- return Boolean.FALSE;
+ result = (ByteWritable) previous;
}
+ result.set((byte) ((LongColumnVector) vector).vector[row]);
+ return result;
} else {
return null;
}
}
- /**
- * Get the minimum value out of an index entry.
- * @param index
- * the index entry
- * @return the object for the minimum value or null if there isn't one
- */
- static Object getMin(ColumnStatistics index) {
- if (index instanceof IntegerColumnStatistics) {
- return ((IntegerColumnStatistics) index).getMinimum();
- } else if (index instanceof DoubleColumnStatistics) {
- return ((DoubleColumnStatistics) index).getMinimum();
- } else if (index instanceof StringColumnStatistics) {
- return ((StringColumnStatistics) index).getMinimum();
- } else if (index instanceof DateColumnStatistics) {
- return ((DateColumnStatistics) index).getMinimum();
- } else if (index instanceof DecimalColumnStatistics) {
- return ((DecimalColumnStatistics) index).getMinimum();
- } else if (index instanceof TimestampColumnStatistics) {
- return ((TimestampColumnStatistics) index).getMinimum();
- } else if (index instanceof BooleanColumnStatistics) {
- if (((BooleanColumnStatistics)index).getFalseCount()!=0) {
- return Boolean.FALSE;
+ static ShortWritable nextShort(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
+ }
+ if (vector.noNulls || !vector.isNull[row]) {
+ ShortWritable result;
+ if (previous == null || previous.getClass() != ShortWritable.class) {
+ result = new ShortWritable();
} else {
- return Boolean.TRUE;
+ result = (ShortWritable) previous;
}
+ result.set((short) ((LongColumnVector) vector).vector[row]);
+ return result;
} else {
- return UNKNOWN_VALUE; // null is not safe here
+ return null;
}
}
- /**
- * Evaluate a predicate with respect to the statistics from the column
- * that is referenced in the predicate.
- * @param statsProto the statistics for the column mentioned in the predicate
- * @param predicate the leaf predicate we need to evaluation
- * @param bloomFilter
- * @return the set of truth values that may be returned for the given
- * predicate.
- */
- static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
- PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) {
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto);
- Object minValue = getMin(cs);
- Object maxValue = getMax(cs);
- BloomFilterIO bf = null;
- if (bloomFilter != null) {
- bf = new BloomFilterIO(bloomFilter);
+ static IntWritable nextInt(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf);
- }
-
- /**
- * Evaluate a predicate with respect to the statistics from the column
- * that is referenced in the predicate.
- * @param stats the statistics for the column mentioned in the predicate
- * @param predicate the leaf predicate we need to evaluation
- * @return the set of truth values that may be returned for the given
- * predicate.
- */
- static TruthValue evaluatePredicate(ColumnStatistics stats,
- PredicateLeaf predicate, BloomFilterIO bloomFilter) {
- Object minValue = getMin(stats);
- Object maxValue = getMax(stats);
- return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter);
- }
-
- static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min,
- Object max, boolean hasNull, BloomFilterIO bloomFilter) {
- // if we didn't have any values, everything must have been null
- if (min == null) {
- if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) {
- return TruthValue.YES;
+ if (vector.noNulls || !vector.isNull[row]) {
+ IntWritable result;
+ if (previous == null || previous.getClass() != IntWritable.class) {
+ result = new IntWritable();
} else {
- return TruthValue.NULL;
+ result = (IntWritable) previous;
}
- } else if (min == UNKNOWN_VALUE) {
- return TruthValue.YES_NO_NULL;
+ result.set((int) ((LongColumnVector) vector).vector[row]);
+ return result;
+ } else {
+ return null;
}
+ }
- TruthValue result;
- Object baseObj = predicate.getLiteral();
- try {
- // Predicate object and stats objects are converted to the type of the predicate object.
- Object minValue = getBaseObjectForComparison(predicate.getType(), min);
- Object maxValue = getBaseObjectForComparison(predicate.getType(), max);
- Object predObj = getBaseObjectForComparison(predicate.getType(), baseObj);
-
- result = evaluatePredicateMinMax(predicate, predObj, minValue, maxValue, hasNull);
- if (shouldEvaluateBloomFilter(predicate, result, bloomFilter)) {
- result = evaluatePredicateBloomFilter(predicate, predObj, bloomFilter, hasNull);
- }
- // in case failed conversion, return the default YES_NO_NULL truth value
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- final String statsType = min == null ?
- (max == null ? "null" : max.getClass().getSimpleName()) :
- min.getClass().getSimpleName();
- final String predicateType = baseObj == null ? "null" : baseObj.getClass().getSimpleName();
- final String reason = e.getClass().getSimpleName() + " when evaluating predicate." +
- " Skipping ORC PPD." +
- " Exception: " + e.getMessage() +
- " StatsType: " + statsType +
- " PredicateType: " + predicateType;
- LOG.warn(reason);
- LOG.debug(reason, e);
- }
- if (predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS) || !hasNull) {
- result = TruthValue.YES_NO;
+ static LongWritable nextLong(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
+ }
+ if (vector.noNulls || !vector.isNull[row]) {
+ LongWritable result;
+ if (previous == null || previous.getClass() != LongWritable.class) {
+ result = new LongWritable();
} else {
- result = TruthValue.YES_NO_NULL;
+ result = (LongWritable) previous;
}
+ result.set(((LongColumnVector) vector).vector[row]);
+ return result;
+ } else {
+ return null;
}
- return result;
}
- private static boolean shouldEvaluateBloomFilter(PredicateLeaf predicate,
- TruthValue result, BloomFilterIO bloomFilter) {
- // evaluate bloom filter only when
- // 1) Bloom filter is available
- // 2) Min/Max evaluation yield YES or MAYBE
- // 3) Predicate is EQUALS or IN list
- if (bloomFilter != null
- && result != TruthValue.NO_NULL && result != TruthValue.NO
- && (predicate.getOperator().equals(PredicateLeaf.Operator.EQUALS)
- || predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS)
- || predicate.getOperator().equals(PredicateLeaf.Operator.IN))) {
- return true;
+ static FloatWritable nextFloat(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- return false;
- }
-
- private static TruthValue evaluatePredicateMinMax(PredicateLeaf predicate, Object predObj,
- Object minValue,
- Object maxValue,
- boolean hasNull) {
- Location loc;
-
- switch (predicate.getOperator()) {
- case NULL_SAFE_EQUALS:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.BEFORE || loc == Location.AFTER) {
- return TruthValue.NO;
- } else {
- return TruthValue.YES_NO;
- }
- case EQUALS:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (minValue.equals(maxValue) && loc == Location.MIN) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- } else if (loc == Location.BEFORE || loc == Location.AFTER) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- case LESS_THAN:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.AFTER) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- } else if (loc == Location.BEFORE || loc == Location.MIN) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- case LESS_THAN_EQUALS:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.AFTER || loc == Location.MAX) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- } else if (loc == Location.BEFORE) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- case IN:
- if (minValue.equals(maxValue)) {
- // for a single value, look through to see if that value is in the
- // set
- for (Object arg : predicate.getLiteralList()) {
- predObj = getBaseObjectForComparison(predicate.getType(), arg);
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.MIN) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- }
- }
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- // are all of the values outside of the range?
- for (Object arg : predicate.getLiteralList()) {
- predObj = getBaseObjectForComparison(predicate.getType(), arg);
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.MIN || loc == Location.MIDDLE ||
- loc == Location.MAX) {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- }
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- }
- case BETWEEN:
- List<Object> args = predicate.getLiteralList();
- Object predObj1 = getBaseObjectForComparison(predicate.getType(), args.get(0));
-
- loc = compareToRange((Comparable) predObj1, minValue, maxValue);
- if (loc == Location.BEFORE || loc == Location.MIN) {
- Object predObj2 = getBaseObjectForComparison(predicate.getType(), args.get(1));
-
- Location loc2 = compareToRange((Comparable) predObj2, minValue, maxValue);
- if (loc2 == Location.AFTER || loc2 == Location.MAX) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- } else if (loc2 == Location.BEFORE) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- } else if (loc == Location.AFTER) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- case IS_NULL:
- // min = null condition above handles the all-nulls YES case
- return hasNull ? TruthValue.YES_NO : TruthValue.NO;
- default:
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ if (vector.noNulls || !vector.isNull[row]) {
+ FloatWritable result;
+ if (previous == null || previous.getClass() != FloatWritable.class) {
+ result = new FloatWritable();
+ } else {
+ result = (FloatWritable) previous;
+ }
+ result.set((float) ((DoubleColumnVector) vector).vector[row]);
+ return result;
+ } else {
+ return null;
}
}
- private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate,
- final Object predObj, BloomFilterIO bloomFilter, boolean hasNull) {
- switch (predicate.getOperator()) {
- case NULL_SAFE_EQUALS:
- // null safe equals does not return *_NULL variant. So set hasNull to false
- return checkInBloomFilter(bloomFilter, predObj, false);
- case EQUALS:
- return checkInBloomFilter(bloomFilter, predObj, hasNull);
- case IN:
- for (Object arg : predicate.getLiteralList()) {
-          // if at least one value in the IN list exists in the bloom filter, qualify the row group/stripe
- Object predObjItem = getBaseObjectForComparison(predicate.getType(), arg);
- TruthValue result = checkInBloomFilter(bloomFilter, predObjItem, hasNull);
- if (result == TruthValue.YES_NO_NULL || result == TruthValue.YES_NO) {
- return result;
- }
- }
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- default:
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ static DoubleWritable nextDouble(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- }
-
- private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) {
- TruthValue result = hasNull ? TruthValue.NO_NULL : TruthValue.NO;
-
- if (predObj instanceof Long) {
- if (bf.testLong(((Long) predObj).longValue())) {
- result = TruthValue.YES_NO_NULL;
- }
- } else if (predObj instanceof Double) {
- if (bf.testDouble(((Double) predObj).doubleValue())) {
- result = TruthValue.YES_NO_NULL;
- }
- } else if (predObj instanceof String || predObj instanceof Text ||
- predObj instanceof HiveDecimalWritable ||
- predObj instanceof BigDecimal) {
- if (bf.testString(predObj.toString())) {
- result = TruthValue.YES_NO_NULL;
- }
- } else if (predObj instanceof Timestamp) {
- if (bf.testLong(((Timestamp) predObj).getTime())) {
- result = TruthValue.YES_NO_NULL;
- }
- } else if (predObj instanceof TimestampWritable) {
- if (bf.testLong(((TimestampWritable) predObj).getTimestamp().getTime())) {
- result = TruthValue.YES_NO_NULL;
- }
- } else if (predObj instanceof Date) {
- if (bf.testLong(DateWritable.dateToDays((Date) predObj))) {
- result = TruthValue.YES_NO_NULL;
+ if (vector.noNulls || !vector.isNull[row]) {
+ DoubleWritable result;
+ if (previous == null || previous.getClass() != DoubleWritable.class) {
+ result = new DoubleWritable();
+ } else {
+ result = (DoubleWritable) previous;
}
+ result.set(((DoubleColumnVector) vector).vector[row]);
+ return result;
} else {
- // if the predicate object is null and if hasNull says there are no nulls then return NO
- if (predObj == null && !hasNull) {
- result = TruthValue.NO;
- } else {
- result = TruthValue.YES_NO_NULL;
- }
- }
-
- if (result == TruthValue.YES_NO_NULL && !hasNull) {
- result = TruthValue.YES_NO;
- }
-
- if (LOG.isDebugEnabled()) {
- LOG.debug("Bloom filter evaluation: " + result.toString());
+ return null;
}
-
- return result;
}
- private static Object getBaseObjectForComparison(PredicateLeaf.Type type, Object obj) {
- if (obj == null) {
- return null;
+ static Text nextString(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- switch (type) {
- case BOOLEAN:
- if (obj instanceof Boolean) {
- return obj;
- } else {
-          // will only be true if the string conversion yields "true"; all other
-          // values are considered false
- return Boolean.valueOf(obj.toString());
- }
- case DATE:
- if (obj instanceof Date) {
- return obj;
- } else if (obj instanceof String) {
- return Date.valueOf((String) obj);
- } else if (obj instanceof Timestamp) {
- return DateWritable.timeToDate(((Timestamp) obj).getTime() / 1000L);
- }
- // always string, but prevent the comparison to numbers (are they days/seconds/milliseconds?)
- break;
- case DECIMAL:
- if (obj instanceof Boolean) {
- return new HiveDecimalWritable(((Boolean) obj).booleanValue() ?
- HiveDecimal.ONE : HiveDecimal.ZERO);
- } else if (obj instanceof Integer) {
- return new HiveDecimalWritable(((Integer) obj).intValue());
- } else if (obj instanceof Long) {
- return new HiveDecimalWritable(((Long) obj));
- } else if (obj instanceof Float || obj instanceof Double ||
- obj instanceof String) {
- return new HiveDecimalWritable(obj.toString());
- } else if (obj instanceof BigDecimal) {
- return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) obj));
- } else if (obj instanceof HiveDecimal) {
- return new HiveDecimalWritable((HiveDecimal) obj);
- } else if (obj instanceof HiveDecimalWritable) {
- return obj;
- } else if (obj instanceof Timestamp) {
- return new HiveDecimalWritable(
- new Double(new TimestampWritable((Timestamp) obj).getDouble()).toString());
- }
- break;
- case FLOAT:
- if (obj instanceof Number) {
- // widening conversion
- return ((Number) obj).doubleValue();
- } else if (obj instanceof HiveDecimal) {
- return ((HiveDecimal) obj).doubleValue();
- } else if (obj instanceof String) {
- return Double.valueOf(obj.toString());
- } else if (obj instanceof Timestamp) {
- return new TimestampWritable((Timestamp)obj).getDouble();
- } else if (obj instanceof HiveDecimal) {
- return ((HiveDecimal) obj).doubleValue();
- } else if (obj instanceof BigDecimal) {
- return ((BigDecimal) obj).doubleValue();
- }
- break;
- case LONG:
- if (obj instanceof Number) {
- // widening conversion
- return ((Number) obj).longValue();
- } else if (obj instanceof HiveDecimal) {
- return ((HiveDecimal) obj).longValue();
- } else if (obj instanceof String) {
- return Long.valueOf(obj.toString());
- }
- break;
- case STRING:
- if (obj != null) {
- return (obj.toString());
- }
- break;
- case TIMESTAMP:
- if (obj instanceof Timestamp) {
- return obj;
- } else if (obj instanceof Integer) {
- return TimestampWritable.longToTimestamp(((Number) obj).longValue(), false);
- } else if (obj instanceof Float) {
- return TimestampWritable.doubleToTimestamp(((Float) obj).doubleValue());
- } else if (obj instanceof Double) {
- return TimestampWritable.doubleToTimestamp(((Double) obj).doubleValue());
- } else if (obj instanceof HiveDecimal) {
- return TimestampWritable.decimalToTimestamp((HiveDecimal) obj);
- } else if (obj instanceof HiveDecimalWritable) {
- return TimestampWritable.decimalToTimestamp(((HiveDecimalWritable) obj).getHiveDecimal());
- } else if (obj instanceof Date) {
- return new Timestamp(((Date) obj).getTime());
- }
- // float/double conversion to timestamp is interpreted as seconds whereas integer conversion
- // to timestamp is interpreted as milliseconds by default. The integer to timestamp casting
- // is also config driven. The filter operator changes its promotion based on config:
- // "int.timestamp.conversion.in.seconds". Disable PPD for integer cases.
- break;
- default:
- break;
+ if (vector.noNulls || !vector.isNull[row]) {
+ Text result;
+ if (previous == null || previous.getClass() != Text.class) {
+ result = new Text();
+ } else {
+ result = (Text) previous;
+ }
+ BytesColumnVector bytes = (BytesColumnVector) vector;
+ result.set(bytes.vector[row], bytes.start[row], bytes.length[row]);
+ return result;
+ } else {
+ return null;
}
-
- throw new IllegalArgumentException(String.format(
- "ORC SARGS could not convert from %s to %s", obj == null ? "(null)" : obj.getClass()
- .getSimpleName(), type));
}
- public static class SargApplier {
- public final static boolean[] READ_ALL_RGS = null;
- public final static boolean[] READ_NO_RGS = new boolean[0];
-
- private final SearchArgument sarg;
- private final List<PredicateLeaf> sargLeaves;
- private final int[] filterColumns;
- private final long rowIndexStride;
- // same as the above array, but indices are set to true
- private final boolean[] sargColumns;
-
- public SargApplier(SearchArgument sarg, String[] columnNames, long rowIndexStride,
- List<OrcProto.Type> types, int includedCount) {
- this.sarg = sarg;
- sargLeaves = sarg.getLeaves();
- filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, columnNames, 0);
- this.rowIndexStride = rowIndexStride;
- // included will not be null, row options will fill the array with trues if null
- sargColumns = new boolean[includedCount];
- for (int i : filterColumns) {
- // filter columns may have -1 as index which could be partition column in SARG.
- if (i > 0) {
- sargColumns[i] = true;
- }
- }
+ static HiveCharWritable nextChar(ColumnVector vector,
+ int row,
+ int size,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
-
- /**
- * Pick the row groups that we need to load from the current stripe.
- *
- * @return an array with a boolean for each row group or null if all of the
- * row groups must be read.
- * @throws IOException
- */
- public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes,
- OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException {
- long rowsInStripe = stripe.getNumberOfRows();
- int groupsInStripe = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
- boolean[] result = new boolean[groupsInStripe]; // TODO: avoid alloc?
- TruthValue[] leafValues = new TruthValue[sargLeaves.size()];
- boolean hasSelected = false, hasSkipped = false;
- for (int rowGroup = 0; rowGroup < result.length; ++rowGroup) {
- for (int pred = 0; pred < leafValues.length; ++pred) {
- int columnIx = filterColumns[pred];
- if (columnIx != -1) {
- if (indexes[columnIx] == null) {
- throw new AssertionError("Index is not populated for " + columnIx);
- }
- OrcProto.RowIndexEntry entry = indexes[columnIx].getEntry(rowGroup);
- if (entry == null) {
- throw new AssertionError("RG is not populated for " + columnIx + " rg " + rowGroup);
- }
- OrcProto.ColumnStatistics stats = entry.getStatistics();
- OrcProto.BloomFilter bf = null;
- if (bloomFilterIndices != null && bloomFilterIndices[filterColumns[pred]] != null) {
- bf = bloomFilterIndices[filterColumns[pred]].getBloomFilter(rowGroup);
- }
- leafValues[pred] = evaluatePredicateProto(stats, sargLeaves.get(pred), bf);
- if (LOG.isTraceEnabled()) {
- LOG.trace("Stats = " + stats);
- LOG.trace("Setting " + sargLeaves.get(pred) + " to " + leafValues[pred]);
- }
- } else {
- // the column is a virtual column
- leafValues[pred] = TruthValue.YES_NO_NULL;
- }
- }
- result[rowGroup] = sarg.evaluate(leafValues).isNeeded();
- hasSelected = hasSelected || result[rowGroup];
- hasSkipped = hasSkipped || (!result[rowGroup]);
- if (LOG.isDebugEnabled()) {
- LOG.debug("Row group " + (rowIndexStride * rowGroup) + " to " +
- (rowIndexStride * (rowGroup + 1) - 1) + " is " +
- (result[rowGroup] ? "" : "not ") + "included.");
- }
+ if (vector.noNulls || !vector.isNull[row]) {
+ HiveCharWritable result;
+ if (previous == null || previous.getClass() != HiveCharWritable.class) {
+ result = new HiveCharWritable();
+ } else {
+ result = (HiveCharWritable) previous;
}
-
- return hasSkipped ? ((hasSelected || !returnNone) ? result : READ_NO_RGS) : READ_ALL_RGS;
+ BytesColumnVector bytes = (BytesColumnVector) vector;
+ result.set(bytes.toString(row), size);
+ return result;
+ } else {
+ return null;
}
}
- /**
- * Pick the row groups that we need to load from the current stripe.
- *
- * @return an array with a boolean for each row group or null if all of the
- * row groups must be read.
- * @throws IOException
- */
- protected boolean[] pickRowGroups() throws IOException {
- // if we don't have a sarg or indexes, we read everything
- if (sargApp == null) {
+ static HiveVarcharWritable nextVarchar(ColumnVector vector,
+ int row,
+ int size,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
+ }
+ if (vector.noNulls || !vector.isNull[row]) {
+ HiveVarcharWritable result;
+ if (previous == null || previous.getClass() != HiveVarcharWritable.class) {
+ result = new HiveVarcharWritable();
+ } else {
+ result = (HiveVarcharWritable) previous;
+ }
+ BytesColumnVector bytes = (BytesColumnVector) vector;
+ result.set(bytes.toString(row), size);
+ return result;
+ } else {
return null;
}
- readRowIndex(currentStripe, included, sargApp.sargColumns);
- return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, bloomFilterIndices, false);
}
- private void clearStreams() {
- // explicit close of all streams to de-ref ByteBuffers
- for (InStream is : streams.values()) {
- is.close();
+ static BytesWritable nextBinary(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- if (bufferChunks != null) {
- if (dataReader.isTrackingDiskRanges()) {
- for (DiskRangeList range = bufferChunks; range != null; range = range.next) {
- if (!(range instanceof BufferChunk)) {
- continue;
- }
- dataReader.releaseBuffer(((BufferChunk) range).getChunk());
- }
+ if (vector.noNulls || !vector.isNull[row]) {
+ BytesWritable result;
+ if (previous == null || previous.getClass() != BytesWritable.class) {
+ result = new BytesWritable();
+ } else {
+ result = (BytesWritable) previous;
}
+ BytesColumnVector bytes = (BytesColumnVector) vector;
+ result.set(bytes.vector[row], bytes.start[row], bytes.length[row]);
+ return result;
+ } else {
+ return null;
}
- bufferChunks = null;
- streams.clear();
}
- /**
- * Read the current stripe into memory.
- *
- * @throws IOException
- */
- private void readStripe() throws IOException {
- StripeInformation stripe = beginReadStripe();
- includedRowGroups = pickRowGroups();
-
- // move forward to the first unskipped row
- if (includedRowGroups != null) {
- while (rowInStripe < rowCountInStripe &&
- !includedRowGroups[(int) (rowInStripe / rowIndexStride)]) {
- rowInStripe = Math.min(rowCountInStripe, rowInStripe + rowIndexStride);
- }
+ static HiveDecimalWritable nextDecimal(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
-
- // if we haven't skipped the whole stripe, read the data
- if (rowInStripe < rowCountInStripe) {
- // if we aren't projecting columns or filtering rows, just read it all
- if (included == null && includedRowGroups == null) {
- readAllDataStreams(stripe);
+ if (vector.noNulls || !vector.isNull[row]) {
+ HiveDecimalWritable result;
+ if (previous == null || previous.getClass() != HiveDecimalWritable.class) {
+ result = new HiveDecimalWritable();
} else {
- readPartialDataStreams(stripe);
- }
- reader.startStripe(streams, stripeFooter);
- // if we skipped the first row group, move the pointers forward
- if (rowInStripe != 0) {
- seekToRowEntry(reader, (int) (rowInStripe / rowIndexStride));
+ result = (HiveDecimalWritable) previous;
}
+ result.set(((DecimalColumnVector) vector).vector[row]);
+ return result;
+ } else {
+ return null;
}
}
- private StripeInformation beginReadStripe() throws IOException {
- StripeInformation stripe = stripes.get(currentStripe);
- stripeFooter = readStripeFooter(stripe);
- clearStreams();
- // setup the position in the stripe
- rowCountInStripe = stripe.getNumberOfRows();
- rowInStripe = 0;
- rowBaseInStripe = 0;
- for (int i = 0; i < currentStripe; ++i) {
- rowBaseInStripe += stripes.get(i).getNumberOfRows();
+ static DateWritable nextDate(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- // reset all of the indexes
- for (int i = 0; i < indexes.length; ++i) {
- indexes[i] = null;
+ if (vector.noNulls || !vector.isNull[row]) {
+ DateWritable result;
+ if (previous == null || previous.getClass() != DateWritable.class) {
+ result = new DateWritable();
+ } else {
+ result = (DateWritable) previous;
+ }
+ int date = (int) ((LongColumnVector) vector).vector[row];
+ result.set(date);
+ return result;
+ } else {
+ return null;
}
- return stripe;
}
- private void readAllDataStreams(StripeInformation stripe) throws IOException {
- long start = stripe.getIndexLength();
- long end = start + stripe.getDataLength();
- // explicitly trigger 1 big read
- DiskRangeList toRead = new DiskRangeList(start, end);
- bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
- List<OrcProto.Stream> streamDescriptions = stripeFooter.getStreamsList();
- createStreams(streamDescriptions, bufferChunks, null, codec, bufferSize, streams);
- }
-
- /**
- * Plan the ranges of the file that we need to read given the list of
- * columns and row groups.
- *
- * @param streamList the list of streams available
- * @param indexes the indexes that have been loaded
- * @param includedColumns which columns are needed
- * @param includedRowGroups which row groups are needed
- * @param isCompressed does the file have generic compression
- * @param encodings the encodings for each column
- * @param types the types of the columns
- * @param compressionSize the compression block size
- * @return the list of disk ranges that will be loaded
- */
- static DiskRangeList planReadPartialDataStreams
- (List<OrcProto.Stream> streamList,
- OrcProto.RowIndex[] indexes,
- boolean[] includedColumns,
- boolean[] includedRowGroups,
- boolean isCompressed,
- List<OrcProto.ColumnEncoding> encodings,
- List<OrcProto.Type> types,
- int compressionSize,
- boolean doMergeBuffers) {
- long offset = 0;
- // figure out which columns have a present stream
- boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types);
- CreateHelper list = new CreateHelper();
- for (OrcProto.Stream stream : streamList) {
- long length = stream.getLength();
- int column = stream.getColumn();
- OrcProto.Stream.Kind streamKind = stream.getKind();
- // since stream kind is optional, first check if it exists
- if (stream.hasKind() &&
- (StreamName.getArea(streamKind) == StreamName.Area.DATA) &&
- (column < includedColumns.length && includedColumns[column])) {
- // if we aren't filtering or it is a dictionary, load it.
- if (includedRowGroups == null
- || RecordReaderUtils.isDictionary(streamKind, encodings.get(column))) {
- RecordReaderUtils.addEntireStreamToRanges(offset, length, list, doMergeBuffers);
- } else {
- RecordReaderUtils.addRgFilteredStreamToRanges(stream, includedRowGroups,
- isCompressed, indexes[column], encodings.get(column), types.get(column),
- compressionSize, hasNull[column], offset, length, list, doMergeBuffers);
- }
+ static TimestampWritable nextTimestamp(ColumnVector vector,
+ int row,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
+ }
+ if (vector.noNulls || !vector.isNull[row]) {
+ TimestampWritable result;
+ if (previous == null || previous.getClass() != TimestampWritable.class) {
+ result = new TimestampWritable();
+ } else {
+ result = (TimestampWritable) previous;
}
- offset += length;
+ TimestampColumnVector tcv = (TimestampColumnVector) vector;
+ result.setInternal(tcv.time[row], tcv.nanos[row]);
+ return result;
+ } else {
+ return null;
}
- return list.extract();
}
- void createStreams(List<OrcProto.Stream> streamDescriptions,
- DiskRangeList ranges,
- boolean[] includeColumn,
- CompressionCodec codec,
- int bufferSize,
- Map<StreamName, InStream> streams) throws IOException {
- long streamOffset = 0;
- for (OrcProto.Stream streamDesc : streamDescriptions) {
- int column = streamDesc.getColumn();
- if ((includeColumn != null &&
- (column < included.length && !includeColumn[column])) ||
- streamDesc.hasKind() &&
- (StreamName.getArea(streamDesc.getKind()) != StreamName.Area.DATA)) {
- streamOffset += streamDesc.getLength();
- continue;
+ static OrcStruct nextStruct(ColumnVector vector,
+ int row,
+ TypeDescription schema,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
+ }
+ if (vector.noNulls || !vector.isNull[row]) {
+ OrcStruct result;
+ List<TypeDescription> childrenTypes = schema.getChildren();
+ int numChildren = childrenTypes.size();
+ if (previous == null || previous.getClass() != OrcStruct.class) {
+ result = new OrcStruct(numChildren);
+ } else {
+ result = (OrcStruct) previous;
+ result.setNumFields(numChildren);
+ }
+ StructColumnVector struct = (StructColumnVector) vector;
+ for(int f=0; f < numChildren; ++f) {
+ result.setFieldValue(f, nextValue(struct.fields[f], row,
+ childrenTypes.get(f), result.getFieldValue(f)));
}
- List<DiskRange> buffers = RecordReaderUtils.getStreamBuffers(
- ranges, streamOffset, streamDesc.getLength());
- StreamName name = new StreamName(column, streamDesc.getKind());
- streams.put(name, InStream.create(name.toString(), buffers,
- streamDesc.getLength(), codec, bufferSize));
- streamOffset += streamDesc.getLength();
+ return result;
+ } else {
+ return null;
}
}
- private void readPartialDataStreams(StripeInformation stripe) throws IOException {
- List<OrcProto.Stream> streamList = stripeFooter.getStreamsList();
- DiskRangeList toRead = planReadPartialDataStreams(streamList,
- indexes, included, includedRowGroups, codec != null,
- stripeFooter.getColumnsList(), types, bufferSize, true);
- if (LOG.isDebugEnabled()) {
- LOG.debug("chunks = " + RecordReaderUtils.stringifyDiskRanges(toRead));
+ static OrcUnion nextUnion(ColumnVector vector,
+ int row,
+ TypeDescription schema,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
- if (LOG.isDebugEnabled()) {
- LOG.debug("merge = " + RecordReaderUtils.stringifyDiskRanges(bufferChunks));
+ if (vector.noNulls || !vector.isNull[row]) {
+ OrcUnion result;
+ List<TypeDescription> childrenTypes = schema.getChildren();
+ if (previous == null || previous.getClass() != OrcUnion.class) {
+ result = new OrcUnion();
+ } else {
+ result = (OrcUnion) previous;
+ }
+ UnionColumnVector union = (UnionColumnVector) vector;
+ byte tag = (byte) union.tags[row];
+ result.set(tag, nextValue(union.fields[tag], row, childrenTypes.get(tag),
+ result.getObject()));
+ return result;
+ } else {
+ return null;
}
-
- createStreams(streamList, bufferChunks, included, codec, bufferSize, streams);
- }
-
- @Override
- public boolean hasNext() throws IOException {
- return rowInStripe < rowCountInStripe;
}
- /**
- * Read the next stripe until we find a row that we don't skip.
- *
- * @throws IOException
- */
- private void advanceStripe() throws IOException {
- rowInStripe = rowCountInStripe;
- while (rowInStripe >= rowCountInStripe &&
- currentStripe < stripes.size() - 1) {
- currentStripe += 1;
- readStripe();
+ static ArrayList<Object> nextList(ColumnVector vector,
+ int row,
+ TypeDescription schema,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- }
-
- /**
- * Skip over rows that we aren't selecting, so that the next row is
- * one that we will read.
- *
- * @param nextRow the row we want to go to
- * @throws IOException
- */
- private boolean advanceToNextRow(
- TreeReaderFactory.TreeReader reader, long nextRow, boolean canAdvanceStripe)
- throws IOException {
- long nextRowInStripe = nextRow - rowBaseInStripe;
- // check for row skipping
- if (rowIndexStride != 0 &&
- includedRowGroups != null &&
- nextRowInStripe < rowCountInStripe) {
- int rowGroup = (int) (nextRowInStripe / rowIndexStride);
- if (!includedRowGroups[rowGroup]) {
- while (rowGroup < includedRowGroups.length && !includedRowGroups[rowGroup]) {
- rowGroup += 1;
- }
- if (rowGroup >= includedRowGroups.length) {
- if (canAdvanceStripe) {
- advanceStripe();
- }
- return canAdvanceStripe;
+ if (vector.noNulls || !vector.isNull[row]) {
+ ArrayList<Object> result;
+ if (previous == null || previous.getClass() != ArrayList.class) {
+ result = new ArrayList<>();
+ } else {
+ result = (ArrayList<Object>) previous;
+ }
+ ListColumnVector list = (ListColumnVector) vector;
+ int length = (int) list.lengths[row];
+ int offset = (int) list.offsets[row];
+ result.ensureCapacity(length);
+ int oldLength = result.size();
+ int idx = 0;
+ TypeDescription childType = schema.getChildren().get(0);
+ while (idx < length && idx < oldLength) {
+ result.set(idx, nextValue(list.child, offset + idx, childType,
+ result.get(idx)));
+ idx += 1;
+ }
+ if (length < oldLength) {
+ result.subList(length,result.size()).clear();
+ } else if (oldLength < length) {
+ while (idx < length) {
+ result.add(nextValue(list.child, offset + idx, childType, null));
+ idx += 1;
}
- nextRowInStripe = Math.min(rowCountInStripe, rowGroup * rowIndexStride);
}
+ return result;
+ } else {
+ return null;
}
- if (nextRowInStripe >= rowCountInStripe) {
- if (canAdvanceStripe) {
- advanceStripe();
- }
- return canAdvanceStripe;
+ }
+
+ static HashMap<Object,Object> nextMap(ColumnVector vector,
+ int row,
+ TypeDescription schema,
+ Object previous) {
+ if (vector.isRepeating) {
+ row = 0;
}
- if (nextRowInStripe != rowInStripe) {
- if (rowIndexStride != 0) {
- int rowGroup = (int) (nextRowInStripe / rowIndexStride);
- seekToRowEntry(reader, rowGroup);
- reader.skipRows(nextRowInStripe - rowGroup * rowIndexStride);
+ if (vector.noNulls || !vector.isNull[row]) {
+ MapColumnVector map = (MapColumnVector) vector;
+ int length = (int) map.lengths[row];
+ int offset = (int) map.offsets[row];
+ TypeDescription keyType = schema.getChildren().get(0);
+ TypeDescription valueType = schema.getChildren().get(1);
+ HashMap<Object,Object> result;
+ if (previous == null || previous.getClass() != HashMap.class) {
+ result = new HashMap<Object,Object>(length);
} else {
- reader.skipRows(nextRowInStripe - rowInStripe);
+ result = (HashMap<Object,Object>) previous;
+ // I couldn't think of a good way to reuse the keys and value objects
+ // without even more allocations, so take the easy and safe approach.
+ result.clear();
}
- rowInStripe = nextRowInStripe;
+ for(int e=0; e < length; ++e) {
+ result.put(nextValue(map.keys, e + offset, keyType, null),
+ nextValue(map.values, e + offset, valueType, null));
+ }
+ return result;
+ } else {
+ return null;
}
- return true;
}
- @Override
- public Object next(Object previous) throws IOException {
- try {
- final Object result = reader.next(previous);
- // find the next row
- rowInStripe += 1;
- advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true);
- return result;
- } catch (IOException e) {
- // Rethrow exception with file name in log message
- throw new IOException("Error reading file: " + path, e);
+ static Object nextValue(ColumnVector vector,
+ int row,
+ TypeDescription schema,
+ Object previous) {
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ return nextBoolean(vector, row, previous);
+ case BYTE:
+ return nextByte(vector, row, previous);
+ case SHORT:
+ return nextShort(vector, row, previous);
+ case INT:
+ return nextInt(vector, row, previous);
+ case LONG:
+ return nextLong(vector, row, previous);
+ case FLOAT:
+ return nextFloat(vector, row, previous);
+ case DOUBLE:
+ return nextDouble(vector, row, previous);
+ case STRING:
+ return nextString(vector, row, previous);
+ case CHAR:
+ return nextChar(vector, row, schema.getMaxLength(), previous);
+ case VARCHAR:
+ return nextVarchar(vector, row, schema.getMaxLength(), previous);
+ case BINARY:
+ return nextBinary(vector, row, previous);
+ case DECIMAL:
+ return nextDecimal(vector, row, previous);
+ case DATE:
+ return nextDate(vector, row, previous);
+ case TIMESTAMP:
+ return nextTimestamp(vector, row, previous);
+ case STRUCT:
+ return nextStruct(vector, row, schema, previous);
+ case UNION:
+ return nextUnion(vector, row, schema, previous);
+ case LIST:
+ return nextList(vector, row, schema, previous);
+ case MAP:
+ return nextMap(vector, row, schema, previous);
+ default:
+ throw new IllegalArgumentException("Unknown type " + schema);
}
}
- @Override
- public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
- try {
- if (rowInStripe >= rowCountInStripe) {
- currentStripe += 1;
- if (currentStripe >= stripes.size()) {
- batch.size = 0;
- return false;
+ /* Routines for copying between VectorizedRowBatches */
+
+ void copyLongColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ LongColumnVector lsource = (LongColumnVector) source;
+ LongColumnVector ldest = (LongColumnVector) destination;
+ ldest.isRepeating = lsource.isRepeating;
+ ldest.noNulls = lsource.noNulls;
+ if (source.isRepeating) {
+ ldest.isNull[0] = lsource.isNull[0];
+ ldest.vector[0] = lsource.vector[0];
+ } else {
+ if (!lsource.noNulls) {
+ for(int r=0; r < length; ++r) {
+ ldest.isNull[r] = lsource.isNull[sourceOffset + r];
+ ldest.vector[r] = lsource.vector[sourceOffset + r];
+ }
+ } else {
+ for (int r = 0; r < length; ++r) {
+ ldest.vector[r] = lsource.vector[sourceOffset + r];
}
- readStripe();
}
-
- int batchSize = computeBatchSize(batch.getMaxSize());
-
- rowInStripe += batchSize;
- reader.setVectorColumnCount(batch.getDataColumnCount());
- reader.nextBatch(batch, batchSize);
-
- batch.size = (int) batchSize;
- batch.selectedInUse = false;
- advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true);
- return batch.size != 0;
- } catch (IOException e) {
- // Rethrow exception with file name in log message
- throw new IOException("Error reading file: " + path, e);
}
}
- private int computeBatchSize(long targetBatchSize) {
- final int batchSize;
- // In case of PPD, batch size should be aware of row group boundaries. If only a subset of row
- // groups are selected then marker position is set to the end of range (subset of row groups
-    // within stripe). Batch size computed out of the marker position makes sure that batch size
-    // is aware of row group boundaries and will not cause overflow when reading rows;
-    // an illustration of this case is here: https://issues.apache.org/jira/browse/HIVE-6287
- if (rowIndexStride != 0 && includedRowGroups != null && rowInStripe < rowCountInStripe) {
- int startRowGroup = (int) (rowInStripe / rowIndexStride);
- if (!includedRowGroups[startRowGroup]) {
- while (startRowGroup < includedRowGroups.length && !includedRowGroups[startRowGroup]) {
- startRowGroup += 1;
+ void copyDoubleColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ DoubleColumnVector castedSource = (DoubleColumnVector) source;
+ DoubleColumnVector castedDestination = (DoubleColumnVector) destination;
+ if (source.isRepeating) {
+ castedDestination.isRepeating = true;
+ castedDestination.noNulls = castedSource.noNulls;
+ castedDestination.isNull[0] = castedSource.isNull[0];
+ castedDestination.vector[0] = castedSource.vector[0];
+ } else {
+ if (!castedSource.noNulls) {
+        castedDestination.noNulls = false;
+ for(int r=0; r < length; ++r) {
+ castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
}
}
-
- int endRowGroup = startRowGroup;
- while (endRowGroup < includedRowGroups.length && includedRowGroups[endRowGroup]) {
- endRowGroup += 1;
- }
-
- final long markerPosition =
- (endRowGroup * rowIndexStride) < rowCountInStripe ? (endRowGroup * rowIndexStride)
- : rowCountInStripe;
- batchSize = (int) Math.min(targetBatchSize, (markerPosition - rowInStripe));
-
- if (isLogDebugEnabled && batchSize < targetBatchSize) {
- LOG.debug("markerPosition: " + markerPosition + " batchSize: " + batchSize);
+ for(int r=0; r < length; ++r) {
+ castedDestination.vector[r] = castedSource.vector[sourceOffset + r];
}
- } else {
- batchSize = (int) Math.min(targetBatchSize, (rowCountInStripe - rowInStripe));
}
- return batchSize;
- }
-
- @Override
- public void close() throws IOException {
- clearStreams();
- dataReader.close();
- }
-
- @Override
- public long getRowNumber() {
- return rowInStripe + rowBaseInStripe + firstRow;
- }
-
- /**
-   * Return the fraction of rows that have been read from the selected
-   * section of the file.
- *
- * @return fraction between 0.0 and 1.0 of rows consumed
- */
- @Override
- public float getProgress() {
- return ((float) rowBaseInStripe + rowInStripe) / totalRowCount;
}
- private int findStripe(long rowNumber) {
- for (int i = 0; i < stripes.size(); i++) {
- StripeInformation stripe = stripes.get(i);
- if (stripe.getNumberOfRows() > rowNumber) {
- return i;
+ void copyTimestampColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ TimestampColumnVector castedSource = (TimestampColumnVector) source;
+ TimestampColumnVector castedDestination = (TimestampColumnVector) destination;
+ castedDestination.isRepeating = castedSource.isRepeating;
+ castedDestination.noNulls = castedSource.noNulls;
+ if (source.isRepeating) {
+ castedDestination.isNull[0] = castedSource.isNull[0];
+ castedDestination.time[0] = castedSource.time[0];
+ castedDestination.nanos[0] = castedSource.nanos[0];
+ } else {
+      if (!castedSource.noNulls) {
+ for(int r=0; r < length; ++r) {
+ castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
+ castedDestination.time[r] = castedSource.time[sourceOffset + r];
+ castedDestination.nanos[r] = castedSource.nanos[sourceOffset + r];
+ }
+ } else {
+ for (int r = 0; r < length; ++r) {
+ castedDestination.time[r] = castedSource.time[sourceOffset + r];
+ castedDestination.nanos[r] = castedSource.nanos[sourceOffset + r];
+ }
}
- rowNumber -= stripe.getNumberOfRows();
}
- throw new IllegalArgumentException("Seek after the end of reader range");
}
- OrcIndex readRowIndex(
- int stripeIndex, boolean[] included, boolean[] sargColumns) throws IOException {
- return readRowIndex(stripeIndex, included, null, null, sargColumns);
+ void copyDecimalColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ DecimalColumnVector castedSource = (DecimalColumnVector) source;
+ DecimalColumnVector castedDestination = (DecimalColumnVector) destination;
+ castedDestination.isRepeating = castedSource.isRepeating;
+ castedDestination.noNulls = castedSource.noNulls;
+ if (source.isRepeating) {
+ castedDestination.isNull[0] = castedSource.isNull[0];
+ if (!castedSource.isNull[0]) {
+ castedDestination.set(0, castedSource.vector[0]);
+ }
+ } else {
+ if (!castedSource.noNulls) {
+ for(int r=0; r < length; ++r) {
+ castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
+ if (!castedDestination.isNull[r]) {
+            castedDestination.set(r, castedSource.vector[sourceOffset + r]);
+ }
+ }
+ } else {
+ for (int r = 0; r < length; ++r) {
+          castedDestination.set(r, castedSource.vector[sourceOffset + r]);
+ }
+ }
+ }
}
- OrcIndex readRowIndex(int stripeIndex, boolean[] included, OrcProto.RowIndex[] indexes,
- OrcProto.BloomFilterIndex[] bloomFilterIndex, boolean[] sargColumns) throws IOException {
- StripeInformation stripe = stripes.get(stripeIndex);
- OrcProto.StripeFooter stripeFooter = null;
- // if this is the current stripe, use the cached objects.
- if (stripeIndex == currentStripe) {
- stripeFooter = this.stripeFooter;
- indexes = indexes == null ? this.indexes : indexes;
- bloomFilterIndex = bloomFilterIndex == null ? this.bloomFilterIndices : bloomFilterIndex;
- sargColumns = sargColumns == null ?
- (sargApp == null ? null : sargApp.sargColumns) : sargColumns;
+ void copyBytesColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ BytesColumnVector castedSource = (BytesColumnVector) source;
+ BytesColumnVector castedDestination = (BytesColumnVector) destination;
+ castedDestination.isRepeating = castedSource.isRepeating;
+ castedDestination.noNulls = castedSource.noNulls;
+ if (source.isRepeating) {
+ castedDestination.isNull[0] = castedSource.isNull[0];
+ if (!castedSource.isNull[0]) {
+ castedDestination.setVal(0, castedSource.vector[0],
+ castedSource.start[0], castedSource.length[0]);
+ }
+ } else {
+ if (!castedSource.noNulls) {
+ for(int r=0; r < length; ++r) {
+ castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
+ if (!castedDestination.isNull[r]) {
+ castedDestination.setVal(r, castedSource.vector[sourceOffset + r],
+ castedSource.start[sourceOffset + r],
+ castedSource.length[sourceOffset + r]);
+ }
+ }
+ } else {
+ for (int r = 0; r < length; ++r) {
+ castedDestination.setVal(r, castedSource.vector[sourceOffset + r],
+ castedSource.start[sourceOffset + r],
+ castedSource.length[sourceOffset + r]);
+ }
+ }
}
- return dataReader.readRowIndex(stripe, stripeFooter, included, indexes,
- sargColumns, bloomFilterIndex);
}
- private void seekToRowEntry(TreeReaderFactory.TreeReader reader, int rowEntry)
- throws IOException {
- PositionProvider[] index = new PositionProvider[indexes.length];
- for (int i = 0; i < indexes.length; ++i) {
- if (indexes[i] != null) {
- index[i] = new PositionProviderImpl(indexes[i].getEntry(rowEntry));
+ void copyStructColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ StructColumnVector castedSource = (StructColumnVector) source;
+ StructColumnVector castedDestination = (StructColumnVector) destination;
+ castedDestination.isRepeating = castedSource.isRepeating;
+ castedDestination.noNulls = castedSource.noNulls;
+ if (source.isRepeating) {
+ castedDestination.isNull[0] = castedSource.isNull[0];
+      for(int c=0; c < castedSource.fields.length; ++c) {
+ copyColumn(castedDestination.fields[c], castedSource.fields[c], 0, 1);
+ }
+    } else {
+      if (!castedSource.noNulls) {
+        for (int r = 0; r < length; ++r) {
+          castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
+        }
+      }
+      // copy the children whether or not the struct itself has nulls
+      for (int c = 0; c < castedSource.fields.length; ++c) {
+        copyColumn(castedDestination.fields[c], castedSource.fields[c],
+            sourceOffset, length);
+      }
+    }
- reader.seek(index);
}
- @Override
- public void seekToRow(long rowNumber) throws IOException {
- if (rowNumber < 0) {
- throw new IllegalArgumentException("Seek to a negative row number " +
- rowNumber);
- } else if (rowNumber < firstRow) {
- throw new IllegalArgumentException("Seek before reader range " +
- rowNumber);
+ void copyUnionColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ UnionColumnVector castedSource = (UnionColumnVector) source;
+ UnionColumnVector castedDestination = (UnionColumnVector) destination;
+ castedDestination.isRepeating = castedSource.isRepeating;
+ castedDestination.noNulls = castedSource.noNulls;
+ if (source.isRepeating) {
+ castedDestination.isNull[0] = castedSource.isNull[0];
+ int tag = castedSource.tags[0];
+ castedDestination.tags[0] = tag;
+ if (!castedDestination.isNull[0]) {
+ copyColumn(castedDestination.fields[tag], castedSource.fields[tag], 0,
+ 1);
+ }
+ } else {
+ if (!castedSource.noNulls) {
+ for (int r = 0; r < length; ++r) {
+ castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
+ castedDestination.tags[r] = castedSource.tags[sourceOffset + r];
+ }
+ } else {
+ for(int r=0; r < length; ++r) {
+ castedDestination.tags[r] = castedSource.tags[sourceOffset + r];
+ }
+ }
+      for(int c=0; c < castedSource.fields.length; ++c) {
+ copyColumn(castedDestination.fields[c], castedSource.fields[c],
+ sourceOffset, length);
+ }
}
- // convert to our internal form (rows from the beginning of slice)
- rowNumber -= firstRow;
+ }
- // move to the right stripe
- int rightStripe = findStripe(rowNumber);
- if (rightStripe != currentStripe) {
- currentStripe = rightStripe;
- readStripe();
+ void copyListColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ ListColumnVector castedSource = (ListColumnVector) source;
+ ListColumnVector castedDestination = (ListColumnVector) destination;
+    castedDestination.isRepeating = castedSource.isRepeating;
+ castedDestination.noNulls = castedSource.noNulls;
+ if (source.isRepeating) {
+ castedDestination.isNull[0] = castedSource.isNull[0];
+ castedDestination.offsets[0] = 0;
+ castedDestination.lengths[0] = castedSource.lengths[0];
+ copyColumn(castedDestination.child, castedSource.child,
+ (int) castedSource.offsets[0], (int) castedSource.lengths[0]);
+ } else {
+ if (!castedSource.noNulls) {
+ for (int r = 0; r < length; ++r) {
+ castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
+ }
+ }
+ int minOffset = Integer.MAX_VALUE;
+ int maxOffset = Integer.MIN_VALUE;
+ for(int r=0; r < length; ++r) {
+ int childOffset = (int) castedSource.offsets[r + sourceOffset];
+ int childLength = (int) castedSource.lengths[r + sourceOffset];
+ castedDestination.offsets[r] = childOffset;
+ castedDestination.lengths[r] = childLength;
+ minOffset = Math.min(minOffset, childOffset);
+ maxOffset = Math.max(maxOffset, childOffset + childLength);
+ }
+      if (minOffset <= maxOffset) {
+        // maxOffset is one past the last referenced child element, so the
+        // number of children to copy is maxOffset - minOffset; rebase the
+        // offsets so that they point into the copied child range.
+        castedDestination.childCount = maxOffset - minOffset;
+        for(int r=0; r < length; ++r) {
+          castedDestination.offsets[r] -= minOffset;
+        }
+        copyColumn(castedDestination.child, castedSource.child,
+            minOffset, castedDestination.childCount);
+      } else {
+        castedDestination.childCount = 0;
+      }
+ }
+ }
+
+ void copyMapColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ MapColumnVector castedSource = (MapColumnVector) source;
+ MapColumnVector castedDestination = (MapColumnVector) destination;
+    castedDestination.isRepeating = castedSource.isRepeating;
+ castedDestination.noNulls = castedSource.noNulls;
+ if (source.isRepeating) {
+ castedDestination.isNull[0] = castedSource.isNull[0];
+ castedDestination.offsets[0] = 0;
+ castedDestination.lengths[0] = castedSource.lengths[0];
+ copyColumn(castedDestination.keys, castedSource.keys,
+ (int) castedSource.offsets[0], (int) castedSource.lengths[0]);
+ copyColumn(castedDestination.values, castedSource.values,
+ (int) castedSource.offsets[0], (int) castedSource.lengths[0]);
+ } else {
+ if (!castedSource.noNulls) {
+ for (int r = 0; r < length; ++r) {
+ castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
+ }
+ }
+ int minOffset = Integer.MAX_VALUE;
+ int maxOffset = Integer.MIN_VALUE;
+ for(int r=0; r < length; ++r) {
+ int childOffset = (int) castedSource.offsets[r + sourceOffset];
+ int childLength = (int) castedSource.lengths[r + sourceOffset];
+ castedDestination.offsets[r] = childOffset;
+ castedDestination.lengths[r] = childLength;
+ minOffset = Math.min(minOffset, childOffset);
+ maxOffset = Math.max(maxOffset, childOffset + childLength);
+ }
+      if (minOffset <= maxOffset) {
+        // maxOffset is one past the last referenced child element, so the
+        // number of entries to copy is maxOffset - minOffset; rebase the
+        // offsets so that they point into the copied range.
+        castedDestination.childCount = maxOffset - minOffset;
+        for(int r=0; r < length; ++r) {
+          castedDestination.offsets[r] -= minOffset;
+        }
+        copyColumn(castedDestination.keys, castedSource.keys,
+            minOffset, castedDestination.childCount);
+        copyColumn(castedDestination.values, castedSource.values,
+            minOffset, castedDestination.childCount);
+      } else {
+        castedDestination.childCount = 0;
+      }
}
- readRowIndex(currentStripe, included, sargApp == null ? null : sargApp.sargColumns);
-
- // if we aren't to the right row yet, advance in the stripe.
- advanceToNextRow(reader, rowNumber, true);
}
- private static final String TRANSLATED_SARG_SEPARATOR = "_";
- public static String encodeTranslatedSargColumn(int rootColumn, Integer indexInSourceTable) {
- return rootColumn + TRANSLATED_SARG_SEPARATOR
- + ((indexInSourceTable == null) ? -1 : indexInSourceTable);
+ void copyColumn(ColumnVector destination,
+ ColumnVector source,
+ int sourceOffset,
+ int length) {
+ if (source.getClass() == LongColumnVector.class) {
+ copyLongColumn(destination, source, sourceOffset, length);
+ } else if (source.getClass() == DoubleColumnVector.class) {
+ copyDoubleColumn(destination, source, sourceOffset, length);
+ } else if (source.getClass() == BytesColumnVector.class) {
+ copyBytesColumn(destination, source, sourceOffset, length);
+ } else if (source.getClass() == TimestampColumnVector.class) {
+ copyTimestampColumn(destination, source, sourceOffset, length);
+ } else if (source.getClass() == DecimalColumnVector.class) {
+ copyDecimalColumn(destination, source, sourceOffset, length);
+ } else if (source.getClass() == StructColumnVector.class) {
+ copyStructColumn(destination, source, sourceOffset, length);
+ } else if (source.getClass() == UnionColumnVector.class) {
+ copyUnionColumn(destination, source, sourceOffset, length);
+ } else if (source.getClass() == ListColumnVector.class) {
+ copyListColumn(destination, source, sourceOffset, length);
+    } else if (source.getClass() == MapColumnVector.class) {
+      copyMapColumn(destination, source, sourceOffset, length);
+    } else {
+      throw new IllegalArgumentException("Unknown column vector kind " +
+          source.getClass().getName());
+    }
+  }
- public static int[] mapTranslatedSargColumns(
- List<OrcProto.Type> types, List<PredicateLeaf> sargLeaves) {
- int[] result = new int[sargLeaves.size()];
- OrcProto.Type lastRoot = null; // Root will be the same for everyone as of now.
- String lastRootStr = null;
- for (int i = 0; i < result.length; ++i) {
- String[] rootAndIndex = sargLeaves.get(i).getColumnName().split(TRANSLATED_SARG_SEPARATOR);
- assert rootAndIndex.length == 2;
- String rootStr = rootAndIndex[0], indexStr = rootAndIndex[1];
- int index = Integer.parseInt(indexStr);
- // First, check if the column even maps to anything.
- if (index == -1) {
- result[i] = -1;
- continue;
- }
- assert index >= 0;
- // Then, find the root type if needed.
- if (!rootStr.equals(lastRootStr)) {
- lastRoot = types.get(Integer.parseInt(rootStr));
- lastRootStr = rootStr;
- }
- // Subtypes of the root types correspond, in order, to the columns in the table schema
- // (disregarding schema evolution that doesn't presently work). Get the index for the
- // corresponding subtype.
- result[i] = lastRoot.getSubtypes(index);
- }
- return result;
+  /**
+   * Copy part of a batch into the destination batch. The number of rows
+   * that were copied is recorded in destination.size.
+   * @param destination the batch to copy into
+   * @param source the batch to copy from
+   * @param sourceStart the row number to start from in the source
+   */
+ void copyIntoBatch(VectorizedRowBatch destination,
+ VectorizedRowBatch source,
+ int sourceStart) {
+ int rows = Math.min(source.size - sourceStart, destination.getMaxSize());
+ for(int c=0; c < source.cols.length; ++c) {
+ destination.cols[c].reset();
+ copyColumn(destination.cols[c], source.cols[c], sourceStart, rows);
+ }
+ destination.size = rows;
}
}
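
A minimal sketch (not part of the patch) of how copyIntoBatch() can drain an
oversized source batch in bounded chunks; "consume" is a hypothetical
downstream call and the method is assumed to live alongside the copy helpers:

  void drainInChunks(VectorizedRowBatch source, TypeDescription schema) {
    VectorizedRowBatch chunk = schema.createRowBatch(1024);
    int start = 0;
    while (start < source.size) {
      copyIntoBatch(chunk, source, start); // copies at most chunk.getMaxSize() rows
      start += chunk.size;                 // copyIntoBatch sets chunk.size
      consume(chunk);
    }
  }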
[25/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/HadoopShims.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/HadoopShims.java b/orc/src/java/org/apache/orc/impl/HadoopShims.java
index 2980d71..ef7d70f 100644
--- a/orc/src/java/org/apache/orc/impl/HadoopShims.java
+++ b/orc/src/java/org/apache/orc/impl/HadoopShims.java
@@ -18,9 +18,13 @@
package org.apache.orc.impl;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.VersionInfo;
+import java.io.Closeable;
import java.io.IOException;
+import java.io.InputStream;
import java.nio.ByteBuffer;
public interface HadoopShims {
@@ -43,6 +47,81 @@ public interface HadoopShims {
*/
DirectDecompressor getDirectDecompressor(DirectCompressionType codec);
+ /**
+ * a hadoop.io ByteBufferPool shim.
+ */
+ public interface ByteBufferPoolShim {
+ /**
+     * Get a new ByteBuffer from the pool. The pool can provide this by
+     * removing a buffer from its internal cache, or by allocating a
+     * new one.
+ *
+ * @param direct Whether the buffer should be direct.
+ * @param length The minimum length the buffer will have.
+ * @return A new ByteBuffer. Its capacity can be less
+ * than what was requested, but must be at
+ * least 1 byte.
+ */
+ ByteBuffer getBuffer(boolean direct, int length);
+
+ /**
+ * Release a buffer back to the pool.
+ * The pool may choose to put this buffer into its cache/free it.
+ *
+ * @param buffer a direct bytebuffer
+ */
+ void putBuffer(ByteBuffer buffer);
+ }
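
A minimal sketch (not part of the patch) of the simplest legal
ByteBufferPoolShim: it caches nothing and always allocates.

  class SimpleBufferPool implements HadoopShims.ByteBufferPoolShim {
    public ByteBuffer getBuffer(boolean direct, int length) {
      return direct ? ByteBuffer.allocateDirect(length)
                    : ByteBuffer.allocate(length);
    }
    public void putBuffer(ByteBuffer buffer) {
      // nothing is cached; the buffer is dropped for the GC to reclaim
    }
  }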
+
+ /**
+ * Provides an HDFS ZeroCopyReader shim.
+   * @param in the FSDataInputStream to read from (which the cached/mmapped buffers are tied to)
+   * @param pool the ByteBufferPoolShim to allocate fallback buffers with
+   *
+   * @return a zero copy reader, or null if zero copy is not supported
+ */
+ public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, ByteBufferPoolShim pool) throws IOException;
+
+ public interface ZeroCopyReaderShim extends Closeable {
+ /**
+     * Get a ByteBuffer from the FSDataInputStream - this can be either a
+     * HeapByteBuffer or a MappedByteBuffer. The input stream is advanced by
+     * the amount read, which can be smaller than maxLength.
+     *
+     * @return the ByteBuffer read from the stream
+ */
+ public ByteBuffer readBuffer(int maxLength, boolean verifyChecksums) throws IOException;
+ /**
+     * Release a ByteBuffer obtained from a readBuffer call on this reader,
+     * so that the underlying pool or cache can reclaim it.
+ *
+ */
+ public void releaseBuffer(ByteBuffer buffer);
+
+ /**
+ * Close the underlying stream.
+ * @throws IOException
+ */
+ public void close() throws IOException;
+ }
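
A minimal sketch (not part of the patch) of the intended calling pattern,
assuming "in" is an open FSDataInputStream, "pool" is a ByteBufferPoolShim
such as the sketch above, and Factory.get() is the accessor backed by the
Factory class below:

  HadoopShims shims = HadoopShims.Factory.get();
  HadoopShims.ZeroCopyReaderShim zcr = shims.getZeroCopyReader(in, pool);
  if (zcr != null) {
    ByteBuffer buf = zcr.readBuffer(128 * 1024, false);
    try {
      // consume buf directly; the bytes are never copied into a user buffer
    } finally {
      zcr.releaseBuffer(buf); // hand mmapped/pooled memory back to the reader
    }
    zcr.close();
  }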
+ /**
+ * Read data into a Text object in the fastest way possible
+ */
+ public interface TextReaderShim {
+ /**
+     * @param txt the Text object to fill with the bytes that are read
+     * @param size the exact number of bytes to read
+ * @throws IOException
+ */
+ void read(Text txt, int size) throws IOException;
+ }
+
+ /**
+ * Wrap a TextReaderShim around an input stream. The reader shim will not
+ * buffer any reads from the underlying stream and will only consume bytes
+ * which are required for TextReaderShim.read() input.
+ */
+ public TextReaderShim getTextReaderShim(InputStream input) throws IOException;
class Factory {
private static HadoopShims SHIMS = null;
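
A minimal sketch (not part of the patch) of reading a value of known length
through the text shim; "input" is any InputStream positioned at the start of
the value bytes and Factory.get() is assumed as above:

  HadoopShims shims = HadoopShims.Factory.get();
  HadoopShims.TextReaderShim textShim = shims.getTextReaderShim(input);
  Text value = new Text();
  textShim.read(value, 12); // fills value with exactly 12 bytes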
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java b/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
index 3b9371d..5c53f74 100644
--- a/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
+++ b/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
@@ -18,10 +18,14 @@
package org.apache.orc.impl;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.snappy.SnappyDecompressor;
import org.apache.hadoop.io.compress.zlib.ZlibDecompressor;
+import java.io.DataInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.nio.ByteBuffer;
/**
@@ -59,4 +63,30 @@ public class HadoopShimsCurrent implements HadoopShims {
return null;
}
}
+
+ @Override
+ public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
+ ByteBufferPoolShim pool
+ ) throws IOException {
+ return ZeroCopyShims.getZeroCopyReader(in, pool);
+ }
+
+ private final class FastTextReaderShim implements TextReaderShim {
+ private final DataInputStream din;
+
+ public FastTextReaderShim(InputStream in) {
+ this.din = new DataInputStream(in);
+ }
+
+ @Override
+ public void read(Text txt, int len) throws IOException {
+ txt.readWithKnownLength(din, len);
+ }
+ }
+
+ @Override
+ public TextReaderShim getTextReaderShim(InputStream in) throws IOException {
+ return new FastTextReaderShim(in);
+ }
+
}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java b/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
index ac46836..3f65e74 100644
--- a/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
+++ b/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
@@ -18,19 +18,84 @@
package org.apache.orc.impl;
-import org.apache.hadoop.io.compress.snappy.SnappyDecompressor;
-import org.apache.hadoop.io.compress.zlib.ZlibDecompressor;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.io.Text;
+import java.io.EOFException;
import java.io.IOException;
-import java.nio.ByteBuffer;
+import java.io.InputStream;
+import java.lang.reflect.Method;
/**
* Shims for versions of Hadoop up to and including 2.2.x
*/
public class HadoopShims_2_2 implements HadoopShims {
+ final boolean zeroCopy;
+ final boolean fastRead;
+
+ HadoopShims_2_2() {
+ boolean zcr = false;
+ try {
+ Class.forName("org.apache.hadoop.fs.CacheFlag", false,
+ HadoopShims_2_2.class.getClassLoader());
+ zcr = true;
+    } catch (ClassNotFoundException ce) {
+      // CacheFlag is not available, so zero copy is not supported
+    }
+ zeroCopy = zcr;
+ boolean fastRead = false;
+ if (zcr) {
+ for (Method m : Text.class.getMethods()) {
+ if ("readWithKnownLength".equals(m.getName())) {
+ fastRead = true;
+ }
+ }
+ }
+ this.fastRead = fastRead;
+ }
+
public DirectDecompressor getDirectDecompressor(
DirectCompressionType codec) {
return null;
}
+
+ @Override
+ public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
+ ByteBufferPoolShim pool
+ ) throws IOException {
+ if(zeroCopy) {
+ return ZeroCopyShims.getZeroCopyReader(in, pool);
+ }
+ /* not supported */
+ return null;
+ }
+
+ private final class BasicTextReaderShim implements TextReaderShim {
+ private final InputStream in;
+
+ public BasicTextReaderShim(InputStream in) {
+ this.in = in;
+ }
+
+ @Override
+ public void read(Text txt, int len) throws IOException {
+ int offset = 0;
+ byte[] bytes = new byte[len];
+ while (len > 0) {
+ int written = in.read(bytes, offset, len);
+ if (written < 0) {
+ throw new EOFException("Can't finish read from " + in + " read "
+ + (offset) + " bytes out of " + bytes.length);
+ }
+ len -= written;
+ offset += written;
+ }
+ txt.set(bytes);
+ }
+ }
+
+ @Override
+ public TextReaderShim getTextReaderShim(InputStream in) throws IOException {
+ return new BasicTextReaderShim(in);
+ }
}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/IntegerReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/IntegerReader.java b/orc/src/java/org/apache/orc/impl/IntegerReader.java
index 8bef0f1..3e64d54 100644
--- a/orc/src/java/org/apache/orc/impl/IntegerReader.java
+++ b/orc/src/java/org/apache/orc/impl/IntegerReader.java
@@ -78,4 +78,5 @@ public interface IntegerReader {
void nextVector(ColumnVector column,
int[] data,
int length
- ) throws IOException;}
+ ) throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java b/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java
new file mode 100644
index 0000000..72c7f54
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.orc.Reader;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+
+public class OrcAcidUtils {
+ public static final String ACID_STATS = "hive.acid.stats";
+ public static final String DELTA_SIDE_FILE_SUFFIX = "_flush_length";
+
+ /**
+ * Get the filename of the ORC ACID side file that contains the lengths
+ * of the intermediate footers.
+ * @param main the main ORC filename
+ * @return the name of the side file
+ */
+ public static Path getSideFile(Path main) {
+ return new Path(main + DELTA_SIDE_FILE_SUFFIX);
+ }
+
+ /**
+ * Read the side file to get the last flush length.
+ * @param fs the file system to use
+ * @param deltaFile the path of the delta file
+ * @return the maximum size of the file to use
+ * @throws IOException
+ */
+ public static long getLastFlushLength(FileSystem fs,
+ Path deltaFile) throws IOException {
+ Path lengths = getSideFile(deltaFile);
+ long result = Long.MAX_VALUE;
+ try (FSDataInputStream stream = fs.open(lengths)) {
+ result = -1;
+ while (stream.available() > 0) {
+ result = stream.readLong();
+ }
+ return result;
+ } catch (IOException ioe) {
+ return result;
+ }
+ }
+
+ private static final Charset utf8 = Charset.forName("UTF-8");
+ private static final CharsetDecoder utf8Decoder = utf8.newDecoder();
+
+ public static AcidStats parseAcidStats(Reader reader) {
+ if (reader.hasMetadataValue(ACID_STATS)) {
+ try {
+ ByteBuffer val = reader.getMetadataValue(ACID_STATS).duplicate();
+ return new AcidStats(utf8Decoder.decode(val).toString());
+ } catch (CharacterCodingException e) {
+ throw new IllegalArgumentException("Bad string encoding for " +
+ ACID_STATS, e);
+ }
+ } else {
+ return null;
+ }
+ }
+
+}
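
A minimal sketch (not part of the patch) of using the side file to bound
reads of a delta file that may still be open for writing; "fs", "deltaPath",
and "conf" come from the surrounding job context, and the
readerOptions().maxLength() plumbing is assumed to be available:

  long lastFlush = OrcAcidUtils.getLastFlushLength(fs, deltaPath);
  if (lastFlush != -1) { // -1 means no complete length was in the side file
    Reader reader = OrcFile.createReader(deltaPath,
        OrcFile.readerOptions(conf).filesystem(fs).maxLength(lastFlush));
    AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
  }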
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ReaderImpl.java b/orc/src/java/org/apache/orc/impl/ReaderImpl.java
new file mode 100644
index 0000000..2da590e
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -0,0 +1,758 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcUtils;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.FileFormatException;
+import org.apache.orc.FileMetaInfo;
+import org.apache.orc.FileMetadata;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.StripeStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.ql.util.JavaDataModel;
+import org.apache.hadoop.io.Text;
+import org.apache.orc.OrcProto;
+
+import com.google.common.collect.Lists;
+import com.google.protobuf.CodedInputStream;
+
+public class ReaderImpl implements Reader {
+
+ private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class);
+
+ private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;
+
+ protected final FileSystem fileSystem;
+ private final long maxLength;
+ protected final Path path;
+ protected final org.apache.orc.CompressionKind compressionKind;
+ protected final CompressionCodec codec;
+ protected final int bufferSize;
+ private final List<OrcProto.StripeStatistics> stripeStats;
+ private final int metadataSize;
+ protected final List<OrcProto.Type> types;
+ private final TypeDescription schema;
+ private final List<OrcProto.UserMetadataItem> userMetadata;
+ private final List<OrcProto.ColumnStatistics> fileStats;
+ private final List<StripeInformation> stripes;
+ protected final int rowIndexStride;
+ private final long contentLength, numberOfRows;
+
+
+ private long deserializedSize = -1;
+ protected final Configuration conf;
+ private final List<Integer> versionList;
+ private final OrcFile.WriterVersion writerVersion;
+
+  // Footer, metadata and postscript bytes kept for the metastore cache.
+  // This will only be set if the file footer/metadata was read from disk.
+ private final ByteBuffer footerMetaAndPsBuffer;
+
+ public static class StripeInformationImpl
+ implements StripeInformation {
+ private final OrcProto.StripeInformation stripe;
+
+ public StripeInformationImpl(OrcProto.StripeInformation stripe) {
+ this.stripe = stripe;
+ }
+
+ @Override
+ public long getOffset() {
+ return stripe.getOffset();
+ }
+
+ @Override
+ public long getLength() {
+ return stripe.getDataLength() + getIndexLength() + getFooterLength();
+ }
+
+ @Override
+ public long getDataLength() {
+ return stripe.getDataLength();
+ }
+
+ @Override
+ public long getFooterLength() {
+ return stripe.getFooterLength();
+ }
+
+ @Override
+ public long getIndexLength() {
+ return stripe.getIndexLength();
+ }
+
+ @Override
+ public long getNumberOfRows() {
+ return stripe.getNumberOfRows();
+ }
+
+ @Override
+ public String toString() {
+ return "offset: " + getOffset() + " data: " + getDataLength() +
+ " rows: " + getNumberOfRows() + " tail: " + getFooterLength() +
+ " index: " + getIndexLength();
+ }
+ }
+
+ @Override
+ public long getNumberOfRows() {
+ return numberOfRows;
+ }
+
+ @Override
+ public List<String> getMetadataKeys() {
+ List<String> result = new ArrayList<String>();
+ for(OrcProto.UserMetadataItem item: userMetadata) {
+ result.add(item.getName());
+ }
+ return result;
+ }
+
+ @Override
+ public ByteBuffer getMetadataValue(String key) {
+ for(OrcProto.UserMetadataItem item: userMetadata) {
+ if (item.hasName() && item.getName().equals(key)) {
+ return item.getValue().asReadOnlyByteBuffer();
+ }
+ }
+ throw new IllegalArgumentException("Can't find user metadata " + key);
+ }
+
+ public boolean hasMetadataValue(String key) {
+ for(OrcProto.UserMetadataItem item: userMetadata) {
+ if (item.hasName() && item.getName().equals(key)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public org.apache.orc.CompressionKind getCompressionKind() {
+ return compressionKind;
+ }
+
+ @Override
+ public int getCompressionSize() {
+ return bufferSize;
+ }
+
+ @Override
+ public List<StripeInformation> getStripes() {
+ return stripes;
+ }
+
+ @Override
+ public long getContentLength() {
+ return contentLength;
+ }
+
+ @Override
+ public List<OrcProto.Type> getTypes() {
+ return types;
+ }
+
+ @Override
+  public OrcFile.Version getFileVersion() {
+    // guard once instead of re-checking inside the loop; a missing or short
+    // version list falls back to the oldest format
+    if (versionList == null || versionList.size() < 2) {
+      return OrcFile.Version.V_0_11;
+    }
+    for (OrcFile.Version version: OrcFile.Version.values()) {
+      if (version.getMajor() == versionList.get(0) &&
+          version.getMinor() == versionList.get(1)) {
+        return version;
+      }
+    }
+    return OrcFile.Version.V_0_11;
+  }
+
+ @Override
+ public OrcFile.WriterVersion getWriterVersion() {
+ return writerVersion;
+ }
+
+ @Override
+ public int getRowIndexStride() {
+ return rowIndexStride;
+ }
+
+ @Override
+ public ColumnStatistics[] getStatistics() {
+ ColumnStatistics[] result = new ColumnStatistics[types.size()];
+ for(int i=0; i < result.length; ++i) {
+ result[i] = ColumnStatisticsImpl.deserialize(fileStats.get(i));
+ }
+ return result;
+ }
+
+ @Override
+ public TypeDescription getSchema() {
+ return schema;
+ }
+
+ /**
+ * Ensure this is an ORC file to prevent users from trying to read text
+ * files or RC files as ORC files.
+ * @param in the file being read
+ * @param path the filename for error messages
+ * @param psLen the postscript length
+ * @param buffer the tail of the file
+ * @throws IOException
+ */
+ protected static void ensureOrcFooter(FSDataInputStream in,
+ Path path,
+ int psLen,
+ ByteBuffer buffer) throws IOException {
+ int magicLength = OrcFile.MAGIC.length();
+ int fullLength = magicLength + 1;
+ if (psLen < fullLength || buffer.remaining() < fullLength) {
+ throw new FileFormatException("Malformed ORC file " + path +
+ ". Invalid postscript length " + psLen);
+ }
+ int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
+ byte[] array = buffer.array();
+ // now look for the magic string at the end of the postscript.
+ if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
+ // If it isn't there, this may be the 0.11.0 version of ORC.
+ // Read the first 3 bytes of the file to check for the header
+ byte[] header = new byte[magicLength];
+ in.readFully(0, header, 0, magicLength);
+ // if it isn't there, this isn't an ORC file
+      if (!Text.decode(header, 0, magicLength).equals(OrcFile.MAGIC)) {
+ throw new FileFormatException("Malformed ORC file " + path +
+ ". Invalid postscript.");
+ }
+ }
+ }
+
+ /**
+ * Build a version string out of an array.
+ * @param version the version number as a list
+ * @return the human readable form of the version string
+ */
+ private static String versionString(List<Integer> version) {
+ StringBuilder buffer = new StringBuilder();
+ for(int i=0; i < version.size(); ++i) {
+ if (i != 0) {
+ buffer.append('.');
+ }
+ buffer.append(version.get(i));
+ }
+ return buffer.toString();
+ }
+
+ /**
+ * Check to see if this ORC file is from a future version and if so,
+ * warn the user that we may not be able to read all of the column encodings.
+ * @param log the logger to write any error message to
+ * @param path the data source path for error messages
+ * @param version the version of hive that wrote the file.
+ */
+ protected static void checkOrcVersion(Logger log, Path path,
+ List<Integer> version) {
+ if (version.size() >= 1) {
+ int major = version.get(0);
+ int minor = 0;
+ if (version.size() >= 2) {
+ minor = version.get(1);
+ }
+ if (major > OrcFile.Version.CURRENT.getMajor() ||
+ (major == OrcFile.Version.CURRENT.getMajor() &&
+ minor > OrcFile.Version.CURRENT.getMinor())) {
+ log.warn(path + " was written by a future Hive version " +
+ versionString(version) +
+ ". This file may not be readable by this version of Hive.");
+ }
+ }
+ }
+
+ /**
+   * Constructor that lets the user specify additional options.
+ * @param path pathname for file
+ * @param options options for reading
+ * @throws IOException
+ */
+ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
+ FileSystem fs = options.getFilesystem();
+ if (fs == null) {
+ fs = path.getFileSystem(options.getConfiguration());
+ }
+ this.fileSystem = fs;
+ this.path = path;
+ this.conf = options.getConfiguration();
+ this.maxLength = options.getMaxLength();
+
+ FileMetadata fileMetadata = options.getFileMetadata();
+ if (fileMetadata != null) {
+ this.compressionKind = fileMetadata.getCompressionKind();
+ this.bufferSize = fileMetadata.getCompressionBufferSize();
+ this.codec = WriterImpl.createCodec(compressionKind);
+ this.metadataSize = fileMetadata.getMetadataSize();
+ this.stripeStats = fileMetadata.getStripeStats();
+ this.versionList = fileMetadata.getVersionList();
+ this.writerVersion =
+ OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum());
+ this.types = fileMetadata.getTypes();
+ this.rowIndexStride = fileMetadata.getRowIndexStride();
+ this.contentLength = fileMetadata.getContentLength();
+ this.numberOfRows = fileMetadata.getNumberOfRows();
+ this.fileStats = fileMetadata.getFileStats();
+ this.stripes = fileMetadata.getStripes();
+ this.userMetadata = null; // not cached and not needed here
+ this.footerMetaAndPsBuffer = null;
+ } else {
+ FileMetaInfo footerMetaData;
+ if (options.getFileMetaInfo() != null) {
+ footerMetaData = options.getFileMetaInfo();
+ this.footerMetaAndPsBuffer = null;
+ } else {
+ footerMetaData = extractMetaInfoFromFooter(fs, path,
+ options.getMaxLength());
+ this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer;
+ }
+ MetaInfoObjExtractor rInfo =
+ new MetaInfoObjExtractor(footerMetaData.compressionType,
+ footerMetaData.bufferSize,
+ footerMetaData.metadataSize,
+ footerMetaData.footerBuffer
+ );
+ this.compressionKind = rInfo.compressionKind;
+ this.codec = rInfo.codec;
+ this.bufferSize = rInfo.bufferSize;
+ this.metadataSize = rInfo.metadataSize;
+ this.stripeStats = rInfo.metadata.getStripeStatsList();
+ this.types = rInfo.footer.getTypesList();
+ this.rowIndexStride = rInfo.footer.getRowIndexStride();
+ this.contentLength = rInfo.footer.getContentLength();
+ this.numberOfRows = rInfo.footer.getNumberOfRows();
+ this.userMetadata = rInfo.footer.getMetadataList();
+ this.fileStats = rInfo.footer.getStatisticsList();
+ this.versionList = footerMetaData.versionList;
+ this.writerVersion = footerMetaData.writerVersion;
+ this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList());
+ }
+ this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0);
+ }
+
+ /**
+ * Get the WriterVersion based on the ORC file postscript.
+ * @param writerVersion the integer writer version
+ * @return the version of the software that produced the file
+ */
+ public static OrcFile.WriterVersion getWriterVersion(int writerVersion) {
+ for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) {
+ if (version.getId() == writerVersion) {
+ return version;
+ }
+ }
+ return OrcFile.WriterVersion.FUTURE;
+ }
+
+ private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos,
+ int footerSize, CompressionCodec codec, int bufferSize) throws IOException {
+ bb.position(footerAbsPos);
+ bb.limit(footerAbsPos + footerSize);
+ return OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer",
+ Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize));
+ }
+
+ private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
+ int metadataSize, CompressionCodec codec, int bufferSize) throws IOException {
+ bb.position(metadataAbsPos);
+ bb.limit(metadataAbsPos + metadataSize);
+ return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata",
+ Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize));
+ }
+
+ private static OrcProto.PostScript extractPostScript(ByteBuffer bb, Path path,
+ int psLen, int psAbsOffset) throws IOException {
+ // TODO: when PB is upgraded to 2.6, newInstance(ByteBuffer) method should be used here.
+ assert bb.hasArray();
+ CodedInputStream in = CodedInputStream.newInstance(
+ bb.array(), bb.arrayOffset() + psAbsOffset, psLen);
+ OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);
+ checkOrcVersion(LOG, path, ps.getVersionList());
+
+ // Check compression codec.
+ switch (ps.getCompression()) {
+ case NONE:
+ break;
+ case ZLIB:
+ break;
+ case SNAPPY:
+ break;
+ case LZO:
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown compression");
+ }
+ return ps;
+ }
+
+ private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
+ Path path,
+ long maxFileLength
+ ) throws IOException {
+ FSDataInputStream file = fs.open(path);
+ ByteBuffer buffer = null, fullFooterBuffer = null;
+ OrcProto.PostScript ps = null;
+ OrcFile.WriterVersion writerVersion = null;
+ try {
+ // figure out the size of the file using the option or filesystem
+ long size;
+ if (maxFileLength == Long.MAX_VALUE) {
+ size = fs.getFileStatus(path).getLen();
+ } else {
+ size = maxFileLength;
+ }
+
+ //read last bytes into buffer to get PostScript
+ int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
+ buffer = ByteBuffer.allocate(readSize);
+ assert buffer.position() == 0;
+ file.readFully((size - readSize),
+ buffer.array(), buffer.arrayOffset(), readSize);
+ buffer.position(0);
+
+ //read the PostScript
+ //get length of PostScript
+ int psLen = buffer.get(readSize - 1) & 0xff;
+ ensureOrcFooter(file, path, psLen, buffer);
+ int psOffset = readSize - 1 - psLen;
+ ps = extractPostScript(buffer, path, psLen, psOffset);
+
+ int footerSize = (int) ps.getFooterLength();
+ int metadataSize = (int) ps.getMetadataLength();
+ writerVersion = extractWriterVersion(ps);
+
+ //check if extra bytes need to be read
+ int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
+ if (extra > 0) {
+ //more bytes need to be read, seek back to the right place and read extra bytes
+ ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
+ file.readFully((size - readSize - extra), extraBuf.array(),
+ extraBuf.arrayOffset() + extraBuf.position(), extra);
+ extraBuf.position(extra);
+ //append with already read bytes
+ extraBuf.put(buffer);
+ buffer = extraBuf;
+ buffer.position(0);
+ fullFooterBuffer = buffer.slice();
+ buffer.limit(footerSize + metadataSize);
+ } else {
+ //footer is already in the bytes in buffer, just adjust position, length
+ buffer.position(psOffset - footerSize - metadataSize);
+ fullFooterBuffer = buffer.slice();
+ buffer.limit(psOffset);
+ }
+
+      // TODO: this mark() appears to have no matching reset(); it can likely be removed
+ buffer.mark();
+ } finally {
+ try {
+ file.close();
+ } catch (IOException ex) {
+ LOG.error("Failed to close the file after another error", ex);
+ }
+ }
+
+ return new FileMetaInfo(
+ ps.getCompression().toString(),
+ (int) ps.getCompressionBlockSize(),
+ (int) ps.getMetadataLength(),
+ buffer,
+ ps.getVersionList(),
+ writerVersion,
+ fullFooterBuffer
+ );
+ }
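+
+  // Worked example (illustrative, not part of the original patch): the first
+  // read pulls the last min(size, 16K) bytes of the file. If the final byte says
+  // psLen = 20, and the postscript reports footerSize = 30,000 and
+  // metadataSize = 5,000, the tail needs 20 + 1 + 30,000 + 5,000 = 35,021 bytes,
+  // so extra = 35,021 - 16,384 = 18,637 additional bytes are read and prepended
+  // to the buffer before the metadata and footer are parsed.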
+
+ protected static OrcFile.WriterVersion extractWriterVersion(OrcProto.PostScript ps) {
+ return (ps.hasWriterVersion()
+ ? getWriterVersion(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL);
+ }
+
+ protected static List<StripeInformation> convertProtoStripesToStripes(
+ List<OrcProto.StripeInformation> stripes) {
+ List<StripeInformation> result = new ArrayList<StripeInformation>(stripes.size());
+ for (OrcProto.StripeInformation info : stripes) {
+ result.add(new StripeInformationImpl(info));
+ }
+ return result;
+ }
+
+ /**
+   * MetaInfoObjExtractor - creates the values for ReaderImpl's fields from the
+   * serialized footer bytes.
+   * Because those fields are final they must be initialized in ReaderImpl's
+   * constructor and cannot be assigned from a helper method, so this helper
+   * class is used instead.
+ */
+ private static class MetaInfoObjExtractor{
+ final org.apache.orc.CompressionKind compressionKind;
+ final CompressionCodec codec;
+ final int bufferSize;
+ final int metadataSize;
+ final OrcProto.Metadata metadata;
+ final OrcProto.Footer footer;
+
+ MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize,
+ ByteBuffer footerBuffer) throws IOException {
+
+ this.compressionKind = org.apache.orc.CompressionKind.valueOf(codecStr.toUpperCase());
+ this.bufferSize = bufferSize;
+ this.codec = WriterImpl.createCodec(compressionKind);
+ this.metadataSize = metadataSize;
+
+ int position = footerBuffer.position();
+ int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize;
+
+ this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize);
+ this.footer = extractFooter(
+ footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize);
+
+ footerBuffer.position(position);
+ }
+ }
+
+ @Override
+ public ByteBuffer getSerializedFileFooter() {
+ return footerMetaAndPsBuffer;
+ }
+
+ @Override
+ public RecordReader rows() throws IOException {
+ return rows(new Options());
+ }
+
+ @Override
+ public RecordReader rows(Options options) throws IOException {
+ LOG.info("Reading ORC rows from " + path + " with " + options);
+ boolean[] include = options.getInclude();
+ // if included columns is null, then include all columns
+ if (include == null) {
+ include = new boolean[types.size()];
+ Arrays.fill(include, true);
+ options.include(include);
+ }
+ return new RecordReaderImpl(this, options);
+ }
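+
+  // Usage sketch (illustrative; assumes the batch-based org.apache.orc API):
+  //   Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+  //   RecordReader rows = reader.rows();
+  //   VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+  //   while (rows.nextBatch(batch)) {
+  //     ... // consume batch.size rows
+  //   }
+  //   rows.close();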
+
+
+ @Override
+ public long getRawDataSize() {
+ // if the deserializedSize is not computed, then compute it, else
+ // return the already computed size. since we are reading from the footer
+ // we don't have to compute deserialized size repeatedly
+ if (deserializedSize == -1) {
+ List<Integer> indices = Lists.newArrayList();
+ for (int i = 0; i < fileStats.size(); ++i) {
+ indices.add(i);
+ }
+ deserializedSize = getRawDataSizeFromColIndices(indices);
+ }
+ return deserializedSize;
+ }
+
+ @Override
+ public long getRawDataSizeFromColIndices(List<Integer> colIndices) {
+ return getRawDataSizeFromColIndices(colIndices, types, fileStats);
+ }
+
+ public static long getRawDataSizeFromColIndices(
+ List<Integer> colIndices, List<OrcProto.Type> types,
+ List<OrcProto.ColumnStatistics> stats) {
+ long result = 0;
+ for (int colIdx : colIndices) {
+ result += getRawDataSizeOfColumn(colIdx, types, stats);
+ }
+ return result;
+ }
+
+ private static long getRawDataSizeOfColumn(int colIdx, List<OrcProto.Type> types,
+ List<OrcProto.ColumnStatistics> stats) {
+ OrcProto.ColumnStatistics colStat = stats.get(colIdx);
+ long numVals = colStat.getNumberOfValues();
+ OrcProto.Type type = types.get(colIdx);
+
+ switch (type.getKind()) {
+ case BINARY:
+ // old orc format doesn't support binary statistics. checking for binary
+ // statistics is not required as protocol buffers takes care of it.
+ return colStat.getBinaryStatistics().getSum();
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ // old orc format doesn't support sum for string statistics. checking for
+ // existence is not required as protocol buffers takes care of it.
+
+ // ORC strings are deserialized to java strings. so use java data model's
+ // string size
+ numVals = numVals == 0 ? 1 : numVals;
+ int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals);
+ return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen);
+ case TIMESTAMP:
+ return numVals * JavaDataModel.get().lengthOfTimestamp();
+ case DATE:
+ return numVals * JavaDataModel.get().lengthOfDate();
+ case DECIMAL:
+ return numVals * JavaDataModel.get().lengthOfDecimal();
+ case DOUBLE:
+ case LONG:
+ return numVals * JavaDataModel.get().primitive2();
+ case FLOAT:
+ case INT:
+ case SHORT:
+ case BOOLEAN:
+ case BYTE:
+ return numVals * JavaDataModel.get().primitive1();
+ default:
+ LOG.debug("Unknown primitive category: " + type.getKind());
+ break;
+ }
+
+ return 0;
+ }
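+
+  // Worked example (illustrative): a STRING column with numberOfValues = 1,000
+  // and a statistics sum (total characters) of 8,000 gives avgStrLen = 8, so the
+  // estimate is 1,000 * JavaDataModel.get().lengthForStringOfLength(8), i.e. the
+  // deserialized Java string footprint rather than 8,000 raw characters.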
+
+ @Override
+ public long getRawDataSizeOfColumns(List<String> colNames) {
+ List<Integer> colIndices = getColumnIndicesFromNames(colNames);
+ return getRawDataSizeFromColIndices(colIndices);
+ }
+
+ private List<Integer> getColumnIndicesFromNames(List<String> colNames) {
+ // top level struct
+ OrcProto.Type type = types.get(0);
+ List<Integer> colIndices = Lists.newArrayList();
+ List<String> fieldNames = type.getFieldNamesList();
+ int fieldIdx;
+ for (String colName : colNames) {
+ if (fieldNames.contains(colName)) {
+ fieldIdx = fieldNames.indexOf(colName);
+ } else {
+ String s = "Cannot find field for: " + colName + " in ";
+ for (String fn : fieldNames) {
+ s += fn + ", ";
+ }
+ LOG.warn(s);
+ continue;
+ }
+
+ // a single field may span multiple columns. find start and end column
+ // index for the requested field
+ int idxStart = type.getSubtypes(fieldIdx);
+
+ int idxEnd;
+
+      // if the specified field is the last one, the end index is one past the
+      // last column index
+ if (fieldIdx + 1 > fieldNames.size() - 1) {
+ idxEnd = getLastIdx() + 1;
+ } else {
+ idxEnd = type.getSubtypes(fieldIdx + 1);
+ }
+
+ // if start index and end index are same then the field is a primitive
+ // field else complex field (like map, list, struct, union)
+ if (idxStart == idxEnd) {
+ // simple field
+ colIndices.add(idxStart);
+ } else {
+ // complex fields spans multiple columns
+ for (int i = idxStart; i < idxEnd; i++) {
+ colIndices.add(i);
+ }
+ }
+ }
+ return colIndices;
+ }
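+
+  // Example (illustrative): for a schema struct<a:int,b:map<string,int>> the
+  // top-level subtypes are [1, 2]; "a" contributes only column 1, while "b" is
+  // the last field, so it spans columns 2 through getLastIdx() = 4 (the map
+  // plus its key and value columns) and adds 2, 3 and 4 to colIndices.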
+
+ private int getLastIdx() {
+ Set<Integer> indices = new HashSet<>();
+ for (OrcProto.Type type : types) {
+ indices.addAll(type.getSubtypesList());
+ }
+ return Collections.max(indices);
+ }
+
+ @Override
+ public List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics() {
+ return stripeStats;
+ }
+
+ @Override
+ public List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics() {
+ return fileStats;
+ }
+
+ @Override
+ public List<StripeStatistics> getStripeStatistics() {
+ List<StripeStatistics> result = new ArrayList<>();
+ for (OrcProto.StripeStatistics ss : stripeStats) {
+ result.add(new StripeStatistics(ss.getColStatsList()));
+ }
+ return result;
+ }
+
+ public List<OrcProto.UserMetadataItem> getOrcProtoUserMetadata() {
+ return userMetadata;
+ }
+
+ @Override
+ public List<Integer> getVersionList() {
+ return versionList;
+ }
+
+ @Override
+ public int getMetadataSize() {
+ return metadataSize;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buffer = new StringBuilder();
+ buffer.append("ORC Reader(");
+ buffer.append(path);
+    if (maxLength != Long.MAX_VALUE) {
+ buffer.append(", ");
+ buffer.append(maxLength);
+ }
+ buffer.append(")");
+ return buffer.toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java b/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
new file mode 100644
index 0000000..36a802e
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -0,0 +1,1215 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.orc.BooleanColumnStatistics;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.DataReader;
+import org.apache.orc.DateColumnStatistics;
+import org.apache.orc.DecimalColumnStatistics;
+import org.apache.orc.DoubleColumnStatistics;
+import org.apache.orc.IntegerColumnStatistics;
+import org.apache.orc.OrcConf;
+import org.apache.orc.StringColumnStatistics;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TimestampColumnStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.BloomFilterIO;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.orc.OrcProto;
+
+public class RecordReaderImpl implements RecordReader {
+ static final Logger LOG = LoggerFactory.getLogger(RecordReaderImpl.class);
+ private static final boolean isLogDebugEnabled = LOG.isDebugEnabled();
+ private static final Object UNKNOWN_VALUE = new Object();
+ protected final Path path;
+ private final long firstRow;
+ private final List<StripeInformation> stripes =
+ new ArrayList<StripeInformation>();
+ private OrcProto.StripeFooter stripeFooter;
+ private final long totalRowCount;
+ private final CompressionCodec codec;
+ protected final TypeDescription schema;
+ private final List<OrcProto.Type> types;
+ private final int bufferSize;
+ private final boolean[] included;
+ private final long rowIndexStride;
+ private long rowInStripe = 0;
+ private int currentStripe = -1;
+ private long rowBaseInStripe = 0;
+ private long rowCountInStripe = 0;
+ private final Map<StreamName, InStream> streams =
+ new HashMap<StreamName, InStream>();
+ DiskRangeList bufferChunks = null;
+ private final TreeReaderFactory.TreeReader reader;
+ private final OrcProto.RowIndex[] indexes;
+ private final OrcProto.BloomFilterIndex[] bloomFilterIndices;
+ private final SargApplier sargApp;
+  // an array recording which row groups are included; null means read them all
+ private boolean[] includedRowGroups = null;
+ private final DataReader dataReader;
+
+ /**
+ * Given a list of column names, find the given column and return the index.
+ *
+ * @param columnNames the list of potential column names
+ * @param columnName the column name to look for
+ * @param rootColumn offset the result with the rootColumn
+ * @return the column number or -1 if the column wasn't found
+ */
+ static int findColumns(String[] columnNames,
+ String columnName,
+ int rootColumn) {
+ for(int i=0; i < columnNames.length; ++i) {
+ if (columnName.equals(columnNames[i])) {
+ return i + rootColumn;
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Find the mapping from predicate leaves to columns.
+ * @param sargLeaves the search argument that we need to map
+ * @param columnNames the names of the columns
+ * @param rootColumn the offset of the top level row, which offsets the
+ * result
+ * @return an array mapping the sarg leaves to concrete column numbers
+ */
+ public static int[] mapSargColumnsToOrcInternalColIdx(List<PredicateLeaf> sargLeaves,
+ String[] columnNames,
+ int rootColumn) {
+ int[] result = new int[sargLeaves.size()];
+ Arrays.fill(result, -1);
+ for(int i=0; i < result.length; ++i) {
+ String colName = sargLeaves.get(i).getColumnName();
+ result[i] = findColumns(columnNames, colName, rootColumn);
+ }
+ return result;
+ }
+
+ protected RecordReaderImpl(ReaderImpl fileReader,
+ Reader.Options options) throws IOException {
+ SchemaEvolution treeReaderSchema;
+ this.included = options.getInclude();
+ included[0] = true;
+ if (options.getSchema() == null) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Schema on read not provided -- using file schema " +
+ fileReader.getSchema());
+ }
+ treeReaderSchema = new SchemaEvolution(fileReader.getSchema(), included);
+    } else {
+      // Now that we are creating a record reader for a file, validate that the
+      // schema to read is compatible with the file schema.
+      treeReaderSchema = new SchemaEvolution(fileReader.getSchema(),
+          options.getSchema(), included);
+    }
+ this.schema = treeReaderSchema.getReaderSchema();
+ this.path = fileReader.path;
+ this.codec = fileReader.codec;
+ this.types = fileReader.types;
+ this.bufferSize = fileReader.bufferSize;
+ this.rowIndexStride = fileReader.rowIndexStride;
+ SearchArgument sarg = options.getSearchArgument();
+ if (sarg != null && rowIndexStride != 0) {
+ sargApp = new SargApplier(
+ sarg, options.getColumnNames(), rowIndexStride, types,
+ included.length);
+ } else {
+ sargApp = null;
+ }
+ long rows = 0;
+ long skippedRows = 0;
+ long offset = options.getOffset();
+ long maxOffset = options.getMaxOffset();
+ for(StripeInformation stripe: fileReader.getStripes()) {
+ long stripeStart = stripe.getOffset();
+ if (offset > stripeStart) {
+ skippedRows += stripe.getNumberOfRows();
+ } else if (stripeStart < maxOffset) {
+ this.stripes.add(stripe);
+ rows += stripe.getNumberOfRows();
+ }
+ }
+
+ Boolean zeroCopy = options.getUseZeroCopy();
+ if (zeroCopy == null) {
+ zeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(fileReader.conf);
+ }
+ if (options.getDataReader() != null) {
+ this.dataReader = options.getDataReader();
+ } else {
+ this.dataReader = RecordReaderUtils.createDefaultDataReader(
+ DataReaderProperties.builder()
+ .withBufferSize(bufferSize)
+ .withCompression(fileReader.compressionKind)
+ .withFileSystem(fileReader.fileSystem)
+ .withPath(fileReader.path)
+ .withTypeCount(types.size())
+ .withZeroCopy(zeroCopy)
+ .build());
+ }
+ this.dataReader.open();
+
+ firstRow = skippedRows;
+ totalRowCount = rows;
+ Boolean skipCorrupt = options.getSkipCorruptRecords();
+ if (skipCorrupt == null) {
+ skipCorrupt = OrcConf.SKIP_CORRUPT_DATA.getBoolean(fileReader.conf);
+ }
+
+ reader = TreeReaderFactory.createTreeReader(treeReaderSchema.getReaderSchema(),
+ treeReaderSchema, included, skipCorrupt);
+ indexes = new OrcProto.RowIndex[types.size()];
+ bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
+ advanceToNextRow(reader, 0L, true);
+ }
+
+ public static final class PositionProviderImpl implements PositionProvider {
+ private final OrcProto.RowIndexEntry entry;
+ private int index;
+
+ public PositionProviderImpl(OrcProto.RowIndexEntry entry) {
+ this(entry, 0);
+ }
+
+ public PositionProviderImpl(OrcProto.RowIndexEntry entry, int startPos) {
+ this.entry = entry;
+ this.index = startPos;
+ }
+
+ @Override
+ public long getNext() {
+ return entry.getPositions(index++);
+ }
+ }
+
+ public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe
+ ) throws IOException {
+ return dataReader.readStripeFooter(stripe);
+ }
+
+ enum Location {
+ BEFORE, MIN, MIDDLE, MAX, AFTER
+ }
+
+ /**
+ * Given a point and min and max, determine if the point is before, at the
+ * min, in the middle, at the max, or after the range.
+ * @param point the point to test
+ * @param min the minimum point
+ * @param max the maximum point
+   * @param <T> the type of the comparison
+ * @return the location of the point
+ */
+ static <T> Location compareToRange(Comparable<T> point, T min, T max) {
+ int minCompare = point.compareTo(min);
+ if (minCompare < 0) {
+ return Location.BEFORE;
+ } else if (minCompare == 0) {
+ return Location.MIN;
+ }
+ int maxCompare = point.compareTo(max);
+ if (maxCompare > 0) {
+ return Location.AFTER;
+ } else if (maxCompare == 0) {
+ return Location.MAX;
+ }
+ return Location.MIDDLE;
+ }
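+
+  // Example (illustrative): with min = 10 and max = 20, a point of 5 is BEFORE,
+  // 10 is MIN, 15 is MIDDLE, 20 is MAX, and 25 is AFTER.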
+
+ /**
+ * Get the maximum value out of an index entry.
+ * @param index
+ * the index entry
+ * @return the object for the maximum value or null if there isn't one
+ */
+ static Object getMax(ColumnStatistics index) {
+ if (index instanceof IntegerColumnStatistics) {
+ return ((IntegerColumnStatistics) index).getMaximum();
+ } else if (index instanceof DoubleColumnStatistics) {
+ return ((DoubleColumnStatistics) index).getMaximum();
+ } else if (index instanceof StringColumnStatistics) {
+ return ((StringColumnStatistics) index).getMaximum();
+ } else if (index instanceof DateColumnStatistics) {
+ return ((DateColumnStatistics) index).getMaximum();
+ } else if (index instanceof DecimalColumnStatistics) {
+ return ((DecimalColumnStatistics) index).getMaximum();
+ } else if (index instanceof TimestampColumnStatistics) {
+ return ((TimestampColumnStatistics) index).getMaximum();
+ } else if (index instanceof BooleanColumnStatistics) {
+ if (((BooleanColumnStatistics)index).getTrueCount()!=0) {
+ return Boolean.TRUE;
+ } else {
+ return Boolean.FALSE;
+ }
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * Get the minimum value out of an index entry.
+ * @param index
+ * the index entry
+ * @return the object for the minimum value or null if there isn't one
+ */
+ static Object getMin(ColumnStatistics index) {
+ if (index instanceof IntegerColumnStatistics) {
+ return ((IntegerColumnStatistics) index).getMinimum();
+ } else if (index instanceof DoubleColumnStatistics) {
+ return ((DoubleColumnStatistics) index).getMinimum();
+ } else if (index instanceof StringColumnStatistics) {
+ return ((StringColumnStatistics) index).getMinimum();
+ } else if (index instanceof DateColumnStatistics) {
+ return ((DateColumnStatistics) index).getMinimum();
+ } else if (index instanceof DecimalColumnStatistics) {
+ return ((DecimalColumnStatistics) index).getMinimum();
+ } else if (index instanceof TimestampColumnStatistics) {
+ return ((TimestampColumnStatistics) index).getMinimum();
+ } else if (index instanceof BooleanColumnStatistics) {
+ if (((BooleanColumnStatistics)index).getFalseCount()!=0) {
+ return Boolean.FALSE;
+ } else {
+ return Boolean.TRUE;
+ }
+ } else {
+ return UNKNOWN_VALUE; // null is not safe here
+ }
+ }
+
+ /**
+ * Evaluate a predicate with respect to the statistics from the column
+ * that is referenced in the predicate.
+ * @param statsProto the statistics for the column mentioned in the predicate
+   * @param predicate the leaf predicate we need to evaluate
+   * @param bloomFilter the bloom filter for the column, or null if there is none
+ * @return the set of truth values that may be returned for the given
+ * predicate.
+ */
+ static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
+ PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) {
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto);
+ Object minValue = getMin(cs);
+ Object maxValue = getMax(cs);
+ BloomFilterIO bf = null;
+ if (bloomFilter != null) {
+ bf = new BloomFilterIO(bloomFilter);
+ }
+ return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf);
+ }
+
+ /**
+ * Evaluate a predicate with respect to the statistics from the column
+ * that is referenced in the predicate.
+ * @param stats the statistics for the column mentioned in the predicate
+   * @param predicate the leaf predicate we need to evaluate
+ * @return the set of truth values that may be returned for the given
+ * predicate.
+ */
+ public static TruthValue evaluatePredicate(ColumnStatistics stats,
+ PredicateLeaf predicate,
+ BloomFilterIO bloomFilter) {
+ Object minValue = getMin(stats);
+ Object maxValue = getMax(stats);
+ return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter);
+ }
+
+ static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min,
+ Object max, boolean hasNull, BloomFilterIO bloomFilter) {
+ // if we didn't have any values, everything must have been null
+ if (min == null) {
+ if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) {
+ return TruthValue.YES;
+ } else {
+ return TruthValue.NULL;
+ }
+ } else if (min == UNKNOWN_VALUE) {
+ return TruthValue.YES_NO_NULL;
+ }
+
+ TruthValue result;
+ Object baseObj = predicate.getLiteral();
+ try {
+ // Predicate object and stats objects are converted to the type of the predicate object.
+ Object minValue = getBaseObjectForComparison(predicate.getType(), min);
+ Object maxValue = getBaseObjectForComparison(predicate.getType(), max);
+ Object predObj = getBaseObjectForComparison(predicate.getType(), baseObj);
+
+ result = evaluatePredicateMinMax(predicate, predObj, minValue, maxValue, hasNull);
+ if (shouldEvaluateBloomFilter(predicate, result, bloomFilter)) {
+ result = evaluatePredicateBloomFilter(predicate, predObj, bloomFilter, hasNull);
+ }
+ // in case failed conversion, return the default YES_NO_NULL truth value
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ final String statsType = min == null ?
+ (max == null ? "null" : max.getClass().getSimpleName()) :
+ min.getClass().getSimpleName();
+ final String predicateType = baseObj == null ? "null" : baseObj.getClass().getSimpleName();
+ final String reason = e.getClass().getSimpleName() + " when evaluating predicate." +
+ " Skipping ORC PPD." +
+ " Exception: " + e.getMessage() +
+ " StatsType: " + statsType +
+ " PredicateType: " + predicateType;
+ LOG.warn(reason);
+ LOG.debug(reason, e);
+ }
+ if (predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS) || !hasNull) {
+ result = TruthValue.YES_NO;
+ } else {
+ result = TruthValue.YES_NO_NULL;
+ }
+ }
+ return result;
+ }
+
+ private static boolean shouldEvaluateBloomFilter(PredicateLeaf predicate,
+ TruthValue result, BloomFilterIO bloomFilter) {
+ // evaluate bloom filter only when
+ // 1) Bloom filter is available
+ // 2) Min/Max evaluation yield YES or MAYBE
+ // 3) Predicate is EQUALS or IN list
+ if (bloomFilter != null
+ && result != TruthValue.NO_NULL && result != TruthValue.NO
+ && (predicate.getOperator().equals(PredicateLeaf.Operator.EQUALS)
+ || predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS)
+ || predicate.getOperator().equals(PredicateLeaf.Operator.IN))) {
+ return true;
+ }
+ return false;
+ }
+
+ private static TruthValue evaluatePredicateMinMax(PredicateLeaf predicate, Object predObj,
+ Object minValue,
+ Object maxValue,
+ boolean hasNull) {
+ Location loc;
+
+ switch (predicate.getOperator()) {
+ case NULL_SAFE_EQUALS:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.BEFORE || loc == Location.AFTER) {
+ return TruthValue.NO;
+ } else {
+ return TruthValue.YES_NO;
+ }
+ case EQUALS:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (minValue.equals(maxValue) && loc == Location.MIN) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ } else if (loc == Location.BEFORE || loc == Location.AFTER) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ case LESS_THAN:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.AFTER) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ } else if (loc == Location.BEFORE || loc == Location.MIN) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ case LESS_THAN_EQUALS:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.AFTER || loc == Location.MAX) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ } else if (loc == Location.BEFORE) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ case IN:
+ if (minValue.equals(maxValue)) {
+ // for a single value, look through to see if that value is in the
+ // set
+ for (Object arg : predicate.getLiteralList()) {
+ predObj = getBaseObjectForComparison(predicate.getType(), arg);
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.MIN) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ }
+ }
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ // are all of the values outside of the range?
+ for (Object arg : predicate.getLiteralList()) {
+ predObj = getBaseObjectForComparison(predicate.getType(), arg);
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.MIN || loc == Location.MIDDLE ||
+ loc == Location.MAX) {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ }
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ }
+ case BETWEEN:
+ List<Object> args = predicate.getLiteralList();
+ Object predObj1 = getBaseObjectForComparison(predicate.getType(), args.get(0));
+
+ loc = compareToRange((Comparable) predObj1, minValue, maxValue);
+ if (loc == Location.BEFORE || loc == Location.MIN) {
+ Object predObj2 = getBaseObjectForComparison(predicate.getType(), args.get(1));
+
+ Location loc2 = compareToRange((Comparable) predObj2, minValue, maxValue);
+ if (loc2 == Location.AFTER || loc2 == Location.MAX) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ } else if (loc2 == Location.BEFORE) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ } else if (loc == Location.AFTER) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ case IS_NULL:
+ // min = null condition above handles the all-nulls YES case
+ return hasNull ? TruthValue.YES_NO : TruthValue.NO;
+ default:
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ }
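+
+  // Example (illustrative): for EQUALS with literal 50 against a row group with
+  // min = 0, max = 10 and hasNull = false, the location is AFTER, so the result
+  // is TruthValue.NO and the row group can be skipped; with min = max = 50 the
+  // location is MIN and the result is TruthValue.YES.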
+
+ private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate,
+ final Object predObj, BloomFilterIO bloomFilter, boolean hasNull) {
+ switch (predicate.getOperator()) {
+ case NULL_SAFE_EQUALS:
+ // null safe equals does not return *_NULL variant. So set hasNull to false
+ return checkInBloomFilter(bloomFilter, predObj, false);
+ case EQUALS:
+ return checkInBloomFilter(bloomFilter, predObj, hasNull);
+ case IN:
+ for (Object arg : predicate.getLiteralList()) {
+        // if at least one value in the IN list exists in the bloom filter, qualify the row group/stripe
+ Object predObjItem = getBaseObjectForComparison(predicate.getType(), arg);
+ TruthValue result = checkInBloomFilter(bloomFilter, predObjItem, hasNull);
+ if (result == TruthValue.YES_NO_NULL || result == TruthValue.YES_NO) {
+ return result;
+ }
+ }
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ default:
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ }
+
+ private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) {
+ TruthValue result = hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+
+ if (predObj instanceof Long) {
+ if (bf.testLong(((Long) predObj).longValue())) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else if (predObj instanceof Double) {
+ if (bf.testDouble(((Double) predObj).doubleValue())) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else if (predObj instanceof String || predObj instanceof Text ||
+ predObj instanceof HiveDecimalWritable ||
+ predObj instanceof BigDecimal) {
+ if (bf.testString(predObj.toString())) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else if (predObj instanceof Timestamp) {
+ if (bf.testLong(((Timestamp) predObj).getTime())) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else if (predObj instanceof Date) {
+ if (bf.testLong(DateWritable.dateToDays((Date) predObj))) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else {
+ // if the predicate object is null and if hasNull says there are no nulls then return NO
+ if (predObj == null && !hasNull) {
+ result = TruthValue.NO;
+ } else {
+ result = TruthValue.YES_NO_NULL;
+ }
+ }
+
+ if (result == TruthValue.YES_NO_NULL && !hasNull) {
+ result = TruthValue.YES_NO;
+ }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Bloom filter evaluation: " + result.toString());
+ }
+
+ return result;
+ }
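+
+  // Example (illustrative): for predObj = 42L, bf.testLong(42) returning true
+  // yields YES_NO_NULL (a bloom filter can only prove absence, never presence);
+  // if the column has no nulls this is then downgraded to YES_NO.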
+
+ private static Object getBaseObjectForComparison(PredicateLeaf.Type type, Object obj) {
+ if (obj == null) {
+ return null;
+ }
+ switch (type) {
+ case BOOLEAN:
+ if (obj instanceof Boolean) {
+ return obj;
+ } else {
+        // this will only be true if the string conversion yields "true";
+        // all other values are considered false
+ return Boolean.valueOf(obj.toString());
+ }
+ case DATE:
+ if (obj instanceof Date) {
+ return obj;
+ } else if (obj instanceof String) {
+ return Date.valueOf((String) obj);
+ } else if (obj instanceof Timestamp) {
+ return DateWritable.timeToDate(((Timestamp) obj).getTime() / 1000L);
+ }
+      // otherwise it is a string; prevent comparison to numbers, since the unit
+      // (days/seconds/milliseconds) would be ambiguous
+ break;
+ case DECIMAL:
+ if (obj instanceof Boolean) {
+ return new HiveDecimalWritable(((Boolean) obj).booleanValue() ?
+ HiveDecimal.ONE : HiveDecimal.ZERO);
+ } else if (obj instanceof Integer) {
+ return new HiveDecimalWritable(((Integer) obj).intValue());
+ } else if (obj instanceof Long) {
+ return new HiveDecimalWritable(((Long) obj));
+ } else if (obj instanceof Float || obj instanceof Double ||
+ obj instanceof String) {
+ return new HiveDecimalWritable(obj.toString());
+ } else if (obj instanceof BigDecimal) {
+ return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) obj));
+ } else if (obj instanceof HiveDecimal) {
+ return new HiveDecimalWritable((HiveDecimal) obj);
+ } else if (obj instanceof HiveDecimalWritable) {
+ return obj;
+ } else if (obj instanceof Timestamp) {
+ return new HiveDecimalWritable(Double.toString(
+ TimestampUtils.getDouble((Timestamp) obj)));
+ }
+ break;
+ case FLOAT:
+ if (obj instanceof Number) {
+ // widening conversion
+ return ((Number) obj).doubleValue();
+ } else if (obj instanceof HiveDecimal) {
+ return ((HiveDecimal) obj).doubleValue();
+ } else if (obj instanceof String) {
+ return Double.valueOf(obj.toString());
+      } else if (obj instanceof Timestamp) {
+        return TimestampUtils.getDouble((Timestamp) obj);
+      } else if (obj instanceof BigDecimal) {
+        return ((BigDecimal) obj).doubleValue();
+      }
+ break;
+ case LONG:
+ if (obj instanceof Number) {
+ // widening conversion
+ return ((Number) obj).longValue();
+ } else if (obj instanceof HiveDecimal) {
+ return ((HiveDecimal) obj).longValue();
+ } else if (obj instanceof String) {
+ return Long.valueOf(obj.toString());
+ }
+ break;
+ case STRING:
+      // obj is known to be non-null here; it was checked at method entry
+      return obj.toString();
+ case TIMESTAMP:
+ if (obj instanceof Timestamp) {
+ return obj;
+ } else if (obj instanceof Integer) {
+ return new Timestamp(((Number) obj).longValue());
+ } else if (obj instanceof Float) {
+ return TimestampUtils.doubleToTimestamp(((Float) obj).doubleValue());
+ } else if (obj instanceof Double) {
+ return TimestampUtils.doubleToTimestamp(((Double) obj).doubleValue());
+ } else if (obj instanceof HiveDecimal) {
+ return TimestampUtils.decimalToTimestamp((HiveDecimal) obj);
+ } else if (obj instanceof HiveDecimalWritable) {
+ return TimestampUtils.decimalToTimestamp(((HiveDecimalWritable) obj).getHiveDecimal());
+ } else if (obj instanceof Date) {
+ return new Timestamp(((Date) obj).getTime());
+ }
+ // float/double conversion to timestamp is interpreted as seconds whereas integer conversion
+ // to timestamp is interpreted as milliseconds by default. The integer to timestamp casting
+ // is also config driven. The filter operator changes its promotion based on config:
+ // "int.timestamp.conversion.in.seconds". Disable PPD for integer cases.
+ break;
+ default:
+ break;
+ }
+
+ throw new IllegalArgumentException(String.format(
+ "ORC SARGS could not convert from %s to %s", obj == null ? "(null)" : obj.getClass()
+ .getSimpleName(), type));
+ }
+
+ public static class SargApplier {
+ public final static boolean[] READ_ALL_RGS = null;
+ public final static boolean[] READ_NO_RGS = new boolean[0];
+
+ private final SearchArgument sarg;
+ private final List<PredicateLeaf> sargLeaves;
+ private final int[] filterColumns;
+ private final long rowIndexStride;
+    // the same columns as filterColumns, expressed as a per-column boolean flag
+ private final boolean[] sargColumns;
+
+ public SargApplier(SearchArgument sarg, String[] columnNames, long rowIndexStride,
+ List<OrcProto.Type> types, int includedCount) {
+ this.sarg = sarg;
+ sargLeaves = sarg.getLeaves();
+ filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, columnNames, 0);
+ this.rowIndexStride = rowIndexStride;
+      // included will not be null here; Reader.rows() fills the array with true when it is null
+ sargColumns = new boolean[includedCount];
+ for (int i : filterColumns) {
+        // a filter column may have index -1, e.g. when the SARG references a partition column.
+ if (i > 0) {
+ sargColumns[i] = true;
+ }
+ }
+ }
+
+ /**
+ * Pick the row groups that we need to load from the current stripe.
+ *
+ * @return an array with a boolean for each row group or null if all of the
+ * row groups must be read.
+ * @throws IOException
+ */
+ public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes,
+ OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException {
+ long rowsInStripe = stripe.getNumberOfRows();
+ int groupsInStripe = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
+ boolean[] result = new boolean[groupsInStripe]; // TODO: avoid alloc?
+ TruthValue[] leafValues = new TruthValue[sargLeaves.size()];
+ boolean hasSelected = false, hasSkipped = false;
+ for (int rowGroup = 0; rowGroup < result.length; ++rowGroup) {
+ for (int pred = 0; pred < leafValues.length; ++pred) {
+ int columnIx = filterColumns[pred];
+ if (columnIx != -1) {
+ if (indexes[columnIx] == null) {
+ throw new AssertionError("Index is not populated for " + columnIx);
+ }
+ OrcProto.RowIndexEntry entry = indexes[columnIx].getEntry(rowGroup);
+ if (entry == null) {
+ throw new AssertionError("RG is not populated for " + columnIx + " rg " + rowGroup);
+ }
+ OrcProto.ColumnStatistics stats = entry.getStatistics();
+ OrcProto.BloomFilter bf = null;
+ if (bloomFilterIndices != null && bloomFilterIndices[filterColumns[pred]] != null) {
+ bf = bloomFilterIndices[filterColumns[pred]].getBloomFilter(rowGroup);
+ }
+ leafValues[pred] = evaluatePredicateProto(stats, sargLeaves.get(pred), bf);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Stats = " + stats);
+ LOG.trace("Setting " + sargLeaves.get(pred) + " to " + leafValues[pred]);
+ }
+ } else {
+ // the column is a virtual column
+ leafValues[pred] = TruthValue.YES_NO_NULL;
+ }
+ }
+ result[rowGroup] = sarg.evaluate(leafValues).isNeeded();
+ hasSelected = hasSelected || result[rowGroup];
+ hasSkipped = hasSkipped || (!result[rowGroup]);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Row group " + (rowIndexStride * rowGroup) + " to " +
+ (rowIndexStride * (rowGroup + 1) - 1) + " is " +
+ (result[rowGroup] ? "" : "not ") + "included.");
+ }
+ }
+
+ return hasSkipped ? ((hasSelected || !returnNone) ? result : READ_NO_RGS) : READ_ALL_RGS;
+ }
+ }
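+
+  // Example (illustrative): with rowIndexStride = 10,000 a stripe of 25,000 rows
+  // has three row groups (rows 0-9,999, 10,000-19,999, 20,000-24,999). If the
+  // sarg evaluates to NO for the first two groups and YES_NO for the last, the
+  // returned array is {false, false, true} and only the final group is read.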
+
+ /**
+ * Pick the row groups that we need to load from the current stripe.
+ *
+ * @return an array with a boolean for each row group or null if all of the
+ * row groups must be read.
+ * @throws IOException
+ */
+ protected boolean[] pickRowGroups() throws IOException {
+ // if we don't have a sarg or indexes, we read everything
+ if (sargApp == null) {
+ return null;
+ }
+ readRowIndex(currentStripe, included, sargApp.sargColumns);
+ return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, bloomFilterIndices, false);
+ }
+
+ private void clearStreams() {
+ // explicit close of all streams to de-ref ByteBuffers
+ for (InStream is : streams.values()) {
+ is.close();
+ }
+ if (bufferChunks != null) {
+ if (dataReader.isTrackingDiskRanges()) {
+ for (DiskRangeList range = bufferChunks; range != null; range = range.next) {
+ if (!(range instanceof BufferChunk)) {
+ continue;
+ }
+ dataReader.releaseBuffer(((BufferChunk) range).getChunk());
+ }
+ }
+ }
+ bufferChunks = null;
+ streams.clear();
+ }
+
+ /**
+ * Read the current stripe into memory.
+ *
+ * @throws IOException
+ */
+ private void readStripe() throws IOException {
+ StripeInformation stripe = beginReadStripe();
+ includedRowGroups = pickRowGroups();
+
+ // move forward to the first unskipped row
+ if (includedRowGroups != null) {
+ while (rowInStripe < rowCountInStripe &&
+ !includedRowGroups[(int) (rowInStripe / rowIndexStride)]) {
+ rowInStripe = Math.min(rowCountInStripe, rowInStripe + rowIndexStride);
+ }
+ }
+
+ // if we haven't skipped the whole stripe, read the data
+ if (rowInStripe < rowCountInStripe) {
+ // if we aren't projecting columns or filtering rows, just read it all
+ if (included == null && includedRowGroups == null) {
+ readAllDataStreams(stripe);
+ } else {
+ readPartialDataStreams(stripe);
+ }
+ reader.startStripe(streams, stripeFooter);
+ // if we skipped the first row group, move the pointers forward
+ if (rowInStripe != 0) {
+ seekToRowEntry(reader, (int) (rowInStripe / rowIndexStride));
+ }
+ }
+ }
+
+ private StripeInformation beginReadStripe() throws IOException {
+ StripeInformation stripe = stripes.get(currentStripe);
+ stripeFooter = readStripeFooter(stripe);
+ clearStreams();
+ // setup the position in the stripe
+ rowCountInStripe = stripe.getNumberOfRows();
+ rowInStripe = 0;
+ rowBaseInStripe = 0;
+ for (int i = 0; i < currentStripe; ++i) {
+ rowBaseInStripe += stripes.get(i).getNumberOfRows();
+ }
+ // reset all of the indexes
+ for (int i = 0; i < indexes.length; ++i) {
+ indexes[i] = null;
+ }
+ return stripe;
+ }
+
+ private void readAllDataStreams(StripeInformation stripe) throws IOException {
+ long start = stripe.getIndexLength();
+ long end = start + stripe.getDataLength();
+ // explicitly trigger 1 big read
+ DiskRangeList toRead = new DiskRangeList(start, end);
+ bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
+ List<OrcProto.Stream> streamDescriptions = stripeFooter.getStreamsList();
+ createStreams(streamDescriptions, bufferChunks, null, codec, bufferSize, streams);
+ }
+
+ /**
+ * Plan the ranges of the file that we need to read given the list of
+ * columns and row groups.
+ *
+ * @param streamList the list of streams available
+ * @param indexes the indexes that have been loaded
+ * @param includedColumns which columns are needed
+ * @param includedRowGroups which row groups are needed
+ * @param isCompressed does the file have generic compression
+ * @param encodings the encodings for each column
+ * @param types the types of the columns
+ * @param compressionSize the compression block size
+ * @return the list of disk ranges that will be loaded
+ */
+ static DiskRangeList planReadPartialDataStreams
+ (List<OrcProto.Stream> streamList,
+ OrcProto.RowIndex[] indexes,
+ boolean[] includedColumns,
+ boolean[] includedRowGroups,
+ boolean isCompressed,
+ List<OrcProto.ColumnEncoding> encodings,
+ List<OrcProto.Type> types,
+ int compressionSize,
+ boolean doMergeBuffers) {
+ long offset = 0;
+ // figure out which columns have a present stream
+ boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types);
+ CreateHelper list = new CreateHelper();
+ for (OrcProto.Stream stream : streamList) {
+ long length = stream.getLength();
+ int column = stream.getColumn();
+ OrcProto.Stream.Kind streamKind = stream.getKind();
+ // since stream kind is optional, first check if it exists
+ if (stream.hasKind() &&
+ (StreamName.getArea(streamKind) == StreamName.Area.DATA) &&
+ (column < includedColumns.length && includedColumns[column])) {
+ // if we aren't filtering or it is a dictionary, load it.
+ if (includedRowGroups == null
+ || RecordReaderUtils.isDictionary(streamKind, encodings.get(column))) {
+ RecordReaderUtils.addEntireStreamToRanges(offset, length, list, doMergeBuffers);
+ } else {
+ RecordReaderUtils.addRgFilteredStreamToRanges(stream, includedRowGroups,
+ isCompressed, indexes[column], encodings.get(column), types.get(column),
+ compressionSize, hasNull[column], offset, length, list, doMergeBuffers);
+ }
+ }
+ offset += length;
+ }
+ return list.extract();
+ }
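
Note that stream offsets within a stripe are implicit: each stream starts where the previous one ends, which is why the loop above keeps a running offset even for streams it skips. A sketch under assumed stream lengths:

    // Illustrative only: three hypothetical streams of 100, 250, and 40 bytes.
    long[] streamLengths = {100, 250, 40};
    long offset = 0;
    for (long length : streamLengths) {
      System.out.println("stream range: [" + offset + ", " + (offset + length) + ")");
      offset += length;   // the next stream begins where this one ends
    }
    // prints [0, 100), [100, 350), [350, 390)
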
+
+ void createStreams(List<OrcProto.Stream> streamDescriptions,
+ DiskRangeList ranges,
+ boolean[] includeColumn,
+ CompressionCodec codec,
+ int bufferSize,
+ Map<StreamName, InStream> streams) throws IOException {
+ long streamOffset = 0;
+ for (OrcProto.Stream streamDesc : streamDescriptions) {
+ int column = streamDesc.getColumn();
+ if ((includeColumn != null &&
+ (column < includeColumn.length && !includeColumn[column])) ||
+ streamDesc.hasKind() &&
+ (StreamName.getArea(streamDesc.getKind()) != StreamName.Area.DATA)) {
+ streamOffset += streamDesc.getLength();
+ continue;
+ }
+ List<DiskRange> buffers = RecordReaderUtils.getStreamBuffers(
+ ranges, streamOffset, streamDesc.getLength());
+ StreamName name = new StreamName(column, streamDesc.getKind());
+ streams.put(name, InStream.create(name.toString(), buffers,
+ streamDesc.getLength(), codec, bufferSize));
+ streamOffset += streamDesc.getLength();
+ }
+ }
+
+ private void readPartialDataStreams(StripeInformation stripe) throws IOException {
+ List<OrcProto.Stream> streamList = stripeFooter.getStreamsList();
+ DiskRangeList toRead = planReadPartialDataStreams(streamList,
+ indexes, included, includedRowGroups, codec != null,
+ stripeFooter.getColumnsList(), types, bufferSize, true);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("chunks = " + RecordReaderUtils.stringifyDiskRanges(toRead));
+ }
+ bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("merge = " + RecordReaderUtils.stringifyDiskRanges(bufferChunks));
+ }
+
+ createStreams(streamList, bufferChunks, included, codec, bufferSize, streams);
+ }
+
+ /**
+ * Read the next stripe until we find a row that we don't skip.
+ *
+ * @throws IOException
+ */
+ private void advanceStripe() throws IOException {
+ rowInStripe = rowCountInStripe;
+ while (rowInStripe >= rowCountInStripe &&
+ currentStripe < stripes.size() - 1) {
+ currentStripe += 1;
+ readStripe();
+ }
+ }
+
+ /**
+ * Skip over rows that we aren't selecting, so that the next row is
+ * one that we will read.
+ *
+ * @param reader the tree reader to position
+ * @param nextRow the row we want to go to
+ * @param canAdvanceStripe whether the reader may move on to the next stripe
+ * @return true if the reader ends up positioned on a row that will be read
+ * @throws IOException
+ */
+ private boolean advanceToNextRow(
+ TreeReaderFactory.TreeReader reader, long nextRow, boolean canAdvanceStripe)
+ throws IOException {
+ long nextRowInStripe = nextRow - rowBaseInStripe;
+ // check for row skipping
+ if (rowIndexStride != 0 &&
+ includedRowGroups != null &&
+ nextRowInStripe < rowCountInStripe) {
+ int rowGroup = (int) (nextRowInStripe / rowIndexStride);
+ if (!includedRowGroups[rowGroup]) {
+ while (rowGroup < includedRowGroups.length && !includedRowGroups[rowGroup]) {
+ rowGroup += 1;
+ }
+ if (rowGroup >= includedRowGroups.length) {
+ if (canAdvanceStripe) {
+ advanceStripe();
+ }
+ return canAdvanceStripe;
+ }
+ nextRowInStripe = Math.min(rowCountInStripe, rowGroup * rowIndexStride);
+ }
+ }
+ if (nextRowInStripe >= rowCountInStripe) {
+ if (canAdvanceStripe) {
+ advanceStripe();
+ }
+ return canAdvanceStripe;
+ }
+ if (nextRowInStripe != rowInStripe) {
+ if (rowIndexStride != 0) {
+ int rowGroup = (int) (nextRowInStripe / rowIndexStride);
+ seekToRowEntry(reader, rowGroup);
+ reader.skipRows(nextRowInStripe - rowGroup * rowIndexStride);
+ } else {
+ reader.skipRows(nextRowInStripe - rowInStripe);
+ }
+ rowInStripe = nextRowInStripe;
+ }
+ return true;
+ }
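
The positioning above happens in two steps: seek to the index entry of the row group that contains the target row, then skip the remaining rows within that group. With an assumed stride of 10,000 and an assumed target of row 23,457 within the stripe:

    // Illustrative arithmetic only; the stride and target row are assumed.
    int rowIndexStride = 10000;
    long nextRowInStripe = 23457;
    int rowGroup = (int) (nextRowInStripe / rowIndexStride);              // 2
    long remainder = nextRowInStripe - (long) rowGroup * rowIndexStride;  // 3457
    // seekToRowEntry(reader, 2) lands on row 20000, and reader.skipRows(3457)
    // then positions the reader on row 23457.
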
+
+ @Override
+ public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+ try {
+ if (rowInStripe >= rowCountInStripe) {
+ currentStripe += 1;
+ if (currentStripe >= stripes.size()) {
+ batch.size = 0;
+ return false;
+ }
+ readStripe();
+ }
+
+ int batchSize = computeBatchSize(batch.getMaxSize());
+
+ rowInStripe += batchSize;
+ reader.setVectorColumnCount(batch.getDataColumnCount());
+ reader.nextBatch(batch, batchSize);
+ batch.selectedInUse = false;
+ batch.size = batchSize;
+ advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true);
+ return batch.size != 0;
+ } catch (IOException e) {
+ // Rethrow the exception with the file name included in the message
+ throw new IOException("Error reading file: " + path, e);
+ }
+ }
+
+ private int computeBatchSize(long targetBatchSize) {
+ final int batchSize;
+ // In case of PPD, the batch size should be aware of row group boundaries. If only a
+ // subset of row groups is selected, the marker position is set to the end of that range
+ // (the subset of row groups within the stripe). Computing the batch size from the marker
+ // position ensures it respects row group boundaries and will not overflow into unselected
+ // rows. An illustration of this case is at https://issues.apache.org/jira/browse/HIVE-6287
+ if (rowIndexStride != 0 && includedRowGroups != null && rowInStripe < rowCountInStripe) {
+ int startRowGroup = (int) (rowInStripe / rowIndexStride);
+ if (!includedRowGroups[startRowGroup]) {
+ while (startRowGroup < includedRowGroups.length && !includedRowGroups[startRowGroup]) {
+ startRowGroup += 1;
+ }
+ }
+
+ int endRowGroup = startRowGroup;
+ while (endRowGroup < includedRowGroups.length && includedRowGroups[endRowGroup]) {
+ endRowGroup += 1;
+ }
+
+ final long markerPosition =
+ (endRowGroup * rowIndexStride) < rowCountInStripe ? (endRowGroup * rowIndexStride)
+ : rowCountInStripe;
+ batchSize = (int) Math.min(targetBatchSize, (markerPosition - rowInStripe));
+
+ if (isLogDebugEnabled && batchSize < targetBatchSize) {
+ LOG.debug("markerPosition: " + markerPosition + " batchSize: " + batchSize);
+ }
+ } else {
+ batchSize = (int) Math.min(targetBatchSize, (rowCountInStripe - rowInStripe));
+ }
+ return batchSize;
+ }
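
A worked example of that boundary clamp, with an assumed stride of 10,000 and an assumed batch target of 1,024 rows: if only the second row group is selected and the reader sits at row 19,500, the marker is the end of that group (row 20,000), so the batch is clamped to 500 rows rather than running into an unselected group.

    // Illustrative only; all values are assumed.
    long rowIndexStride = 10000, rowCountInStripe = 50000, rowInStripe = 19500;
    boolean[] includedRowGroups = {false, true, false, false, false};
    int startRowGroup = (int) (rowInStripe / rowIndexStride);   // 1, which is included
    int endRowGroup = startRowGroup;
    while (endRowGroup < includedRowGroups.length && includedRowGroups[endRowGroup]) {
      endRowGroup += 1;                                         // stops at 2
    }
    long markerPosition = Math.min(endRowGroup * rowIndexStride, rowCountInStripe); // 20000
    int batchSize = (int) Math.min(1024, markerPosition - rowInStripe);             // 500
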
+
+ @Override
+ public void close() throws IOException {
+ clearStreams();
+ dataReader.close();
+ }
+
+ @Override
+ public long getRowNumber() {
+ return rowInStripe + rowBaseInStripe + firstRow;
+ }
+
+ /**
+ * Return the fraction of rows that have been read from the selected
+ * section of the file.
+ *
+ * @return fraction between 0.0 and 1.0 of rows consumed
+ */
+ @Override
+ public float getProgress() {
+ return ((float) rowBaseInStripe + rowInStripe) / totalRowCount;
+ }
+
+ private int findStripe(long rowNumber) {
+ for (int i = 0; i < stripes.size(); i++) {
+ StripeInformation stripe = stripes.get(i);
+ if (stripe.getNumberOfRows() > rowNumber) {
+ return i;
+ }
+ rowNumber -= stripe.getNumberOfRows();
+ }
+ throw new IllegalArgumentException("Seek after the end of reader range");
+ }
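
findStripe() maps a slice-relative row number to a stripe by walking the stripe list and subtracting each stripe's row count. For example, with assumed stripe sizes of 5,000, 5,000, and 1,000 rows, row 10,500 falls in the third stripe:

    // Illustrative only; the per-stripe row counts are assumed.
    long[] rowsPerStripe = {5000, 5000, 1000};
    long rowNumber = 10500;
    int stripe = 0;
    while (rowNumber >= rowsPerStripe[stripe]) {
      rowNumber -= rowsPerStripe[stripe];
      stripe += 1;
    }
    // stripe == 2 with rowNumber == 500 left over inside it; a row number past the
    // total would exhaust the list, which the method above reports as an
    // IllegalArgumentException.
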
+
+ public OrcIndex readRowIndex(int stripeIndex, boolean[] included,
+ boolean[] sargColumns) throws IOException {
+ return readRowIndex(stripeIndex, included, null, null, sargColumns);
+ }
+
+ public OrcIndex readRowIndex(int stripeIndex, boolean[] included,
+ OrcProto.RowIndex[] indexes,
+ OrcProto.BloomFilterIndex[] bloomFilterIndex,
+ boolean[] sargColumns) throws IOException {
+ StripeInformation stripe = stripes.get(stripeIndex);
+ OrcProto.StripeFooter stripeFooter = null;
+ // if this is the current stripe, use the cached objects.
+ if (stripeIndex == currentStripe) {
+ stripeFooter = this.stripeFooter;
+ indexes = indexes == null ? this.indexes : indexes;
+ bloomFilterIndex = bloomFilterIndex == null ? this.bloomFilterIndices : bloomFilterIndex;
+ sargColumns = sargColumns == null ?
+ (sargApp == null ? null : sargApp.sargColumns) : sargColumns;
+ }
+ return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, sargColumns,
+ bloomFilterIndex);
+ }
+
+ private void seekToRowEntry(TreeReaderFactory.TreeReader reader, int rowEntry)
+ throws IOException {
+ PositionProvider[] index = new PositionProvider[indexes.length];
+ for (int i = 0; i < indexes.length; ++i) {
+ if (indexes[i] != null) {
+ index[i] = new PositionProviderImpl(indexes[i].getEntry(rowEntry));
+ }
+ }
+ reader.seek(index);
+ }
+
+ @Override
+ public void seekToRow(long rowNumber) throws IOException {
+ if (rowNumber < 0) {
+ throw new IllegalArgumentException("Seek to a negative row number " +
+ rowNumber);
+ } else if (rowNumber < firstRow) {
+ throw new IllegalArgumentException("Seek before reader range " +
+ rowNumber);
+ }
+ // convert to our internal form (rows from the beginning of slice)
+ rowNumber -= firstRow;
+
+ // move to the right stripe
+ int rightStripe = findStripe(rowNumber);
+ if (rightStripe != currentStripe) {
+ currentStripe = rightStripe;
+ readStripe();
+ }
+ readRowIndex(currentStripe, included, sargApp == null ? null : sargApp.sargColumns);
+
+ // if we aren't to the right row yet, advance in the stripe.
+ advanceToNextRow(reader, rowNumber, true);
+ }
+
+ private static final String TRANSLATED_SARG_SEPARATOR = "_";
+ public static String encodeTranslatedSargColumn(int rootColumn, Integer indexInSourceTable) {
+ return rootColumn + TRANSLATED_SARG_SEPARATOR
+ + ((indexInSourceTable == null) ? -1 : indexInSourceTable);
+ }
+
+ public static int[] mapTranslatedSargColumns(
+ List<OrcProto.Type> types, List<PredicateLeaf> sargLeaves) {
+ int[] result = new int[sargLeaves.size()];
+ OrcProto.Type lastRoot = null; // The root will be the same for all leaves as of now.
+ String lastRootStr = null;
+ for (int i = 0; i < result.length; ++i) {
+ String[] rootAndIndex = sargLeaves.get(i).getColumnName().split(TRANSLATED_SARG_SEPARATOR);
+ assert rootAndIndex.length == 2;
+ String rootStr = rootAndIndex[0], indexStr = rootAndIndex[1];
+ int index = Integer.parseInt(indexStr);
+ // First, check if the column even maps to anything.
+ if (index == -1) {
+ result[i] = -1;
+ continue;
+ }
+ assert index >= 0;
+ // Then, find the root type if needed.
+ if (!rootStr.equals(lastRootStr)) {
+ lastRoot = types.get(Integer.parseInt(rootStr));
+ lastRootStr = rootStr;
+ }
+ // Subtypes of the root type correspond, in order, to the columns in the table schema
+ // (disregarding schema evolution, which doesn't presently work). Get the index of the
+ // corresponding subtype.
+ result[i] = lastRoot.getSubtypes(index);
+ }
+ return result;
+ }
+}
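
The translated SARG column helpers above round-trip a (root column, index in source table) pair through a single string of the form "<root>_<index>", with -1 standing in for an unmapped column. A brief usage sketch (the enclosing class is assumed to be the reader implementation shown above):

    // Assumed usage of the static helpers from the diff above.
    String mapped = encodeTranslatedSargColumn(0, 2);       // "0_2"
    String unmapped = encodeTranslatedSargColumn(0, null);  // "0_-1"
    // mapTranslatedSargColumns() later splits each leaf's column name on "_",
    // treats -1 as "maps to nothing", and resolves other indexes through the
    // root type's subtypes.
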
[03/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/ql/src/test/resources/orc-file-dump-bloomfilter.out b/ql/src/test/resources/orc-file-dump-bloomfilter.out
deleted file mode 100644
index 18fd2fb..0000000
--- a/ql/src/test/resources/orc-file-dump-bloomfilter.out
+++ /dev/null
@@ -1,179 +0,0 @@
-Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
-Rows: 21000
-Compression: ZLIB
-Compression size: 4096
-Type: struct<i:int,l:bigint,s:string>
-
-Stripe Statistics:
- Stripe 1:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146021688 max: 2147223299 sum: 515792826
- Column 2: count: 5000 hasNull: false min: -9218592812243954469 max: 9221614132680747961
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19280
- Stripe 2:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146733128 max: 2147001622 sum: 7673427
- Column 2: count: 5000 hasNull: false min: -9220818777591257749 max: 9222259462014003839
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19504
- Stripe 3:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146993718 max: 2147378179 sum: 132660742551
- Column 2: count: 5000 hasNull: false min: -9218342074710552826 max: 9222303228623055266
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19641
- Stripe 4:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146658006 max: 2145520931 sum: 8533549236
- Column 2: count: 5000 hasNull: false min: -9222758097219661129 max: 9221043130193737406
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19470
- Stripe 5:
- Column 0: count: 1000 hasNull: false
- Column 1: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363
- Column 2: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476
- Column 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866
-
-File Statistics:
- Column 0: count: 21000 hasNull: false
- Column 1: count: 21000 hasNull: false min: -2146993718 max: 2147378179 sum: 193017464403
- Column 2: count: 21000 hasNull: false min: -9222758097219661129 max: 9222303228623055266
- Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
-
-Stripes:
- Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 951
- Stream: column 0 section ROW_INDEX start: 3 length 17
- Stream: column 1 section ROW_INDEX start: 20 length 166
- Stream: column 2 section ROW_INDEX start: 186 length 169
- Stream: column 3 section ROW_INDEX start: 355 length 87
- Stream: column 3 section BLOOM_FILTER start: 442 length 512
- Stream: column 1 section DATA start: 954 length 20035
- Stream: column 2 section DATA start: 20989 length 40050
- Stream: column 3 section DATA start: 61039 length 3543
- Stream: column 3 section LENGTH start: 64582 length 25
- Stream: column 3 section DICTIONARY_DATA start: 64607 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3862 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3884 positions: 0,659,149
- Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3893 positions: 0,1531,3
- Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3798 positions: 0,2281,32
- Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3843 positions: 0,3033,45
- Bloom filters for column 3:
- Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 64826 data: 63775 rows: 5000 tail: 86 index: 944
- Stream: column 0 section ROW_INDEX start: 64826 length 17
- Stream: column 1 section ROW_INDEX start: 64843 length 164
- Stream: column 2 section ROW_INDEX start: 65007 length 168
- Stream: column 3 section ROW_INDEX start: 65175 length 83
- Stream: column 3 section BLOOM_FILTER start: 65258 length 512
- Stream: column 1 section DATA start: 65770 length 20035
- Stream: column 2 section DATA start: 85805 length 40050
- Stream: column 3 section DATA start: 125855 length 3532
- Stream: column 3 section LENGTH start: 129387 length 25
- Stream: column 3 section DICTIONARY_DATA start: 129412 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3923 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3869 positions: 0,761,12
- Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,1472,70
- Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3931 positions: 0,2250,43
- Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3964 positions: 0,2978,88
- Bloom filters for column 3:
- Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 129631 data: 63787 rows: 5000 tail: 86 index: 950
- Stream: column 0 section ROW_INDEX start: 129631 length 17
- Stream: column 1 section ROW_INDEX start: 129648 length 163
- Stream: column 2 section ROW_INDEX start: 129811 length 168
- Stream: column 3 section ROW_INDEX start: 129979 length 90
- Stream: column 3 section BLOOM_FILTER start: 130069 length 512
- Stream: column 1 section DATA start: 130581 length 20035
- Stream: column 2 section DATA start: 150616 length 40050
- Stream: column 3 section DATA start: 190666 length 3544
- Stream: column 3 section LENGTH start: 194210 length 25
- Stream: column 3 section DICTIONARY_DATA start: 194235 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 4008 positions: 0,634,174
- Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3999 positions: 0,1469,69
- Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,2133,194
- Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 4000 positions: 0,3005,43
- Bloom filters for column 3:
- Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 194454 data: 63817 rows: 5000 tail: 86 index: 952
- Stream: column 0 section ROW_INDEX start: 194454 length 17
- Stream: column 1 section ROW_INDEX start: 194471 length 165
- Stream: column 2 section ROW_INDEX start: 194636 length 167
- Stream: column 3 section ROW_INDEX start: 194803 length 91
- Stream: column 3 section BLOOM_FILTER start: 194894 length 512
- Stream: column 1 section DATA start: 195406 length 20035
- Stream: column 2 section DATA start: 215441 length 40050
- Stream: column 3 section DATA start: 255491 length 3574
- Stream: column 3 section LENGTH start: 259065 length 25
- Stream: column 3 section DICTIONARY_DATA start: 259090 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3901 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3900 positions: 0,431,431
- Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3909 positions: 0,1485,52
- Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3947 positions: 0,2196,104
- Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3813 positions: 0,2934,131
- Bloom filters for column 3:
- Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 259309 data: 12943 rows: 1000 tail: 78 index: 432
- Stream: column 0 section ROW_INDEX start: 259309 length 12
- Stream: column 1 section ROW_INDEX start: 259321 length 38
- Stream: column 2 section ROW_INDEX start: 259359 length 41
- Stream: column 3 section ROW_INDEX start: 259400 length 40
- Stream: column 3 section BLOOM_FILTER start: 259440 length 301
- Stream: column 1 section DATA start: 259741 length 4007
- Stream: column 2 section DATA start: 263748 length 8010
- Stream: column 3 section DATA start: 271758 length 768
- Stream: column 3 section LENGTH start: 272526 length 25
- Stream: column 3 section DICTIONARY_DATA start: 272551 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866 positions: 0,0,0
- Bloom filters for column 3:
- Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-
-File length: 273307 bytes
-Padding length: 0 bytes
-Padding ratio: 0%
-________________________________________________________________________________________________________________________
-
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/ql/src/test/resources/orc-file-dump-bloomfilter2.out b/ql/src/test/resources/orc-file-dump-bloomfilter2.out
deleted file mode 100644
index fa5cc2d..0000000
--- a/ql/src/test/resources/orc-file-dump-bloomfilter2.out
+++ /dev/null
@@ -1,179 +0,0 @@
-Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
-Rows: 21000
-Compression: ZLIB
-Compression size: 4096
-Type: struct<i:int,l:bigint,s:string>
-
-Stripe Statistics:
- Stripe 1:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146021688 max: 2147223299 sum: 515792826
- Column 2: count: 5000 hasNull: false min: -9218592812243954469 max: 9221614132680747961
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19280
- Stripe 2:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146733128 max: 2147001622 sum: 7673427
- Column 2: count: 5000 hasNull: false min: -9220818777591257749 max: 9222259462014003839
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19504
- Stripe 3:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146993718 max: 2147378179 sum: 132660742551
- Column 2: count: 5000 hasNull: false min: -9218342074710552826 max: 9222303228623055266
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19641
- Stripe 4:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146658006 max: 2145520931 sum: 8533549236
- Column 2: count: 5000 hasNull: false min: -9222758097219661129 max: 9221043130193737406
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19470
- Stripe 5:
- Column 0: count: 1000 hasNull: false
- Column 1: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363
- Column 2: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476
- Column 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866
-
-File Statistics:
- Column 0: count: 21000 hasNull: false
- Column 1: count: 21000 hasNull: false min: -2146993718 max: 2147378179 sum: 193017464403
- Column 2: count: 21000 hasNull: false min: -9222758097219661129 max: 9222303228623055266
- Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
-
-Stripes:
- Stripe: offset: 3 data: 63786 rows: 5000 tail: 85 index: 6974
- Stream: column 0 section ROW_INDEX start: 3 length 17
- Stream: column 1 section ROW_INDEX start: 20 length 166
- Stream: column 2 section ROW_INDEX start: 186 length 169
- Stream: column 2 section BLOOM_FILTER start: 355 length 6535
- Stream: column 3 section ROW_INDEX start: 6890 length 87
- Stream: column 1 section DATA start: 6977 length 20035
- Stream: column 2 section DATA start: 27012 length 40050
- Stream: column 3 section DATA start: 67062 length 3543
- Stream: column 3 section LENGTH start: 70605 length 25
- Stream: column 3 section DICTIONARY_DATA start: 70630 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9200577545527640566 max: 9175500305011173751 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9203618157670445774 max: 9208123824411178101 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9218592812243954469 max: 9221351515892923972 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9206585617947511272 max: 9167703224425685487 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9206645795733282496 max: 9221614132680747961 positions: 28693,14,416
- Bloom filters for column 2:
- Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4931 loadFactor: 0.5136 expectedFpp: 0.009432924
- Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4956 loadFactor: 0.5163 expectedFpp: 0.009772834
- Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
- Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
- Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614
- Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482
- Stripe: offset: 70848 data: 63775 rows: 5000 tail: 85 index: 6965
- Stream: column 0 section ROW_INDEX start: 70848 length 17
- Stream: column 1 section ROW_INDEX start: 70865 length 164
- Stream: column 2 section ROW_INDEX start: 71029 length 168
- Stream: column 2 section BLOOM_FILTER start: 71197 length 6533
- Stream: column 3 section ROW_INDEX start: 77730 length 83
- Stream: column 1 section DATA start: 77813 length 20035
- Stream: column 2 section DATA start: 97848 length 40050
- Stream: column 3 section DATA start: 137898 length 3532
- Stream: column 3 section LENGTH start: 141430 length 25
- Stream: column 3 section DICTIONARY_DATA start: 141455 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9218450653857701562 max: 9189819526332228512 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9220818777591257749 max: 9178821722829648113 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9220031433030423388 max: 9210838931786956852 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9208195729739635607 max: 9222259462014003839 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9174271499932339698 max: 9212277876771676916 positions: 28693,14,416
- Bloom filters for column 2:
- Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
- Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4988 loadFactor: 0.5196 expectedFpp: 0.010223193
- Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 5002 loadFactor: 0.521 expectedFpp: 0.01042575
- Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959
- Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705
- Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205
- Stripe: offset: 141673 data: 63787 rows: 5000 tail: 85 index: 6971
- Stream: column 0 section ROW_INDEX start: 141673 length 17
- Stream: column 1 section ROW_INDEX start: 141690 length 163
- Stream: column 2 section ROW_INDEX start: 141853 length 168
- Stream: column 2 section BLOOM_FILTER start: 142021 length 6533
- Stream: column 3 section ROW_INDEX start: 148554 length 90
- Stream: column 1 section DATA start: 148644 length 20035
- Stream: column 2 section DATA start: 168679 length 40050
- Stream: column 3 section DATA start: 208729 length 3544
- Stream: column 3 section LENGTH start: 212273 length 25
- Stream: column 3 section DICTIONARY_DATA start: 212298 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9211978436552246208 max: 9179058898902097152 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9195645160817780503 max: 9189147759444307708 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9202888157616520823 max: 9193561362676960747 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9216318198067839390 max: 9221286760675829363 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9218342074710552826 max: 9222303228623055266 positions: 28693,14,416
- Bloom filters for column 2:
- Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4967 loadFactor: 0.5174 expectedFpp: 0.009925688
- Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 5002 loadFactor: 0.521 expectedFpp: 0.01042575
- Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 4964 loadFactor: 0.5171 expectedFpp: 0.009883798
- Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797
- Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539
- Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444
- Stripe: offset: 212516 data: 63817 rows: 5000 tail: 85 index: 6964
- Stream: column 0 section ROW_INDEX start: 212516 length 17
- Stream: column 1 section ROW_INDEX start: 212533 length 165
- Stream: column 2 section ROW_INDEX start: 212698 length 167
- Stream: column 2 section BLOOM_FILTER start: 212865 length 6524
- Stream: column 3 section ROW_INDEX start: 219389 length 91
- Stream: column 1 section DATA start: 219480 length 20035
- Stream: column 2 section DATA start: 239515 length 40050
- Stream: column 3 section DATA start: 279565 length 3574
- Stream: column 3 section LENGTH start: 283139 length 25
- Stream: column 3 section DICTIONARY_DATA start: 283164 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9222731174895935707 max: 9214167447015056056 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9222758097219661129 max: 9221043130193737406 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9174483776261243438 max: 9208134757538374043 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9174329712613510612 max: 9197412874152820822 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9221162005892422758 max: 9220625004936875965 positions: 28693,14,416
- Bloom filters for column 2:
- Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4951 loadFactor: 0.5157 expectedFpp: 0.009704026
- Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4969 loadFactor: 0.5176 expectedFpp: 0.009953696
- Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 4994 loadFactor: 0.5202 expectedFpp: 0.010309587
- Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649
- Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142
- Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165
- Stripe: offset: 283382 data: 12943 rows: 1000 tail: 78 index: 1468
- Stream: column 0 section ROW_INDEX start: 283382 length 12
- Stream: column 1 section ROW_INDEX start: 283394 length 38
- Stream: column 2 section ROW_INDEX start: 283432 length 41
- Stream: column 2 section BLOOM_FILTER start: 283473 length 1337
- Stream: column 3 section ROW_INDEX start: 284810 length 40
- Stream: column 1 section DATA start: 284850 length 4007
- Stream: column 2 section DATA start: 288857 length 8010
- Stream: column 3 section DATA start: 296867 length 768
- Stream: column 3 section LENGTH start: 297635 length 25
- Stream: column 3 section DICTIONARY_DATA start: 297660 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476 positions: 0,0,0
- Bloom filters for column 2:
- Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
- Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
-
-File length: 298416 bytes
-Padding length: 0 bytes
-Padding ratio: 0%
-________________________________________________________________________________________________________________________
-
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/resources/orc-file-dump-dictionary-threshold.out
----------------------------------------------------------------------
diff --git a/ql/src/test/resources/orc-file-dump-dictionary-threshold.out b/ql/src/test/resources/orc-file-dump-dictionary-threshold.out
deleted file mode 100644
index 17a964b..0000000
--- a/ql/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ /dev/null
@@ -1,190 +0,0 @@
-Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
-Rows: 21000
-Compression: ZLIB
-Compression size: 4096
-Type: struct<i:int,l:bigint,s:string>
-
-Stripe Statistics:
- Stripe 1:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2147115959 max: 2145911404 sum: 159677169195
- Column 2: count: 5000 hasNull: false min: -9216505819108477308 max: 9217851628057711416
- Column 3: count: 5000 hasNull: false min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744 sum: 381254
- Stripe 2:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2147390285 max: 2147224606 sum: -14961457759
- Column 2: count: 5000 hasNull: false min: -9222178666167296739 max: 9221301751385928177
- Column 3: count: 5000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938 sum: 1117994
- Stripe 3:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2145842720 max: 2146718321 sum: 141092475520
- Column 2: count: 5000 hasNull: false min: -9221963099397084326 max: 9222722740629726770
- Column 3: count: 5000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974 sum: 1925226
- Stripe 4:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2145378214 max: 2147453086 sum: -153680004530
- Column 2: count: 5000 hasNull: false min: -9222731174895935707 max: 9222919052987871506
- Column 3: count: 5000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904 sum: 2815002
- Stripe 5:
- Column 0: count: 1000 hasNull: false
- Column 1: count: 1000 hasNull: false min: -2143595397 max: 2136858458 sum: -22999664100
- Column 2: count: 1000 hasNull: false min: -9212379634781416464 max: 9197412874152820822
- Column 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210-17268-17786-17962-18214-18444-18446-18724-18912-18952-19164-19348-19400-19546-19776-19896-20084 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 sum: 670762
-
-File Statistics:
- Column 0: count: 21000 hasNull: false
- Column 1: count: 21000 hasNull: false min: -2147390285 max: 2147453086 sum: 109128518326
- Column 2: count: 21000 hasNull: false min: -9222731174895935707 max: 9222919052987871506
- Column 3: count: 21000 hasNull: false min: Darkness,-230 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 sum: 6910238
-
-Stripes:
- Stripe: offset: 3 data: 163602 rows: 5000 tail: 68 index: 720
- Stream: column 0 section ROW_INDEX start: 3 length 17
- Stream: column 1 section ROW_INDEX start: 20 length 166
- Stream: column 2 section ROW_INDEX start: 186 length 171
- Stream: column 3 section ROW_INDEX start: 357 length 366
- Stream: column 1 section DATA start: 723 length 20035
- Stream: column 2 section DATA start: 20758 length 40050
- Stream: column 3 section DATA start: 60808 length 99226
- Stream: column 3 section LENGTH start: 160034 length 4291
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DIRECT_V2
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2132329551 max: 2145911404 sum: 61941331718 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -2138433136 max: 2145210552 sum: 14574030042 positions: 0,2050,488
- Entry 2: count: 1000 hasNull: false min: -2147115959 max: 2137805337 sum: -2032493169 positions: 4099,2054,464
- Entry 3: count: 1000 hasNull: false min: -2137828953 max: 2145877119 sum: -3167202608 positions: 8198,2058,440
- Entry 4: count: 1000 hasNull: false min: -2146452517 max: 2142394906 sum: 88361503212 positions: 12297,2062,416
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9206837518492372266 max: 9169230975203934579 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9188878639954124284 max: 9213664245516510068 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9211329013123260308 max: 9217851628057711416 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9185745718227889962 max: 9181722705210917931 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9216505819108477308 max: 9196474183833079923 positions: 28693,14,416
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness,-230 max: worst-54-290-346-648-908-996 sum: 18442 positions: 0,0,0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966 sum: 46338 positions: 4767,2058,0,695,18
- Entry 2: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660 sum: 75448 positions: 16464,3340,0,1554,14
- Entry 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788 sum: 104868 positions: 36532,964,0,2372,90
- Entry 4: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744 sum: 136158 positions: 63067,3432,0,3354,108
- Stripe: offset: 164393 data: 368335 rows: 5000 tail: 69 index: 956
- Stream: column 0 section ROW_INDEX start: 164393 length 17
- Stream: column 1 section ROW_INDEX start: 164410 length 157
- Stream: column 2 section ROW_INDEX start: 164567 length 166
- Stream: column 3 section ROW_INDEX start: 164733 length 616
- Stream: column 1 section DATA start: 165349 length 20035
- Stream: column 2 section DATA start: 185384 length 40050
- Stream: column 3 section DATA start: 225434 length 302715
- Stream: column 3 section LENGTH start: 528149 length 5535
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DIRECT_V2
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2146021688 max: 2146838901 sum: -50979197646 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -2143569489 max: 2141223179 sum: 22810066834 positions: 0,2050,488
- Entry 2: count: 1000 hasNull: false min: -2140649392 max: 2146301701 sum: -31694882346 positions: 4099,2054,464
- Entry 3: count: 1000 hasNull: false min: -2147390285 max: 2146299933 sum: 79371934221 positions: 8198,2058,440
- Entry 4: count: 1000 hasNull: false min: -2145928262 max: 2147224606 sum: -34469378822 positions: 12297,2062,416
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9222178666167296739 max: 9191250610515369723 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9220148577547102875 max: 9213945522531717278 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9220818777591257749 max: 9221301751385928177 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9220031433030423388 max: 9207856144487414148 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9201438531577205959 max: 9212462124593119846 positions: 28693,14,416
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726 sum: 166320 positions: 0,0,0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994 sum: 193436 positions: 43833,2480,0,967,90
- Entry 2: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988 sum: 224740 positions: 94117,3404,0,1945,222
- Entry 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984 sum: 252094 positions: 155111,2864,0,3268,48
- Entry 4: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938 sum: 281404 positions: 224570,1006,0,4064,342
- Stripe: offset: 533753 data: 606074 rows: 5000 tail: 69 index: 1427
- Stream: column 0 section ROW_INDEX start: 533753 length 17
- Stream: column 1 section ROW_INDEX start: 533770 length 167
- Stream: column 2 section ROW_INDEX start: 533937 length 168
- Stream: column 3 section ROW_INDEX start: 534105 length 1075
- Stream: column 1 section DATA start: 535180 length 20035
- Stream: column 2 section DATA start: 555215 length 40050
- Stream: column 3 section DATA start: 595265 length 540210
- Stream: column 3 section LENGTH start: 1135475 length 5779
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DIRECT_V2
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2138229212 max: 2144818981 sum: -22823642812 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -2145842720 max: 2144179881 sum: -12562754334 positions: 0,2050,488
- Entry 2: count: 1000 hasNull: false min: -2143045885 max: 2146718321 sum: 82993638644 positions: 4099,2054,464
- Entry 3: count: 1000 hasNull: false min: -2144745617 max: 2146570474 sum: 25138722367 positions: 8198,2058,440
- Entry 4: count: 1000 hasNull: false min: -2140127150 max: 2135081620 sum: 68346511655 positions: 12297,2062,416
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9204340807292138409 max: 9208698732685326961 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9221963099397084326 max: 9222722740629726770 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9210480084701091299 max: 9207767402467343058 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9195038026813631215 max: 9199201928563274421 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9215483580266514322 max: 9220102792864959501 positions: 28693,14,416
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876 sum: 313880 positions: 0,0,0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964 sum: 349542 positions: 87800,2584,0,1097,28
- Entry 2: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976 sum: 386538 positions: 185635,3966,0,2077,162
- Entry 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766 sum: 421660 positions: 295550,1384,0,3369,16
- Entry 4: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974 sum: 453606 positions: 412768,1156,0,4041,470
- Stripe: offset: 1141323 data: 864001 rows: 5000 tail: 69 index: 1975
- Stream: column 0 section ROW_INDEX start: 1141323 length 17
- Stream: column 1 section ROW_INDEX start: 1141340 length 156
- Stream: column 2 section ROW_INDEX start: 1141496 length 168
- Stream: column 3 section ROW_INDEX start: 1141664 length 1634
- Stream: column 1 section DATA start: 1143298 length 20035
- Stream: column 2 section DATA start: 1163333 length 40050
- Stream: column 3 section DATA start: 1203383 length 798014
- Stream: column 3 section LENGTH start: 2001397 length 5902
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DIRECT_V2
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2145319330 max: 2146998132 sum: -50856753363 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -2134288866 max: 2147453086 sum: -17911019023 positions: 0,2050,488
- Entry 2: count: 1000 hasNull: false min: -2139010804 max: 2144727593 sum: -24993151857 positions: 4099,2054,464
- Entry 3: count: 1000 hasNull: false min: -2145378214 max: 2144098933 sum: -18055164052 positions: 8198,2058,440
- Entry 4: count: 1000 hasNull: false min: -2140494429 max: 2144595861 sum: -41863916235 positions: 12297,2062,416
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9172774601303513941 max: 9212917101275642143 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9218164880949195469 max: 9222919052987871506 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9222731174895935707 max: 9214167447015056056 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9196276654247395117 max: 9210639275226058005 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9197393848859294562 max: 9208134757538374043 positions: 28693,14,416
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610 sum: 492916 positions: 0,0,0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936 sum: 527290 positions: 139298,1396,0,1077,140
- Entry 2: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878 sum: 568274 positions: 286457,302,0,1926,462
- Entry 3: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210-17268-17786-17962-18214 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788 sum: 594578 positions: 447943,3328,0,3444,250
- Entry 4: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210-17268-17786-17962-18214-18444-18446-18724-18912-18952-19164 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904 sum: 631944 positions: 616471,3986,3778,547,292
- Stripe: offset: 2007368 data: 207295 rows: 1000 tail: 67 index: 841
- Stream: column 0 section ROW_INDEX start: 2007368 length 12
- Stream: column 1 section ROW_INDEX start: 2007380 length 38
- Stream: column 2 section ROW_INDEX start: 2007418 length 41
- Stream: column 3 section ROW_INDEX start: 2007459 length 750
- Stream: column 1 section DATA start: 2008209 length 4007
- Stream: column 2 section DATA start: 2012216 length 8010
- Stream: column 3 section DATA start: 2020226 length 194018
- Stream: column 3 section LENGTH start: 2214244 length 1260
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DIRECT_V2
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2143595397 max: 2136858458 sum: -22999664100 positions: 0,0,0
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9212379634781416464 max: 9197412874152820822 positions: 0,0,0
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210-17268-17786-17962-18214-18444-18446-18724-18912-18952-19164-19348-19400-19546-19776-19896-20084 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 sum: 670762 positions: 0,0,0,0,0
-
-File length: 2217685 bytes
-Padding length: 0 bytes
-Padding ratio: 0%
-________________________________________________________________________________________________________________________
-
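For context, the expected-output file deleted above is an ORC row-index dump: each "Entry" line carries per-row-group statistics (count, hasNull, min, max, sum) followed by the stream positions used to seek to that row group, while the "Stripe", "Stream" and "Encoding" lines describe the physical layout. A minimal sketch of regenerating such a dump, assuming only the FileDump entry point that the (now deleted) tests later in this patch invoke; the driver class name and path are placeholders:

    // Hypothetical driver: prints the file structure plus the row-group
    // indices for columns 1, 2 and 3, in the same format as the file above.
    public class DumpOrcIndexes {
      public static void main(String[] args) throws Exception {
        org.apache.hadoop.hive.ql.io.orc.FileDump.main(
            new String[]{"/path/to/file.orc", "--rowindex=1,2,3"});
      }
    }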
[09/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
index 40cc86f..dad35e3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedReaderImpl.java
@@ -38,7 +38,7 @@ import org.apache.orc.CompressionCodec;
import org.apache.orc.DataReader;
import org.apache.orc.OrcConf;
import org.apache.orc.impl.OutStream;
-import org.apache.hadoop.hive.ql.io.orc.RecordReaderUtils;
+import org.apache.orc.impl.RecordReaderUtils;
import org.apache.orc.impl.StreamName;
import org.apache.orc.StripeInformation;
import org.apache.orc.impl.BufferChunk;
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java
index fe46446..b44da06 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java
@@ -25,7 +25,7 @@ import org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamD
import org.apache.orc.CompressionCodec;
import org.apache.orc.impl.PositionProvider;
import org.apache.orc.impl.SettableUncompressedStream;
-import org.apache.hadoop.hive.ql.io.orc.TreeReaderFactory;
+import org.apache.orc.impl.TreeReaderFactory;
import org.apache.orc.OrcProto;
public class EncodedTreeReaderFactory extends TreeReaderFactory {
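The hunk above is the shim pattern the commit message refers to: the implementation moved into org.apache.orc.impl, and the Hive-side class now extends it so existing Hive callers are unaffected. A minimal sketch of the idea, assuming TreeReaderFactory remains subclassable from outside its package; OldLocationShim is a hypothetical name, not a class from this patch:

    package org.apache.hadoop.hive.ql.io.orc;

    // Hypothetical shim: all behavior is inherited from the class that moved
    // into the ORC module; this subclass only preserves the old package
    // location for existing Hive code.
    public class OldLocationShim extends org.apache.orc.impl.TreeReaderFactory {
    }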
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java
index b20ce28..e4cbd5f 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java
@@ -23,7 +23,6 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.txn.TxnDbUtil;
import org.apache.hadoop.hive.ql.io.AcidUtils;
-import org.apache.hadoop.hive.ql.io.orc.FileDump;
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.txn.AcidHouseKeeperService;
@@ -36,7 +35,6 @@ import org.junit.Test;
import org.junit.rules.TestName;
import java.io.File;
-import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampWritableAndColumnVector.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampWritableAndColumnVector.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampWritableAndColumnVector.java
index 6c46257..2fa9ab2 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampWritableAndColumnVector.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampWritableAndColumnVector.java
@@ -20,14 +20,11 @@ package org.apache.hadoop.hive.ql.exec.vector;
import org.junit.Test;
-import java.math.BigDecimal;
-import java.math.RoundingMode;
import java.sql.Timestamp;
-import java.util.Date;
import java.util.Random;
import org.apache.hadoop.hive.common.type.RandomTypeUtil;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
import static org.junit.Assert.*;
@@ -58,7 +55,7 @@ public class TestTimestampWritableAndColumnVector {
if (!retrievedTimestamp.equals(randTimestamp)) {
assertTrue(false);
}
- double randDouble = TimestampWritable.getDouble(randTimestamp);
+ double randDouble = TimestampUtils.getDouble(randTimestamp);
double retrievedDouble = timestampColVector.getDouble(i);
if (randDouble != retrievedDouble) {
assertTrue(false);
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java
index 1e41fce..e7a044e 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorTypeCasts.java
@@ -33,7 +33,6 @@ import java.util.concurrent.TimeUnit;
import junit.framework.Assert;
-import org.apache.hadoop.hive.common.type.Decimal128;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.RandomTypeUtil;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
@@ -44,6 +43,7 @@ import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.*;
import org.apache.hadoop.hive.ql.exec.vector.expressions.*;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
import org.junit.Test;
@@ -91,8 +91,8 @@ public class TestVectorTypeCasts {
b.cols[0].noNulls = true;
VectorExpression expr = new CastDoubleToTimestamp(0, 1);
expr.evaluate(b);
- Assert.assertEquals(0.0, TimestampWritable.getDouble(resultV.asScratchTimestamp(3)));
- Assert.assertEquals(0.5d, TimestampWritable.getDouble(resultV.asScratchTimestamp(4)));
+ Assert.assertEquals(0.0, TimestampUtils.getDouble(resultV.asScratchTimestamp(3)));
+ Assert.assertEquals(0.5d, TimestampUtils.getDouble(resultV.asScratchTimestamp(4)));
}
@Test
@@ -152,7 +152,7 @@ public class TestVectorTypeCasts {
expr.evaluate(b);
for (int i = 0; i < doubleValues.length; i++) {
double actual = resultV.vector[i];
- double doubleValue = TimestampWritable.getDouble(inV.asScratchTimestamp(i));
+ double doubleValue = TimestampUtils.getDouble(inV.asScratchTimestamp(i));
assertEquals(actual, doubleValue, 0.000000001F);
}
}
@@ -382,7 +382,7 @@ public class TestVectorTypeCasts {
TimestampColumnVector r = (TimestampColumnVector) b.cols[1];
for (int i = 0; i < doubleValues.length; i++) {
Timestamp timestamp = r.asScratchTimestamp(i);
- double asDouble = TimestampWritable.getDouble(timestamp);
+ double asDouble = TimestampUtils.getDouble(timestamp);
double expectedDouble = doubleValues[i];
if (expectedDouble != asDouble) {
assertTrue(false);
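The test edits above are the same mechanical migration seen throughout this patch: the static Timestamp/double conversion helpers now live in org.apache.hadoop.hive.ql.util.TimestampUtils rather than TimestampWritable. A minimal sketch of the new call sites; the driver class and literal value are placeholders:

    import java.sql.Timestamp;
    import org.apache.hadoop.hive.ql.util.TimestampUtils;

    public class TimestampUtilsExample {
      public static void main(String[] args) {
        Timestamp ts = new Timestamp(1416967764000L);
        // Formerly TimestampWritable.getDouble(ts):
        double asDouble = TimestampUtils.getDouble(ts);
        // Formerly TimestampWritable.doubleToTimestamp(asDouble):
        Timestamp roundTrip = TimestampUtils.doubleToTimestamp(asDouble);
        System.out.println(asDouble + " -> " + roundTrip);
      }
    }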
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/udf/TestVectorUDFAdaptor.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/udf/TestVectorUDFAdaptor.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/udf/TestVectorUDFAdaptor.java
index a7567b7..b78c1f2 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/udf/TestVectorUDFAdaptor.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/udf/TestVectorUDFAdaptor.java
@@ -27,8 +27,6 @@ import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
-import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor;
-import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFArgDesc;
import org.apache.hadoop.hive.ql.exec.vector.udf.generic.GenericUDFIsNull;
import org.apache.hadoop.hive.ql.exec.vector.udf.legacy.ConcatTextLongDoubleUDF;
import org.apache.hadoop.hive.ql.exec.vector.udf.legacy.LongUDF;
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java
deleted file mode 100644
index 5f0146f..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestColumnStatistics.java
+++ /dev/null
@@ -1,352 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static junit.framework.Assert.assertEquals;
-import static org.junit.Assume.assumeTrue;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.PrintStream;
-import java.sql.Timestamp;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.DateColumnStatistics;
-import org.apache.orc.DecimalColumnStatistics;
-import org.apache.orc.DoubleColumnStatistics;
-import org.apache.orc.IntegerColumnStatistics;
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.StripeStatistics;
-import org.apache.orc.TimestampColumnStatistics;
-import org.apache.orc.TypeDescription;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-/**
- * Test ColumnStatisticsImpl for ORC.
- */
-public class TestColumnStatistics {
-
- @Test
- public void testLongMerge() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateInteger(10, 2);
- stats2.updateInteger(1, 1);
- stats2.updateInteger(1000, 1);
- stats1.merge(stats2);
- IntegerColumnStatistics typed = (IntegerColumnStatistics) stats1;
- assertEquals(1, typed.getMinimum());
- assertEquals(1000, typed.getMaximum());
- stats1.reset();
- stats1.updateInteger(-10, 1);
- stats1.updateInteger(10000, 1);
- stats1.merge(stats2);
- assertEquals(-10, typed.getMinimum());
- assertEquals(10000, typed.getMaximum());
- }
-
- @Test
- public void testDoubleMerge() throws Exception {
- TypeDescription schema = TypeDescription.createDouble();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateDouble(10.0);
- stats1.updateDouble(100.0);
- stats2.updateDouble(1.0);
- stats2.updateDouble(1000.0);
- stats1.merge(stats2);
- DoubleColumnStatistics typed = (DoubleColumnStatistics) stats1;
- assertEquals(1.0, typed.getMinimum(), 0.001);
- assertEquals(1000.0, typed.getMaximum(), 0.001);
- stats1.reset();
- stats1.updateDouble(-10);
- stats1.updateDouble(10000);
- stats1.merge(stats2);
- assertEquals(-10, typed.getMinimum(), 0.001);
- assertEquals(10000, typed.getMaximum(), 0.001);
- }
-
-
- @Test
- public void testStringMerge() throws Exception {
- TypeDescription schema = TypeDescription.createString();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateString(new Text("bob"));
- stats1.updateString(new Text("david"));
- stats1.updateString(new Text("charles"));
- stats2.updateString(new Text("anne"));
- byte[] erin = new byte[]{0, 1, 2, 3, 4, 5, 101, 114, 105, 110};
- stats2.updateString(erin, 6, 4, 5);
- assertEquals(24, ((StringColumnStatistics)stats2).getSum());
- stats1.merge(stats2);
- StringColumnStatistics typed = (StringColumnStatistics) stats1;
- assertEquals("anne", typed.getMinimum());
- assertEquals("erin", typed.getMaximum());
- assertEquals(39, typed.getSum());
- stats1.reset();
- stats1.updateString(new Text("aaa"));
- stats1.updateString(new Text("zzz"));
- stats1.merge(stats2);
- assertEquals("aaa", typed.getMinimum());
- assertEquals("zzz", typed.getMaximum());
- }
-
- @Test
- public void testDateMerge() throws Exception {
- TypeDescription schema = TypeDescription.createDate();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateDate(new DateWritable(1000));
- stats1.updateDate(new DateWritable(100));
- stats2.updateDate(new DateWritable(10));
- stats2.updateDate(new DateWritable(2000));
- stats1.merge(stats2);
- DateColumnStatistics typed = (DateColumnStatistics) stats1;
- assertEquals(new DateWritable(10).get(), typed.getMinimum());
- assertEquals(new DateWritable(2000).get(), typed.getMaximum());
- stats1.reset();
- stats1.updateDate(new DateWritable(-10));
- stats1.updateDate(new DateWritable(10000));
- stats1.merge(stats2);
- assertEquals(new DateWritable(-10).get(), typed.getMinimum());
- assertEquals(new DateWritable(10000).get(), typed.getMaximum());
- }
-
- @Test
- public void testTimestampMerge() throws Exception {
- TypeDescription schema = TypeDescription.createTimestamp();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateTimestamp(new Timestamp(10));
- stats1.updateTimestamp(new Timestamp(100));
- stats2.updateTimestamp(new Timestamp(1));
- stats2.updateTimestamp(new Timestamp(1000));
- stats1.merge(stats2);
- TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
- assertEquals(1, typed.getMinimum().getTime());
- assertEquals(1000, typed.getMaximum().getTime());
- stats1.reset();
- stats1.updateTimestamp(new Timestamp(-10));
- stats1.updateTimestamp(new Timestamp(10000));
- stats1.merge(stats2);
- assertEquals(-10, typed.getMinimum().getTime());
- assertEquals(10000, typed.getMaximum().getTime());
- }
-
- @Test
- public void testDecimalMerge() throws Exception {
- TypeDescription schema = TypeDescription.createDecimal()
- .withPrecision(38).withScale(16);
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateDecimal(HiveDecimal.create(10));
- stats1.updateDecimal(HiveDecimal.create(100));
- stats2.updateDecimal(HiveDecimal.create(1));
- stats2.updateDecimal(HiveDecimal.create(1000));
- stats1.merge(stats2);
- DecimalColumnStatistics typed = (DecimalColumnStatistics) stats1;
- assertEquals(1, typed.getMinimum().longValue());
- assertEquals(1000, typed.getMaximum().longValue());
- stats1.reset();
- stats1.updateDecimal(HiveDecimal.create(-10));
- stats1.updateDecimal(HiveDecimal.create(10000));
- stats1.merge(stats2);
- assertEquals(-10, typed.getMinimum().longValue());
- assertEquals(10000, typed.getMaximum().longValue());
- }
-
-
- public static class SimpleStruct {
- BytesWritable bytes1;
- Text string1;
-
- SimpleStruct(BytesWritable b1, String s1) {
- this.bytes1 = b1;
- if (s1 == null) {
- this.string1 = null;
- } else {
- this.string1 = new Text(s1);
- }
- }
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- fs.setWorkingDirectory(workDir);
- testFilePath = new Path("TestOrcFile." + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- private static BytesWritable bytes(int... items) {
- BytesWritable result = new BytesWritable();
- result.setSize(items.length);
- for (int i = 0; i < items.length; ++i) {
- result.getBytes()[i] = (byte) items[i];
- }
- return result;
- }
-
- @Test
- public void testHasNull() throws Exception {
-
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .inspector(inspector)
- .rowIndexStride(1000)
- .stripeSize(10000)
- .bufferSize(10000));
- // STRIPE 1
- // RG1
- for(int i=0; i<1000; i++) {
- writer.addRow(new SimpleStruct(bytes(1,2,3), "RG1"));
- }
- // RG2
- for(int i=0; i<1000; i++) {
- writer.addRow(new SimpleStruct(bytes(1,2,3), null));
- }
- // RG3
- for(int i=0; i<1000; i++) {
- writer.addRow(new SimpleStruct(bytes(1,2,3), "RG3"));
- }
- // RG4
- for(int i=0; i<1000; i++) {
- writer.addRow(new SimpleStruct(bytes(1,2,3), null));
- }
- // RG5
- for(int i=0; i<1000; i++) {
- writer.addRow(new SimpleStruct(bytes(1,2,3), null));
- }
- // STRIPE 2
- for(int i=0; i<5000; i++) {
- writer.addRow(new SimpleStruct(bytes(1,2,3), null));
- }
- // STRIPE 3
- for(int i=0; i<5000; i++) {
- writer.addRow(new SimpleStruct(bytes(1,2,3), "STRIPE-3"));
- }
- // STRIPE 4
- for(int i=0; i<5000; i++) {
- writer.addRow(new SimpleStruct(bytes(1,2,3), null));
- }
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- // check the file level stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(20000, stats[0].getNumberOfValues());
- assertEquals(20000, stats[1].getNumberOfValues());
- assertEquals(7000, stats[2].getNumberOfValues());
- assertEquals(false, stats[0].hasNull());
- assertEquals(false, stats[1].hasNull());
- assertEquals(true, stats[2].hasNull());
-
- // check the stripe level stats
- List<StripeStatistics> stripeStats = reader.getStripeStatistics();
- // stripe 1 stats
- StripeStatistics ss1 = stripeStats.get(0);
- ColumnStatistics ss1_cs1 = ss1.getColumnStatistics()[0];
- ColumnStatistics ss1_cs2 = ss1.getColumnStatistics()[1];
- ColumnStatistics ss1_cs3 = ss1.getColumnStatistics()[2];
- assertEquals(false, ss1_cs1.hasNull());
- assertEquals(false, ss1_cs2.hasNull());
- assertEquals(true, ss1_cs3.hasNull());
-
- // stripe 2 stats
- StripeStatistics ss2 = stripeStats.get(1);
- ColumnStatistics ss2_cs1 = ss2.getColumnStatistics()[0];
- ColumnStatistics ss2_cs2 = ss2.getColumnStatistics()[1];
- ColumnStatistics ss2_cs3 = ss2.getColumnStatistics()[2];
- assertEquals(false, ss2_cs1.hasNull());
- assertEquals(false, ss2_cs2.hasNull());
- assertEquals(true, ss2_cs3.hasNull());
-
- // stripe 3 stats
- StripeStatistics ss3 = stripeStats.get(2);
- ColumnStatistics ss3_cs1 = ss3.getColumnStatistics()[0];
- ColumnStatistics ss3_cs2 = ss3.getColumnStatistics()[1];
- ColumnStatistics ss3_cs3 = ss3.getColumnStatistics()[2];
- assertEquals(false, ss3_cs1.hasNull());
- assertEquals(false, ss3_cs2.hasNull());
- assertEquals(false, ss3_cs3.hasNull());
-
- // stripe 4 stats
- StripeStatistics ss4 = stripeStats.get(3);
- ColumnStatistics ss4_cs1 = ss4.getColumnStatistics()[0];
- ColumnStatistics ss4_cs2 = ss4.getColumnStatistics()[1];
- ColumnStatistics ss4_cs3 = ss4.getColumnStatistics()[2];
- assertEquals(false, ss4_cs1.hasNull());
- assertEquals(false, ss4_cs2.hasNull());
- assertEquals(true, ss4_cs3.hasNull());
-
- // Test file dump
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-has-null.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=2"});
- System.out.flush();
- System.setOut(origOut);
- // If called with an expression evaluating to false, the test will halt
- // and be ignored.
- assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
- TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
deleted file mode 100644
index 554033c..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
+++ /dev/null
@@ -1,418 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-import java.io.BufferedReader;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.PrintStream;
-import java.sql.Date;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.type.HiveChar;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.common.type.HiveVarchar;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hive.common.util.HiveTestUtils;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestFileDump {
-
- Path workDir = new Path(System.getProperty("test.tmp.dir"));
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Before
- public void openFileSystem () throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- fs.setWorkingDirectory(workDir);
- testFilePath = new Path("TestFileDump.testDump.orc");
- fs.delete(testFilePath, false);
- }
-
- static class MyRecord {
- int i;
- long l;
- String s;
- MyRecord(int i, long l, String s) {
- this.i = i;
- this.l = l;
- this.s = s;
- }
- }
-
- static class AllTypesRecord {
- static class Struct {
- int i;
- String s;
-
- Struct(int i, String s) {
- this.i = i;
- this.s = s;
- }
- }
- boolean b;
- byte bt;
- short s;
- int i;
- long l;
- float f;
- double d;
- HiveDecimal de;
- Timestamp t;
- Date dt;
- String str;
- HiveChar c;
- HiveVarchar vc;
- Map<String, String> m;
- List<Integer> a;
- Struct st;
-
- AllTypesRecord(boolean b, byte bt, short s, int i, long l, float f, double d, HiveDecimal de,
- Timestamp t, Date dt, String str, HiveChar c, HiveVarchar vc, Map<String,
- String> m, List<Integer> a, Struct st) {
- this.b = b;
- this.bt = bt;
- this.s = s;
- this.i = i;
- this.l = l;
- this.f = f;
- this.d = d;
- this.de = de;
- this.t = t;
- this.dt = dt;
- this.str = str;
- this.c = c;
- this.vc = vc;
- this.m = m;
- this.a = a;
- this.st = st;
- }
- }
-
- static void checkOutput(String expected,
- String actual) throws Exception {
- BufferedReader eStream =
- new BufferedReader(new FileReader(HiveTestUtils.getFileFromClasspath(expected)));
- BufferedReader aStream =
- new BufferedReader(new FileReader(actual));
- String expectedLine = eStream.readLine().trim();
- while (expectedLine != null) {
- String actualLine = aStream.readLine().trim();
- System.out.println("actual: " + actualLine);
- System.out.println("expected: " + expectedLine);
- assertEquals(expectedLine, actualLine);
- expectedLine = eStream.readLine();
- expectedLine = expectedLine == null ? null : expectedLine.trim();
- }
- assertNull(eStream.readLine());
- assertNull(aStream.readLine());
- eStream.close();
- aStream.close();
- }
-
- @Test
- public void testDump() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .inspector(inspector)
- .batchSize(1000)
- .compress(CompressionKind.ZLIB)
- .stripeSize(100000)
- .rowIndexStride(1000));
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- for(int i=0; i < 21000; ++i) {
- writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(),
- words[r1.nextInt(words.length)]));
- }
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=1,2,3"});
- System.out.flush();
- System.setOut(origOut);
-
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-
- @Test
- public void testDataDump() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (AllTypesRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
- 100000, CompressionKind.NONE, 10000, 1000);
- Map<String, String> m = new HashMap<String, String>(2);
- m.put("k1", "v1");
- writer.addRow(new AllTypesRecord(
- true,
- (byte) 10,
- (short) 100,
- 1000,
- 10000L,
- 4.0f,
- 20.0,
- HiveDecimal.create("4.2222"),
- new Timestamp(1416967764000L),
- new Date(1416967764000L),
- "string",
- new HiveChar("hello", 5),
- new HiveVarchar("hello", 10),
- m,
- Arrays.asList(100, 200),
- new AllTypesRecord.Struct(10, "foo")));
- m.clear();
- m.put("k3", "v3");
- writer.addRow(new AllTypesRecord(
- false,
- (byte)20,
- (short)200,
- 2000,
- 20000L,
- 8.0f,
- 40.0,
- HiveDecimal.create("2.2222"),
- new Timestamp(1416967364000L),
- new Date(1411967764000L),
- "abcd",
- new HiveChar("world", 5),
- new HiveVarchar("world", 10),
- m,
- Arrays.asList(200, 300),
- new AllTypesRecord.Struct(20, "bar")));
-
- writer.close();
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "-d"});
- System.out.flush();
- System.setOut(origOut);
-
- String[] lines = myOut.toString().split("\n");
- // Don't be fooled by the big space in the middle, this line is quite long
- assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello \",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
- assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world \",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
- }
-
- // Test that if the fraction of rows that have distinct strings is greater than the configured
- // threshold dictionary encoding is turned off. If dictionary encoding is turned off the length
- // of the dictionary stream for the column will be 0 in the ORC file dump.
- @Test
- public void testDictionaryThreshold() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- Configuration conf = new Configuration();
- conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
- conf.setFloat(HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, 0.49f);
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .batchSize(1000)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.ZLIB)
- .rowIndexStride(1000)
- .bufferSize(10000));
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- int nextInt = 0;
- for(int i=0; i < 21000; ++i) {
- // Write out the same string twice, this guarantees the fraction of rows with
- // distinct strings is 0.5
- if (i % 2 == 0) {
- nextInt = r1.nextInt(words.length);
- // Append the value of i to the word, this guarantees when an index or word is repeated
- // the actual string is unique.
- words[nextInt] += "-" + i;
- }
- writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(),
- words[nextInt]));
- }
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump-dictionary-threshold.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=1,2,3"});
- System.out.flush();
- System.setOut(origOut);
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-
- @Test
- public void testBloomFilter() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
- OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.ZLIB)
- .bufferSize(10000)
- .rowIndexStride(1000)
- .batchSize(1000)
- .bloomFilterColumns("S");
- Writer writer = OrcFile.createWriter(testFilePath, options);
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- for(int i=0; i < 21000; ++i) {
- writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(),
- words[r1.nextInt(words.length)]));
- }
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump-bloomfilter.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=3"});
- System.out.flush();
- System.setOut(origOut);
-
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-
- @Test
- public void testBloomFilter2() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
- OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.ZLIB)
- .bufferSize(10000)
- .rowIndexStride(1000)
- .bloomFilterColumns("l")
- .bloomFilterFpp(0.01)
- .batchSize(1000);
- Writer writer = OrcFile.createWriter(testFilePath, options);
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- for(int i=0; i < 21000; ++i) {
- writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(),
- words[r1.nextInt(words.length)]));
- }
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump-bloomfilter2.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=2"});
- System.out.flush();
- System.setOut(origOut);
-
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java
deleted file mode 100644
index acf232d..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java
+++ /dev/null
@@ -1,139 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.orc;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.PrintStream;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hive.common.util.HiveTestUtils;
-import org.apache.orc.CompressionKind;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestJsonFileDump {
-
- Path workDir = new Path(System.getProperty("test.tmp.dir"));
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Before
- public void openFileSystem () throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- fs.setWorkingDirectory(workDir);
- testFilePath = new Path("TestFileDump.testDump.orc");
- fs.delete(testFilePath, false);
- }
-
- static class MyRecord {
- int i;
- long l;
- String s;
- MyRecord(int i, long l, String s) {
- this.i = i;
- this.l = l;
- this.s = s;
- }
- }
-
- static void checkOutput(String expected,
- String actual) throws Exception {
- BufferedReader eStream =
- new BufferedReader(new FileReader(HiveTestUtils.getFileFromClasspath(expected)));
- BufferedReader aStream =
- new BufferedReader(new FileReader(actual));
- String expectedLine = eStream.readLine();
- while (expectedLine != null) {
- String actualLine = aStream.readLine();
- System.out.println("actual: " + actualLine);
- System.out.println("expected: " + expectedLine);
- assertEquals(expectedLine, actualLine);
- expectedLine = eStream.readLine();
- }
- assertNull(eStream.readLine());
- assertNull(aStream.readLine());
- }
-
- @Test
- public void testJsonDump() throws Exception {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector
- (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION");
- OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .inspector(inspector)
- .stripeSize(100000)
- .compress(CompressionKind.ZLIB)
- .bufferSize(10000)
- .rowIndexStride(1000)
- .bloomFilterColumns("s");
- Writer writer = OrcFile.createWriter(testFilePath, options);
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- for(int i=0; i < 21000; ++i) {
- if (i % 100 == 0) {
- writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), null));
- } else {
- writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(),
- words[r1.nextInt(words.length)]));
- }
- }
-
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump.json";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "-j", "-p", "--rowindex=3"});
- System.out.flush();
- System.setOut(origOut);
-
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-}
[20/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestStringDictionary.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestStringDictionary.java b/orc/src/test/org/apache/orc/TestStringDictionary.java
new file mode 100644
index 0000000..46209bb
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestStringDictionary.java
@@ -0,0 +1,290 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import org.apache.orc.impl.RecordReaderImpl;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+public class TestStringDictionary {
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
+ + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testTooManyDistinct() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector col = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ col.setVal(batch.size++, String.valueOf(i).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ col = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(idx++), col.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky, but it does the job: this cast will work as long as this test
+      // resides in the same package as the ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testHalfDistinct() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(123);
+ int[] input = new int[20000];
+ for (int i = 0; i < 20000; i++) {
+ input[i] = rand.nextInt(10000);
+ }
+
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector col = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ col.setVal(batch.size++, String.valueOf(input[i]).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ col = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(input[idx++]), col.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky, but it does the job: this cast will work as long as this test
+      // resides in the same package as the ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testTooManyDistinctCheckDisabled() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), false);
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector string = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ string.setVal(batch.size++, String.valueOf(i).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ string = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(idx++), string.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky, but it does the job: this cast will work as long as this test
+      // resides in the same package as the ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testHalfDistinctCheckDisabled() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(),
+ false);
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(123);
+ int[] input = new int[20000];
+ for (int i = 0; i < 20000; i++) {
+ input[i] = rand.nextInt(10000);
+ }
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector string = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ string.setVal(batch.size++, String.valueOf(input[i]).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ string = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(input[idx++]), string.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky, but it does the job: this cast will work as long as this test
+      // resides in the same package as the ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testTooManyDistinctV11AlwaysDictionary() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .version(OrcFile.Version.V_0_11).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector string = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ string.setVal(batch.size++, String.valueOf(i).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ batch = reader.getSchema().createRowBatch();
+ string = (BytesColumnVector) batch.cols[0];
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(idx++), string.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky, but it does the job: this cast will work as long as this test
+      // resides in the same package as the ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY, encoding.getKind());
+ }
+ }
+
+ }
+
+}
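
All four tests above share the same write-then-inspect pattern: fill BytesColumnVector
batches with 20,000 strings, flush them through the Writer, then cast the RecordReader
to RecordReaderImpl so each stripe footer can be read and the chosen column encoding
asserted (DICTIONARY_V2 when roughly half the values repeat, DIRECT_V2 when nearly all
are distinct; the *CheckDisabled variants assert the same outcomes with
ROW_INDEX_STRIDE_DICTIONARY_CHECK turned off). A minimal sketch of that pattern, not
part of the patch, assuming the test's imports plus the conf, fs, and testFilePath
fixtures set up in openFileSystem():

    // Write ~50% distinct values, so DICTIONARY_V2 is the expected encoding.
    TypeDescription schema = TypeDescription.createString();
    Writer writer = OrcFile.createWriter(testFilePath,
        OrcFile.writerOptions(conf).setSchema(schema)
            .compress(CompressionKind.NONE).bufferSize(10000));
    VectorizedRowBatch batch = schema.createRowBatch();
    BytesColumnVector col = (BytesColumnVector) batch.cols[0];
    for (int i = 0; i < 20000; i++) {
      if (batch.size == batch.getMaxSize()) {  // flush a full batch
        writer.addRowBatch(batch);
        batch.reset();
      }
      col.setVal(batch.size++, String.valueOf(i % 10000).getBytes());
    }
    writer.addRowBatch(batch);
    writer.close();

    // Inspect the encoding the writer actually picked for each stripe.
    Reader reader = OrcFile.createReader(testFilePath,
        OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    for (StripeInformation stripe : reader.getStripes()) {
      OrcProto.StripeFooter footer =
          ((RecordReaderImpl) rows).readStripeFooter(stripe);
      assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2,
          footer.getColumns(0).getKind());
    }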
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestTypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestTypeDescription.java b/orc/src/test/org/apache/orc/TestTypeDescription.java
new file mode 100644
index 0000000..0ac1e64
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestTypeDescription.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.orc.TypeDescription;
+import org.junit.Test;
+
+public class TestTypeDescription {
+
+ @Test
+ public void testJson() {
+ TypeDescription bin = TypeDescription.createBinary();
+ assertEquals("{\"category\": \"binary\", \"id\": 0, \"max\": 0}",
+ bin.toJson());
+ assertEquals("binary", bin.toString());
+ TypeDescription struct = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal());
+ assertEquals("struct<f1:int,f2:string,f3:decimal(38,10)>",
+ struct.toString());
+ assertEquals("{\"category\": \"struct\", \"id\": 0, \"max\": 3, \"fields\": [\n"
+ + " \"f1\": {\"category\": \"int\", \"id\": 1, \"max\": 1},\n"
+ + " \"f2\": {\"category\": \"string\", \"id\": 2, \"max\": 2},\n"
+ + " \"f3\": {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 38, \"scale\": 10}]}",
+ struct.toJson());
+ struct = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createUnion()
+ .addUnionChild(TypeDescription.createByte())
+ .addUnionChild(TypeDescription.createDecimal()
+ .withPrecision(20).withScale(10)))
+ .addField("f2", TypeDescription.createStruct()
+ .addField("f3", TypeDescription.createDate())
+ .addField("f4", TypeDescription.createDouble())
+ .addField("f5", TypeDescription.createBoolean()))
+ .addField("f6", TypeDescription.createChar().withMaxLength(100));
+ assertEquals("struct<f1:uniontype<tinyint,decimal(20,10)>,f2:struct<f3:date,f4:double,f5:boolean>,f6:char(100)>",
+ struct.toString());
+ assertEquals(
+ "{\"category\": \"struct\", \"id\": 0, \"max\": 8, \"fields\": [\n" +
+ " \"f1\": {\"category\": \"uniontype\", \"id\": 1, \"max\": 3, \"children\": [\n" +
+ " {\"category\": \"tinyint\", \"id\": 2, \"max\": 2},\n" +
+ " {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 20, \"scale\": 10}]},\n" +
+ " \"f2\": {\"category\": \"struct\", \"id\": 4, \"max\": 7, \"fields\": [\n" +
+ " \"f3\": {\"category\": \"date\", \"id\": 5, \"max\": 5},\n" +
+ " \"f4\": {\"category\": \"double\", \"id\": 6, \"max\": 6},\n" +
+ " \"f5\": {\"category\": \"boolean\", \"id\": 7, \"max\": 7}]},\n" +
+ " \"f6\": {\"category\": \"char\", \"id\": 8, \"max\": 8, \"length\": 100}]}",
+ struct.toJson());
+ }
+}
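
The expected strings above come straight from the TypeDescription builder API; as a
quick usage sketch (the field names here are arbitrary, not from the patch):

    TypeDescription schema = TypeDescription.createStruct()
        .addField("name", TypeDescription.createString())
        .addField("balance", TypeDescription.createDecimal()
            .withPrecision(20).withScale(10));
    // Hive-style type string: struct<name:string,balance:decimal(20,10)>
    String typeName = schema.toString();
    // JSON form carries the pre-order column ids asserted in the test
    String json = schema.toJson();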
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestUnrolledBitPack.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestUnrolledBitPack.java b/orc/src/test/org/apache/orc/TestUnrolledBitPack.java
new file mode 100644
index 0000000..ef8fcd0
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestUnrolledBitPack.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import com.google.common.collect.Lists;
+import com.google.common.primitives.Longs;
+
+@RunWith(value = Parameterized.class)
+public class TestUnrolledBitPack {
+
+ private long val;
+
+ public TestUnrolledBitPack(long val) {
+ this.val = val;
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { -1 }, { 1 }, { 7 }, { -128 }, { 32000 }, { 8300000 },
+ { Integer.MAX_VALUE }, { 540000000000L }, { 140000000000000L }, { 36000000000000000L },
+ { Long.MAX_VALUE } };
+ return Arrays.asList(data);
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
+ + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testBitPacking() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0,
+ val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val,
+ 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0,
+ 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0,
+ val, 0, val, 0, 0, val, 0, val, 0, 0, val, val };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ int row = batch.size++;
+ ((LongColumnVector) batch.cols[0]).vector[row] = l;
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+}
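
The read loop in testBitPacking generalizes to any long column; a minimal sketch,
assuming an existing ORC file of bigints at testFilePath and the test's conf and fs
fixtures (process() is a hypothetical per-value consumer, not part of the API):

    Reader reader = OrcFile.createReader(testFilePath,
        OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    LongColumnVector col = (LongColumnVector) batch.cols[0];
    while (rows.nextBatch(batch)) {      // false once the file is exhausted
      for (int r = 0; r < batch.size; ++r) {
        process(col.vector[r]);          // hypothetical consumer
      }
    }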
[17/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
new file mode 100644
index 0000000..cdd62ac
--- /dev/null
+++ b/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -0,0 +1,1691 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import static junit.framework.Assert.assertEquals;
+import static org.hamcrest.core.Is.is;
+import static org.junit.Assert.*;
+import static org.mockito.Mockito.any;
+import static org.mockito.Mockito.atLeastOnce;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.List;
+
+import junit.framework.Assert;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.fs.Seekable;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
+import org.apache.orc.BloomFilterIO;
+import org.apache.orc.DataReader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.impl.RecordReaderImpl.Location;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.OrcProto;
+
+import org.junit.Test;
+import org.mockito.MockSettings;
+import org.mockito.Mockito;
+
+public class TestRecordReaderImpl {
+ /**
+ * Create a predicate leaf. This is used by another test.
+ */
+ public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator,
+ PredicateLeaf.Type type,
+ String columnName,
+ Object literal,
+ List<Object> literalList) {
+ return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
+ literal, literalList);
+ }
+
+  // withSettings().verboseLogging() makes Mockito log each invocation; remove the call to silence it
+ private final MockSettings settings = Mockito.withSettings().verboseLogging();
+
+ static class BufferInStream
+ extends InputStream implements PositionedReadable, Seekable {
+ private final byte[] buffer;
+ private final int length;
+ private int position = 0;
+
+ BufferInStream(byte[] bytes, int length) {
+ this.buffer = bytes;
+ this.length = length;
+ }
+
+ @Override
+ public int read() {
+ if (position < length) {
+ return buffer[position++];
+ }
+ return -1;
+ }
+
+ @Override
+ public int read(byte[] bytes, int offset, int length) {
+ int lengthToRead = Math.min(length, this.length - this.position);
+ if (lengthToRead >= 0) {
+ for(int i=0; i < lengthToRead; ++i) {
+ bytes[offset + i] = buffer[position++];
+ }
+ return lengthToRead;
+ } else {
+ return -1;
+ }
+ }
+
+ @Override
+ public int read(long position, byte[] bytes, int offset, int length) {
+ this.position = (int) position;
+ return read(bytes, offset, length);
+ }
+
+ @Override
+ public void readFully(long position, byte[] bytes, int offset,
+ int length) throws IOException {
+ this.position = (int) position;
+ while (length > 0) {
+        int result = read(bytes, offset, length);
+        if (result < 0) {
+          throw new IOException("Read past end of buffer at " + offset);
+        }
+        offset += result;
+        length -= result;
+ }
+ }
+
+ @Override
+ public void readFully(long position, byte[] bytes) throws IOException {
+ readFully(position, bytes, 0, bytes.length);
+ }
+
+ @Override
+ public void seek(long position) {
+ this.position = (int) position;
+ }
+
+ @Override
+ public long getPos() {
+ return position;
+ }
+
+ @Override
+ public boolean seekToNewSource(long position) throws IOException {
+ this.position = (int) position;
+ return false;
+ }
+ }
+
+ @Test
+ public void testMaxLengthToReader() throws Exception {
+ Configuration conf = new Configuration();
+ OrcProto.Type rowType = OrcProto.Type.newBuilder()
+ .setKind(OrcProto.Type.Kind.STRUCT).build();
+ OrcProto.Footer footer = OrcProto.Footer.newBuilder()
+ .setHeaderLength(0).setContentLength(0).setNumberOfRows(0)
+ .setRowIndexStride(0).addTypes(rowType).build();
+ OrcProto.PostScript ps = OrcProto.PostScript.newBuilder()
+ .setCompression(OrcProto.CompressionKind.NONE)
+ .setFooterLength(footer.getSerializedSize())
+ .setMagic("ORC").addVersion(0).addVersion(11).build();
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ footer.writeTo(buffer);
+ ps.writeTo(buffer);
+ buffer.write(ps.getSerializedSize());
+ FileSystem fs = mock(FileSystem.class, settings);
+ FSDataInputStream file =
+ new FSDataInputStream(new BufferInStream(buffer.getData(),
+ buffer.getLength()));
+ Path p = new Path("/dir/file.orc");
+ when(fs.open(p)).thenReturn(file);
+ OrcFile.ReaderOptions options = OrcFile.readerOptions(conf);
+ options.filesystem(fs);
+ options.maxLength(buffer.getLength());
+ when(fs.getFileStatus(p))
+ .thenReturn(new FileStatus(10, false, 3, 3000, 0, p));
+ Reader reader = OrcFile.createReader(p, options);
+ }
+
+ @Test
+ public void testCompareToRangeInt() throws Exception {
+ assertEquals(Location.BEFORE,
+ RecordReaderImpl.compareToRange(19L, 20L, 40L));
+ assertEquals(Location.AFTER,
+ RecordReaderImpl.compareToRange(41L, 20L, 40L));
+ assertEquals(Location.MIN,
+ RecordReaderImpl.compareToRange(20L, 20L, 40L));
+ assertEquals(Location.MIDDLE,
+ RecordReaderImpl.compareToRange(21L, 20L, 40L));
+ assertEquals(Location.MAX,
+ RecordReaderImpl.compareToRange(40L, 20L, 40L));
+ assertEquals(Location.BEFORE,
+ RecordReaderImpl.compareToRange(0L, 1L, 1L));
+ assertEquals(Location.MIN,
+ RecordReaderImpl.compareToRange(1L, 1L, 1L));
+ assertEquals(Location.AFTER,
+ RecordReaderImpl.compareToRange(2L, 1L, 1L));
+ }
+
+ @Test
+ public void testCompareToRangeString() throws Exception {
+ assertEquals(Location.BEFORE,
+ RecordReaderImpl.compareToRange("a", "b", "c"));
+ assertEquals(Location.AFTER,
+ RecordReaderImpl.compareToRange("d", "b", "c"));
+ assertEquals(Location.MIN,
+ RecordReaderImpl.compareToRange("b", "b", "c"));
+ assertEquals(Location.MIDDLE,
+ RecordReaderImpl.compareToRange("bb", "b", "c"));
+ assertEquals(Location.MAX,
+ RecordReaderImpl.compareToRange("c", "b", "c"));
+ assertEquals(Location.BEFORE,
+ RecordReaderImpl.compareToRange("a", "b", "b"));
+ assertEquals(Location.MIN,
+ RecordReaderImpl.compareToRange("b", "b", "b"));
+ assertEquals(Location.AFTER,
+ RecordReaderImpl.compareToRange("c", "b", "b"));
+ }
+
+ @Test
+ public void testCompareToCharNeedConvert() throws Exception {
+ assertEquals(Location.BEFORE,
+ RecordReaderImpl.compareToRange("apple", "hello", "world"));
+ assertEquals(Location.AFTER,
+ RecordReaderImpl.compareToRange("zombie", "hello", "world"));
+ assertEquals(Location.MIN,
+ RecordReaderImpl.compareToRange("hello", "hello", "world"));
+ assertEquals(Location.MIDDLE,
+ RecordReaderImpl.compareToRange("pilot", "hello", "world"));
+ assertEquals(Location.MAX,
+ RecordReaderImpl.compareToRange("world", "hello", "world"));
+ assertEquals(Location.BEFORE,
+ RecordReaderImpl.compareToRange("apple", "hello", "hello"));
+ assertEquals(Location.MIN,
+ RecordReaderImpl.compareToRange("hello", "hello", "hello"));
+ assertEquals(Location.AFTER,
+ RecordReaderImpl.compareToRange("zombie", "hello", "hello"));
+ }
+
+ @Test
+ public void testGetMin() throws Exception {
+ assertEquals(10L, RecordReaderImpl.getMin(
+ ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
+ assertEquals(10.0d, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
+ .setMinimum(10.0d).setMaximum(100.0d).build()).build())));
+ assertEquals(null, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
+ .build())));
+ assertEquals("a", RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder()
+ .setMinimum("a").setMaximum("b").build()).build())));
+ assertEquals("hello", RecordReaderImpl.getMin(ColumnStatisticsImpl
+ .deserialize(createStringStats("hello", "world"))));
+ assertEquals(HiveDecimal.create("111.1"), RecordReaderImpl.getMin(ColumnStatisticsImpl
+ .deserialize(createDecimalStats("111.1", "112.1"))));
+ }
+
+ private static OrcProto.ColumnStatistics createIntStats(Long min,
+ Long max) {
+ OrcProto.IntegerStatistics.Builder intStats =
+ OrcProto.IntegerStatistics.newBuilder();
+ if (min != null) {
+ intStats.setMinimum(min);
+ }
+ if (max != null) {
+ intStats.setMaximum(max);
+ }
+ return OrcProto.ColumnStatistics.newBuilder()
+ .setIntStatistics(intStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createBooleanStats(int n, int trueCount) {
+ OrcProto.BucketStatistics.Builder boolStats = OrcProto.BucketStatistics.newBuilder();
+ boolStats.addCount(trueCount);
+ return OrcProto.ColumnStatistics.newBuilder().setNumberOfValues(n).setBucketStatistics(
+ boolStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createIntStats(int min, int max) {
+ OrcProto.IntegerStatistics.Builder intStats = OrcProto.IntegerStatistics.newBuilder();
+ intStats.setMinimum(min);
+ intStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setIntStatistics(intStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createDoubleStats(double min, double max) {
+ OrcProto.DoubleStatistics.Builder dblStats = OrcProto.DoubleStatistics.newBuilder();
+ dblStats.setMinimum(min);
+ dblStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setDoubleStatistics(dblStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createStringStats(String min, String max,
+ boolean hasNull) {
+ OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
+ strStats.setMinimum(min);
+ strStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
+ .setHasNull(hasNull).build();
+ }
+
+ private static OrcProto.ColumnStatistics createStringStats(String min, String max) {
+ OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
+ strStats.setMinimum(min);
+ strStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createDateStats(int min, int max) {
+ OrcProto.DateStatistics.Builder dateStats = OrcProto.DateStatistics.newBuilder();
+ dateStats.setMinimum(min);
+ dateStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setDateStatistics(dateStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createTimestampStats(long min, long max) {
+ OrcProto.TimestampStatistics.Builder tsStats = OrcProto.TimestampStatistics.newBuilder();
+ tsStats.setMinimum(min);
+ tsStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setTimestampStatistics(tsStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createDecimalStats(String min, String max) {
+ OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder();
+ decStats.setMinimum(min);
+ decStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createDecimalStats(String min, String max,
+ boolean hasNull) {
+ OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder();
+ decStats.setMinimum(min);
+ decStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build())
+ .setHasNull(hasNull).build();
+ }
+
+ @Test
+ public void testGetMax() throws Exception {
+ assertEquals(100L, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
+ assertEquals(100.0d, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
+ .setMinimum(10.0d).setMaximum(100.0d).build()).build())));
+ assertEquals(null, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
+ .build())));
+ assertEquals("b", RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder()
+ .setMinimum("a").setMaximum("b").build()).build())));
+ assertEquals("world", RecordReaderImpl.getMax(ColumnStatisticsImpl
+ .deserialize(createStringStats("hello", "world"))));
+ assertEquals(HiveDecimal.create("112.1"), RecordReaderImpl.getMax(ColumnStatisticsImpl
+ .deserialize(createDecimalStats("111.1", "112.1"))));
+ }
+
+ @Test
+ public void testPredEvalWithBooleanStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+
+ pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+
+ pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", false, null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithIntStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+    // Stats get converted to the column type. "15" is outside of "10" and "100"
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+    // Integer stats will not be converted to date because of days/seconds/millis ambiguity
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithDoubleStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+    // Stats get converted to the column type. "15" is outside of "10.0" and "100.0"
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ // Double is not converted to date type because of days/seconds/millis ambiguity
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15*1000L), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150*1000L), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithStringStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 100L, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 100.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "100", null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+
+ // IllegalArgumentException is thrown when converting String to Date, hence YES_NO
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(100).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 1000), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("100"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(100), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithDateStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ // Date to Integer conversion is not possible.
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ // Date to Float conversion is also not possible.
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "1970-01-11", null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15.1", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "__a15__1", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "2000-01-16", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "1970-01-16", null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(150).get(), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ // Date to Decimal conversion is also not possible.
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15L * 24L * 60L * 60L * 1000L), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithDecimalStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ // "15" out of range of "10.0" and "100.0"
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ // Decimal to Date not possible.
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15 * 1000L), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150 * 1000L), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithTimestampStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", new Timestamp(15).toString(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10 * 24L * 60L * 60L * 1000L,
+ 100 * 24L * 60L * 60L * 1000L), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+ }
+
+ @Test
+ public void testEquals() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG,
+ "x", 15L, null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
+ }
+
+ @Test
+ public void testNullSafeEquals() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG,
+ "x", 15L, null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
+ }
+
+ @Test
+ public void testLessThan() throws Exception {
+ PredicateLeaf lessThan = createPredicateLeaf
+ (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.LONG,
+ "x", 15L, null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), lessThan, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), lessThan, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), lessThan, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), lessThan, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), lessThan, null));
+ }
+
+ @Test
+ public void testLessThanEquals() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.LONG,
+ "x", 15L, null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+ }
+
+ @Test
+ public void testIn() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(10L);
+ args.add(20L);
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
+ "x", null, args);
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 20L), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
+ }
+
+ @Test
+ public void testBetween() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(10L);
+ args.add(20L);
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG,
+ "x", null, args);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 5L), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 40L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 15L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 25L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 25L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 20L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
+ }
+
+ @Test
+ public void testIsNull() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.LONG,
+ "x", null, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ }
+
+
+ @Test
+ public void testEqualsWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING,
+ "x", "c", null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ }
+
+ @Test
+ public void testNullSafeEqualsWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING,
+ "x", "c", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ }
+
+ @Test
+ public void testLessThanWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.STRING,
+ "x", "c", null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.NO_NULL, // min, same stats
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
+ }
+
+ @Test
+ public void testLessThanEqualsWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.STRING,
+ "x", "c", null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ }
+
+ @Test
+ public void testInWithNullInStats() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add("c");
+ args.add("f");
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
+ "x", null, args);
+ assertEquals(TruthValue.NO_NULL, // before & after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ }
+
+ @Test
+ public void testBetweenWithNullInStats() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add("c");
+ args.add("f");
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.STRING,
+ "x", null, args);
+ assertEquals(TruthValue.YES_NULL, // before & after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
+ assertEquals(TruthValue.YES_NULL, // before & max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null));
+ assertEquals(TruthValue.NO_NULL, // before & before
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("h", "g", true), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL, // before & min
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("f", "g", true), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL, // before & middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "g", true), pred, null));
+
+ assertEquals(TruthValue.YES_NULL, // min & after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "e", true), pred, null));
+ assertEquals(TruthValue.YES_NULL, // min & max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "f", true), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL, // min & middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "g", true), pred, null));
+
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "c", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NULL, // min & after, same stats
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
+ }
+
+ @Test
+ public void testIsNullWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.STRING,
+ "x", null, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null));
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", false), pred, null));
+ }
+
+ @Test
+ public void testOverlap() throws Exception {
+ assertTrue(!RecordReaderUtils.overlap(0, 10, -10, -1));
+ assertTrue(RecordReaderUtils.overlap(0, 10, -1, 0));
+ assertTrue(RecordReaderUtils.overlap(0, 10, -1, 1));
+ assertTrue(RecordReaderUtils.overlap(0, 10, 2, 8));
+ assertTrue(RecordReaderUtils.overlap(0, 10, 5, 10));
+ assertTrue(RecordReaderUtils.overlap(0, 10, 10, 11));
+ assertTrue(RecordReaderUtils.overlap(0, 10, 0, 10));
+ assertTrue(RecordReaderUtils.overlap(0, 10, -1, 11));
+ assertTrue(!RecordReaderUtils.overlap(0, 10, 11, 12));
+ }
+
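+  /** Builds a DiskRangeList from consecutive (start, end) offset pairs;
+   *  callers must pass an even number of points. Used only in assertions. */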
+ private static DiskRangeList diskRanges(Integer... points) {
+ DiskRangeList head = null, tail = null;
+ for(int i = 0; i < points.length; i += 2) {
+ DiskRangeList range = new DiskRangeList(points[i], points[i+1]);
+ if (tail == null) {
+ head = tail = range;
+ } else {
+ tail = tail.insertAfter(range);
+ }
+ }
+ return head;
+ }
+
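+  // getIndexPosition returns the offset within a row-group index entry at
+  // which the given stream's positions begin; it depends on the column
+  // type and encoding, on compression, and on whether a PRESENT stream
+  // exists.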
+ @Test
+ public void testGetIndexPosition() throws Exception {
+ assertEquals(0, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
+ OrcProto.Stream.Kind.PRESENT, true, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(3, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
+ OrcProto.Stream.Kind.DATA, false, true));
+ assertEquals(0, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
+ OrcProto.Stream.Kind.DATA, true, false));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DICTIONARY, OrcProto.Type.Kind.STRING,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(3, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
+ OrcProto.Stream.Kind.DATA, false, true));
+ assertEquals(6, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
+ OrcProto.Stream.Kind.LENGTH, true, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
+ OrcProto.Stream.Kind.LENGTH, false, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(3, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
+ OrcProto.Stream.Kind.DATA, false, true));
+ assertEquals(6, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
+ OrcProto.Stream.Kind.SECONDARY, true, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
+ OrcProto.Stream.Kind.SECONDARY, false, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(3, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
+ OrcProto.Stream.Kind.DATA, false, true));
+ assertEquals(7, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
+ OrcProto.Stream.Kind.SECONDARY, true, true));
+ assertEquals(5, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
+ OrcProto.Stream.Kind.SECONDARY, false, true));
+ }
+
+ @Test
+ public void testPartialPlan() throws Exception {
+ DiskRangeList result;
+
+ // set the streams
+ List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(1).setLength(1000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(1).setLength(99000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(2).setLength(2000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(2).setLength(98000).build());
+
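+    // read columns 0 and 1 only; of the six row groups, plan for 0, 1 and 4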
+ boolean[] columns = new boolean[]{true, true, false};
+ boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
+
+ // set the index
+ OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
+ indexes[1] = OrcProto.RowIndex.newBuilder()
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(0).addPositions(-1).addPositions(-1)
+ .addPositions(0)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(100).addPositions(-1).addPositions(-1)
+ .addPositions(10000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(200).addPositions(-1).addPositions(-1)
+ .addPositions(20000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(300).addPositions(-1).addPositions(-1)
+ .addPositions(30000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(400).addPositions(-1).addPositions(-1)
+ .addPositions(40000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(500).addPositions(-1).addPositions(-1)
+ .addPositions(50000)
+ .build())
+ .build();
+
+ // set encodings
+ List<OrcProto.ColumnEncoding> encodings =
+ new ArrayList<OrcProto.ColumnEncoding>();
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+
+ // set types struct{x: int, y: int}
+ List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
+ .addSubtypes(1).addSubtypes(2).addFieldNames("x")
+ .addFieldNames("y").build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+
+ // filter by rows and groups
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(0, 1000, 100, 1000, 400, 1000,
+ 1000, 11000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, true);
+ assertThat(result, is(diskRanges(0, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
+
+ // if we read no rows, don't read any bytes
+ rowGroups = new boolean[]{false, false, false, false, false, false};
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
+ assertNull(result);
+
+ // all rows, but only columns 0 and 2.
+ rowGroups = null;
+ columns = new boolean[]{true, false, true};
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, null, false, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(100000, 102000, 102000, 200000)));
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, null, false, encodings, types, 32768, true);
+ assertThat(result, is(diskRanges(100000, 200000)));
+
+ rowGroups = new boolean[]{false, true, false, false, false, false};
+ indexes[2] = indexes[1];
+ indexes[1] = null;
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(100100, 102000,
+ 112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, true);
+ assertThat(result, is(diskRanges(100100, 102000,
+ 112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
+
+ rowGroups = new boolean[]{false, false, false, false, false, true};
+ indexes[1] = indexes[2];
+ columns = new boolean[]{true, true, true};
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(500, 1000, 51000, 100000, 100500, 102000,
+ 152000, 200000)));
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, true);
+ assertThat(result, is(diskRanges(500, 1000, 51000, 100000, 100500, 102000,
+ 152000, 200000)));
+ }
+
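Before the compressed variant below, note what changes relative to testPartialPlan: each row-group index entry carries one extra position (the compressed-chunk offset), and the worst-case over-read per selected group is measured in whole compression chunks rather than RecordReaderUtils.WORST_UNCOMPRESSED_SLOP.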
+
+ @Test
+ public void testPartialPlanCompressed() throws Exception {
+ DiskRangeList result;
+
+ // set the streams
+ List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(1).setLength(1000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(1).setLength(99000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(2).setLength(2000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(2).setLength(98000).build());
+
+ boolean[] columns = new boolean[]{true, true, false};
+ boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
+
+ // set the index
+ OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
+ indexes[1] = OrcProto.RowIndex.newBuilder()
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(0).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(0)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(100).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(10000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(200).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(20000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(300).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(30000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(400).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(40000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(500).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(50000)
+ .build())
+ .build();
+
+ // set encodings
+ List<OrcProto.ColumnEncoding> encodings =
+ new ArrayList<OrcProto.ColumnEncoding>();
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+
+ // set types struct{x: int, y: int}
+ List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
+ .addSubtypes(1).addSubtypes(2).addFieldNames("x")
+ .addFieldNames("y").build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+
+ // filter by rows and groups
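+    // with compression, worst-case slop per selected row group is two
+    // compression chunks (32768-byte buffer + 3-byte header = 32771 each)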
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, true, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(0, 1000, 100, 1000,
+ 400, 1000, 1000, 11000+(2*32771),
+ 11000, 21000+(2*32771), 41000, 100000)));
+
+ rowGroups = new boolean[]{false, false, false, false, false, true};
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, true, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(500, 1000, 51000, 100000)));
+ }
+
+ @Test
+ public void testPartialPlanString() throws Exception {
+ DiskRangeList result;
+
+ // set the streams
+ List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(1).setLength(1000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(1).setLength(94000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.LENGTH)
+ .setColumn(1).setLength(2000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DICTIONARY_DATA)
+ .setColumn(1).setLength(3000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(2).setLength(2000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(2).setLength(98000).build());
+
+ boolean[] columns = new boolean[]{true, true, false};
+ boolean[] rowGroups = new boolean[]{false, true, false, false, true, true};
+
+ // set the index
+ OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
+ indexes[1] = OrcProto.RowIndex.newBuilder()
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(0).addPositions(-1).addPositions(-1)
+ .addPositions(0)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(100).addPositions(-1).addPositions(-1)
+ .addPositions(10000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(200).addPositions(-1).addPositions(-1)
+ .addPositions(20000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(300).addPositions(-1).addPositions(-1)
+ .addPositions(30000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(400).addPositions(-1).addPositions(-1)
+ .addPositions(40000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(500).addPositions(-1).addPositions(-1)
+ .addPositions(50000)
+ .build())
+ .build();
+
+ // set encodings
+ List<OrcProto.ColumnEncoding> encodings =
+ new ArrayList<OrcProto.ColumnEncoding>();
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+
+ // set types struct{x: string, y: int}
+ List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
+ .addSubtypes(1).addSubtypes(2).addFieldNames("x")
+ .addFieldNames("y").build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+
+ // filter by rows and groups
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(100, 1000, 400, 1000, 500, 1000,
+ 11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 51000, 95000, 95000, 97000, 97000, 100000)));
+ }
+
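+  // In the bloom filter tests below the column statistics always cover the
+  // predicate constant, so the outcome hinges on the filter alone: the
+  // truth value stays NO (or NO_NULL) until the constant is added to the
+  // filter, and becomes YES_NO (or YES_NO_NULL) afterwards.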
+ @Test
+ public void testIntNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong(15);
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testIntEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong(15);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testIntInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(15L);
+ args.add(19L);
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong(19);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong(15);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDoubleNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addDouble(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addDouble(15.0);
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDoubleEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addDouble(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addDouble(15.0);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDoubleInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(15.0);
+ args.add(19.0);
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.FLOAT,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addDouble(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addDouble(19.0);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addDouble(15.0);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testStringNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString("str_" + i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString("str_15");
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testStringEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString("str_" + i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString("str_15");
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testStringInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add("str_15");
+ args.add("str_19");
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString("str_" + i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString("str_19");
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString("str_15");
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDateWritableNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x",
+ new DateWritable(15).get(), null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new DateWritable(i)).getDays());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new DateWritable(15)).getDays());
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDateWritableEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DATE, "x",
+ new DateWritable(15).get(), null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new DateWritable(i)).getDays());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new DateWritable(15)).getDays());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDateWritableInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(new DateWritable(15).get());
+ args.add(new DateWritable(19).get());
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DATE,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new DateWritable(i)).getDays());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new DateWritable(19)).getDays());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new DateWritable(15)).getDays());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testTimestampNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
+ new Timestamp(15),
+ null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new Timestamp(i)).getTime());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new Timestamp(15)).getTime());
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testTimestampEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new Timestamp(i)).getTime());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new Timestamp(15)).getTime());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testTimestampInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(new Timestamp(15));
+ args.add(new Timestamp(19));
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.TIMESTAMP,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new Timestamp(i)).getTime());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new Timestamp(19)).getTime());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new Timestamp(15)).getTime());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDecimalNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x",
+ new HiveDecimalWritable("15"),
+ null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString(HiveDecimal.create(i).toString());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(15).toString());
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDecimalEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DECIMAL, "x",
+ new HiveDecimalWritable("15"),
+ null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString(HiveDecimal.create(i).toString());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(15).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDecimalInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(new HiveDecimalWritable("15"));
+ args.add(new HiveDecimalWritable("19"));
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString(HiveDecimal.create(i).toString());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(19).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(15).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testNullsInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(new HiveDecimalWritable("15"));
+ args.add(null);
+ args.add(new HiveDecimalWritable("19"));
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString(HiveDecimal.create(i).toString());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", false));
+ // hasNull is false, so bloom filter should return NO
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", true));
+ // hasNull is true, so bloom filter should return YES_NO_NULL
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(19).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(15).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
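+  // The close() tests substitute a Mockito mock for the DataReader to check
+  // that RecordReader.close() always closes it, even when close() throws.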
+ @Test
+ public void testClose() throws Exception {
+ DataReader mockedDataReader = mock(DataReader.class);
+ closeMockedRecordReader(mockedDataReader);
+
+ verify(mockedDataReader, atLeastOnce()).close();
+ }
+
+ @Test
+ public void testCloseWithException() throws Exception {
+ DataReader mockedDataReader = mock(DataReader.class);
+ doThrow(IOException.class).when(mockedDataReader).close();
+
+ try {
+ closeMockedRecordReader(mockedDataReader);
+ fail("Exception should have been thrown when Record Reader was closed");
+ } catch (IOException expected) {
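+      // expected: the failure from DataReader.close() must propagate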
+
+ }
+
+ verify(mockedDataReader, atLeastOnce()).close();
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ private void closeMockedRecordReader(DataReader mockedDataReader) throws IOException {
+ Configuration conf = new Configuration();
+ Path path = new Path(workDir, "empty.orc");
+ FileSystem.get(conf).delete(path, true);
+ Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf)
+ .setSchema(TypeDescription.createLong()));
+ writer.close();
+ Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+
+ RecordReader recordReader = reader.rows(new Reader.Options()
+ .dataReader(mockedDataReader));
+
+ recordReader.close();
+ }
+}
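
As a minimal standalone sketch of the min/max pruning idea these predicate
tests exercise (simplified and hypothetical -- the class below is not the
Hive implementation, and the real evaluator also handles nulls, bloom
filters, and type conversion):

public class MinMaxPruneSketch {
  enum Truth { YES, NO, YES_NO }

  /** Can rows whose values lie in [min, max] satisfy "x = constant"? */
  static Truth equalsTruth(String min, String max, String constant) {
    if (constant.compareTo(min) < 0 || constant.compareTo(max) > 0) {
      return Truth.NO;        // constant outside the range: skip the group
    }
    if (min.equals(max)) {
      return Truth.YES;       // every value in the group equals the constant
    }
    return Truth.YES_NO;      // range overlaps: the data must be read
  }

  public static void main(String[] args) {
    System.out.println(equalsTruth("c", "d", "a")); // NO
    System.out.println(equalsTruth("c", "c", "c")); // YES
    System.out.println(equalsTruth("b", "d", "c")); // YES_NO
  }
}

The three-valued result is what makes pushdown safe: only a NO (or NO_NULL)
answer lets the reader skip a row group outright; everything else falls back
to reading the data.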
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/impl/TestStreamName.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestStreamName.java b/orc/src/test/org/apache/orc/impl/TestStreamName.java
new file mode 100644
index 0000000..be58d4c
--- /dev/null
+++ b/orc/src/test/org/apache/orc/impl/TestStreamName.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import org.apache.orc.OrcProto;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestStreamName {
+
+ @Test
+ public void test1() throws Exception {
+ StreamName s1 = new StreamName(3, OrcProto.Stream.Kind.DATA);
+ StreamName s2 = new StreamName(3,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ StreamName s3 = new StreamName(5, OrcProto.Stream.Kind.DATA);
+ StreamName s4 = new StreamName(5,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ StreamName s1p = new StreamName(3, OrcProto.Stream.Kind.DATA);
+ assertEquals(true, s1.equals(s1));
+ assertEquals(false, s1.equals(s2));
+ assertEquals(false, s1.equals(s3));
+ assertEquals(true, s1.equals(s1p));
+ assertEquals(true, s1.compareTo(null) < 0);
+ assertEquals(false, s1.equals(null));
+ assertEquals(true, s1.compareTo(s2) < 0);
+ assertEquals(true, s2.compareTo(s3) < 0);
+ assertEquals(true, s3.compareTo(s4) < 0);
+ assertEquals(true, s4.compareTo(s1p) > 0);
+ assertEquals(0, s1p.compareTo(s1));
+ }
+}
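
The ordering this test pins down -- by column first, then by stream kind,
with null sorting last -- can be sketched as a plain Comparable key. This is
a simplified, hypothetical stand-in, not the org.apache.orc.impl.StreamName
source; the real class also distinguishes index streams from data streams:

public class StreamKeySketch {
  enum Kind { PRESENT, DATA, LENGTH, DICTIONARY_DATA }

  static final class StreamKey implements Comparable<StreamKey> {
    final int column;
    final Kind kind;
    StreamKey(int column, Kind kind) { this.column = column; this.kind = kind; }

    @Override public int compareTo(StreamKey other) {
      if (other == null) return -1;                    // nulls sort last
      if (column != other.column) {
        return Integer.compare(column, other.column);  // column id first
      }
      return kind.compareTo(other.kind);               // then kind order
    }
  }

  public static void main(String[] args) {
    StreamKey a = new StreamKey(3, Kind.DATA);
    StreamKey b = new StreamKey(3, Kind.DICTIONARY_DATA);
    StreamKey c = new StreamKey(5, Kind.DATA);
    System.out.println(a.compareTo(b) < 0);    // true: same column, DATA first
    System.out.println(b.compareTo(c) < 0);    // true: column 3 before 5
    System.out.println(a.compareTo(null) < 0); // true: null sorts last
  }
}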
[04/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorOrcFile.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorOrcFile.java
deleted file mode 100644
index 6589692..0000000
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorOrcFile.java
+++ /dev/null
@@ -1,2791 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.io.orc;
-
-import com.google.common.collect.Lists;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hive.common.util.HiveTestUtils;
-import org.apache.orc.BinaryColumnStatistics;
-import org.apache.orc.BooleanColumnStatistics;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.DataReader;
-import org.apache.orc.DecimalColumnStatistics;
-import org.apache.orc.DoubleColumnStatistics;
-import org.apache.orc.IntegerColumnStatistics;
-import org.apache.orc.impl.DataReaderProperties;
-import org.apache.orc.impl.MemoryManager;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.OrcProto;
-import org.apache.orc.OrcUtils;
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.StripeStatistics;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-import java.io.File;
-import java.io.IOException;
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.sql.Date;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-
-import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.assertNotNull;
-import static junit.framework.Assert.assertNull;
-import static junit.framework.Assert.assertTrue;
-
-/**
- * Tests for the vectorized reader and writer for ORC files.
- */
-public class TestVectorOrcFile {
-
- public static class InnerStruct {
- int int1;
- Text string1 = new Text();
- InnerStruct(int int1, Text string1) {
- this.int1 = int1;
- this.string1.set(string1);
- }
- InnerStruct(int int1, String string1) {
- this.int1 = int1;
- this.string1.set(string1);
- }
-
- public String toString() {
- return "{" + int1 + ", " + string1 + "}";
- }
- }
-
- public static class MiddleStruct {
- List<InnerStruct> list = new ArrayList<InnerStruct>();
-
- MiddleStruct(InnerStruct... items) {
- list.clear();
- list.addAll(Arrays.asList(items));
- }
- }
-
- private static InnerStruct inner(int i, String s) {
- return new InnerStruct(i, s);
- }
-
- private static Map<String, InnerStruct> map(InnerStruct... items) {
- Map<String, InnerStruct> result = new HashMap<String, InnerStruct>();
- for(InnerStruct i: items) {
- result.put(i.string1.toString(), i);
- }
- return result;
- }
-
- private static List<InnerStruct> list(InnerStruct... items) {
- List<InnerStruct> result = new ArrayList<InnerStruct>();
- result.addAll(Arrays.asList(items));
- return result;
- }
-
- private static BytesWritable bytes(int... items) {
- BytesWritable result = new BytesWritable();
- result.setSize(items.length);
- for(int i=0; i < items.length; ++i) {
- result.getBytes()[i] = (byte) items[i];
- }
- return result;
- }
-
- private static byte[] bytesArray(int... items) {
- byte[] result = new byte[items.length];
- for(int i=0; i < items.length; ++i) {
- result[i] = (byte) items[i];
- }
- return result;
- }
-
- private static ByteBuffer byteBuf(int... items) {
- ByteBuffer result = ByteBuffer.allocate(items.length);
- for(int item: items) {
- result.put((byte) item);
- }
- result.flip();
- return result;
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem () throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestVectorOrcFile." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testReadFormat_0_11() throws Exception {
- Path oldFilePath =
- new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc"));
- Reader reader = OrcFile.createReader(oldFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- int stripeCount = 0;
- int rowCount = 0;
- long currentOffset = -1;
- for(StripeInformation stripe : reader.getStripes()) {
- stripeCount += 1;
- rowCount += stripe.getNumberOfRows();
- if (currentOffset < 0) {
- currentOffset = stripe.getOffset() + stripe.getIndexLength()
- + stripe.getDataLength() + stripe.getFooterLength();
- } else {
- assertEquals(currentOffset, stripe.getOffset());
- currentOffset += stripe.getIndexLength() + stripe.getDataLength()
- + stripe.getFooterLength();
- }
- }
- assertEquals(reader.getNumberOfRows(), rowCount);
- assertEquals(2, stripeCount);
-
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(7500, stats[1].getNumberOfValues());
- assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
- assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
- assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
-
- assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
- assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
- assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
- assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000",
- stats[3].toString());
-
- assertEquals(Long.MAX_VALUE,
- ((IntegerColumnStatistics) stats[5]).getMaximum());
- assertEquals(Long.MAX_VALUE,
- ((IntegerColumnStatistics) stats[5]).getMinimum());
- assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
- assertEquals(
- "count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807",
- stats[5].toString());
-
- assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum());
- assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum());
- assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(),
- 0.00001);
- assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0",
- stats[7].toString());
-
- assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
-
- // check the inspectors
- TypeDescription schema = reader.getSchema();
- assertEquals(TypeDescription.Category.STRUCT, schema.getCategory());
- assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint,"
- + "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
- + "binary,string1:string,middle:struct<list:array<struct<int1:int,"
- + "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
- + "map:map<string,struct<int1:int,string1:string>>,ts:timestamp,"
- + "decimal1:decimal(38,10)>", schema.toString());
- VectorizedRowBatch batch = schema.createRowBatch();
-
- RecordReader rows = reader.rows();
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(1024, batch.size);
-
- // check the contents of the first row
- assertEquals(false, getBoolean(batch, 0));
- assertEquals(1, getByte(batch, 0));
- assertEquals(1024, getShort(batch, 0));
- assertEquals(65536, getInt(batch, 0));
- assertEquals(Long.MAX_VALUE, getLong(batch, 0));
- assertEquals(1.0, getFloat(batch, 0), 0.00001);
- assertEquals(-15.0, getDouble(batch, 0), 0.00001);
- assertEquals(bytes(0, 1, 2, 3, 4), getBinary(batch, 0));
- assertEquals("hi", getText(batch, 0).toString());
-
- StructColumnVector middle = (StructColumnVector) batch.cols[9];
- ListColumnVector midList = (ListColumnVector) middle.fields[0];
- StructColumnVector midListStruct = (StructColumnVector) midList.child;
- LongColumnVector midListInt = (LongColumnVector) midListStruct.fields[0];
- BytesColumnVector midListStr = (BytesColumnVector) midListStruct.fields[1];
- ListColumnVector list = (ListColumnVector) batch.cols[10];
- StructColumnVector listStruct = (StructColumnVector) list.child;
- LongColumnVector listInts = (LongColumnVector) listStruct.fields[0];
- BytesColumnVector listStrs = (BytesColumnVector) listStruct.fields[1];
- MapColumnVector map = (MapColumnVector) batch.cols[11];
- BytesColumnVector mapKey = (BytesColumnVector) map.keys;
- StructColumnVector mapValue = (StructColumnVector) map.values;
- LongColumnVector mapValueInts = (LongColumnVector) mapValue.fields[0];
- BytesColumnVector mapValueStrs = (BytesColumnVector) mapValue.fields[1];
- TimestampColumnVector timestamp = (TimestampColumnVector) batch.cols[12];
- DecimalColumnVector decs = (DecimalColumnVector) batch.cols[13];
-
- assertEquals(false, middle.isNull[0]);
- assertEquals(2, midList.lengths[0]);
- int start = (int) midList.offsets[0];
- assertEquals(1, midListInt.vector[start]);
- assertEquals("bye", midListStr.toString(start));
- assertEquals(2, midListInt.vector[start + 1]);
- assertEquals("sigh", midListStr.toString(start + 1));
-
- assertEquals(2, list.lengths[0]);
- start = (int) list.offsets[0];
- assertEquals(3, listInts.vector[start]);
- assertEquals("good", listStrs.toString(start));
- assertEquals(4, listInts.vector[start + 1]);
- assertEquals("bad", listStrs.toString(start + 1));
- assertEquals(0, map.lengths[0]);
- assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
- timestamp.asScratchTimestamp(0));
- assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")),
- decs.vector[0]);
-
- // check the contents of row 7499
- rows.seekToRow(7499);
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(true, getBoolean(batch, 0));
- assertEquals(100, getByte(batch, 0));
- assertEquals(2048, getShort(batch, 0));
- assertEquals(65536, getInt(batch, 0));
- assertEquals(Long.MAX_VALUE, getLong(batch, 0));
- assertEquals(2.0, getFloat(batch, 0), 0.00001);
- assertEquals(-5.0, getDouble(batch, 0), 0.00001);
- assertEquals(bytes(), getBinary(batch, 0));
- assertEquals("bye", getText(batch, 0).toString());
- assertEquals(false, middle.isNull[0]);
- assertEquals(2, midList.lengths[0]);
- start = (int) midList.offsets[0];
- assertEquals(1, midListInt.vector[start]);
- assertEquals("bye", midListStr.toString(start));
- assertEquals(2, midListInt.vector[start + 1]);
- assertEquals("sigh", midListStr.toString(start + 1));
- assertEquals(3, list.lengths[0]);
- start = (int) list.offsets[0];
- assertEquals(100000000, listInts.vector[start]);
- assertEquals("cat", listStrs.toString(start));
- assertEquals(-100000, listInts.vector[start + 1]);
- assertEquals("in", listStrs.toString(start + 1));
- assertEquals(1234, listInts.vector[start + 2]);
- assertEquals("hat", listStrs.toString(start + 2));
- assertEquals(2, map.lengths[0]);
- start = (int) map.offsets[0];
- assertEquals("chani", mapKey.toString(start));
- assertEquals(5, mapValueInts.vector[start]);
- assertEquals("chani", mapValueStrs.toString(start));
- assertEquals("mauddib", mapKey.toString(start + 1));
- assertEquals(1, mapValueInts.vector[start + 1]);
- assertEquals("mauddib", mapValueStrs.toString(start + 1));
- assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"),
- timestamp.asScratchTimestamp(0));
- assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547457")),
- decs.vector[0]);
-
- // handle the close up
- assertEquals(false, rows.nextBatch(batch));
- rows.close();
- }
-
- @Test
- public void testTimestamp() throws Exception {
- TypeDescription schema = TypeDescription.createTimestamp();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
- .bufferSize(10000).version(org.apache.orc.OrcFile.Version.V_0_11));
- List<Timestamp> tslist = Lists.newArrayList();
- tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.000999"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.000000222"));
- tslist.add(Timestamp.valueOf("1999-01-01 00:00:00.999999999"));
- tslist.add(Timestamp.valueOf("1995-01-01 00:00:00.688888888"));
- tslist.add(Timestamp.valueOf("2002-01-01 00:00:00.1"));
- tslist.add(Timestamp.valueOf("2010-03-02 00:00:00.000009001"));
- tslist.add(Timestamp.valueOf("2005-01-01 00:00:00.000002229"));
- tslist.add(Timestamp.valueOf("2006-01-01 00:00:00.900203003"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.800000007"));
- tslist.add(Timestamp.valueOf("1996-08-02 00:00:00.723100809"));
- tslist.add(Timestamp.valueOf("1998-11-02 00:00:00.857340643"));
- tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
-
- VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024);
- TimestampColumnVector vec = new TimestampColumnVector(1024);
- batch.cols[0] = vec;
- batch.reset();
- batch.size = tslist.size();
- for (int i=0; i < tslist.size(); ++i) {
- Timestamp ts = tslist.get(i);
- vec.set(i, ts);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- TimestampColumnVector timestamps = (TimestampColumnVector) batch.cols[0];
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(tslist.get(idx++).getNanos(),
- timestamps.asScratchTimestamp(r).getNanos());
- }
- }
- assertEquals(tslist.size(), rows.getRowNumber());
- assertEquals(0, writer.getSchema().getMaximumId());
- boolean[] expected = new boolean[] {false};
- boolean[] included = OrcUtils.includeColumns("", writer.getSchema());
- assertEquals(true, Arrays.equals(expected, included));
- }
-
- @Test
- public void testStringAndBinaryStatistics() throws Exception {
-
- TypeDescription schema = TypeDescription.createStruct()
- .addField("bytes1", TypeDescription.createBinary())
- .addField("string1", TypeDescription.createString());
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 4;
- BytesColumnVector field1 = (BytesColumnVector) batch.cols[0];
- BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
- field1.setVal(0, bytesArray(0, 1, 2, 3, 4));
- field1.setVal(1, bytesArray(0, 1, 2, 3));
- field1.setVal(2, bytesArray(0, 1, 2, 3, 4, 5));
- field1.noNulls = false;
- field1.isNull[3] = true;
- field2.setVal(0, "foo".getBytes());
- field2.setVal(1, "bar".getBytes());
- field2.noNulls = false;
- field2.isNull[2] = true;
- field2.setVal(3, "hi".getBytes());
- writer.addRowBatch(batch);
- writer.close();
- schema = writer.getSchema();
- assertEquals(2, schema.getMaximumId());
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- boolean[] expected = new boolean[] {false, false, true};
- boolean[] included = OrcUtils.includeColumns("string1", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, false, false};
- included = OrcUtils.includeColumns("", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, false, false};
- included = OrcUtils.includeColumns(null, schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(4, stats[0].getNumberOfValues());
- assertEquals("count: 4 hasNull: false", stats[0].toString());
-
- assertEquals(3, stats[1].getNumberOfValues());
- assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
- assertEquals("count: 3 hasNull: true sum: 15", stats[1].toString());
-
- assertEquals(3, stats[2].getNumberOfValues());
- assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
- assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum());
- assertEquals(8, ((StringColumnStatistics) stats[2]).getSum());
- assertEquals("count: 3 hasNull: true min: bar max: hi sum: 8",
- stats[2].toString());
-
- // check the inspectors
- batch = reader.getSchema().createRowBatch();
- BytesColumnVector bytes = (BytesColumnVector) batch.cols[0];
- BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
- RecordReader rows = reader.rows();
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(4, batch.size);
-
- // check the contents of the first row
- assertEquals(bytes(0,1,2,3,4), getBinary(bytes, 0));
- assertEquals("foo", strs.toString(0));
-
- // check the contents of second row
- assertEquals(bytes(0,1,2,3), getBinary(bytes, 1));
- assertEquals("bar", strs.toString(1));
-
- // check the contents of third row
- assertEquals(bytes(0,1,2,3,4,5), getBinary(bytes, 2));
- assertNull(strs.toString(2));
-
- // check the contents of fourth row
- assertNull(getBinary(bytes, 3));
- assertEquals("hi", strs.toString(3));
-
- // handle the close up
- assertEquals(false, rows.hasNext());
- rows.close();
- }
-
-
- @Test
- public void testStripeLevelStats() throws Exception {
- TypeDescription schema = TypeDescription.createStruct()
- .addField("int1", TypeDescription.createInt())
- .addField("string1", TypeDescription.createString());
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 1000;
- LongColumnVector field1 = (LongColumnVector) batch.cols[0];
- BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
- field1.isRepeating = true;
- field2.isRepeating = true;
- for (int b = 0; b < 11; b++) {
- if (b >= 5) {
- if (b >= 10) {
- field1.vector[0] = 3;
- field2.setVal(0, "three".getBytes());
- } else {
- field1.vector[0] = 2;
- field2.setVal(0, "two".getBytes());
- }
- } else {
- field1.vector[0] = 1;
- field2.setVal(0, "one".getBytes());
- }
- writer.addRowBatch(batch);
- }
-
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- schema = writer.getSchema();
- assertEquals(2, schema.getMaximumId());
- boolean[] expected = new boolean[] {false, true, false};
- boolean[] included = OrcUtils.includeColumns("int1", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- List<StripeStatistics> stats = reader.getStripeStatistics();
- int numStripes = stats.size();
- assertEquals(3, numStripes);
- StripeStatistics ss1 = stats.get(0);
- StripeStatistics ss2 = stats.get(1);
- StripeStatistics ss3 = stats.get(2);
-
- assertEquals(5000, ss1.getColumnStatistics()[0].getNumberOfValues());
- assertEquals(5000, ss2.getColumnStatistics()[0].getNumberOfValues());
- assertEquals(1000, ss3.getColumnStatistics()[0].getNumberOfValues());
-
- assertEquals(5000, (ss1.getColumnStatistics()[1]).getNumberOfValues());
- assertEquals(5000, (ss2.getColumnStatistics()[1]).getNumberOfValues());
- assertEquals(1000, (ss3.getColumnStatistics()[1]).getNumberOfValues());
- assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMinimum());
- assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMinimum());
- assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMinimum());
- assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMaximum());
- assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMaximum());
- assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMaximum());
- assertEquals(5000, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getSum());
- assertEquals(10000, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getSum());
- assertEquals(3000, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getSum());
-
- assertEquals(5000, (ss1.getColumnStatistics()[2]).getNumberOfValues());
- assertEquals(5000, (ss2.getColumnStatistics()[2]).getNumberOfValues());
- assertEquals(1000, (ss3.getColumnStatistics()[2]).getNumberOfValues());
- assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMinimum());
- assertEquals("two", ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getMinimum());
- assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMinimum());
- assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMaximum());
- assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMaximum());
- assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMaximum());
- assertEquals(15000, ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getSum());
- assertEquals(15000, ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getSum());
- assertEquals(5000, ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getSum());
-
- RecordReaderImpl recordReader = (RecordReaderImpl) reader.rows();
- OrcProto.RowIndex[] index = recordReader.readRowIndex(0, null, null).getRowGroupIndex();
- assertEquals(3, index.length);
- List<OrcProto.RowIndexEntry> items = index[1].getEntryList();
- assertEquals(1, items.size());
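- // each index entry holds the seek positions for its row group (for this
- // compressed int column, presumably the compressed-chunk offset, the offset
- // in the decompressed block, and the offset into the RLE run) plus the
- // row-group statistics; the first row group starts at 0 for all three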
- assertEquals(3, items.get(0).getPositionsCount());
- assertEquals(0, items.get(0).getPositions(0));
- assertEquals(0, items.get(0).getPositions(1));
- assertEquals(0, items.get(0).getPositions(2));
- assertEquals(1,
- items.get(0).getStatistics().getIntStatistics().getMinimum());
- index = recordReader.readRowIndex(1, null, null).getRowGroupIndex();
- assertEquals(3, index.length);
- items = index[1].getEntryList();
- assertEquals(2,
- items.get(0).getStatistics().getIntStatistics().getMaximum());
- }
-
- private static void setInner(StructColumnVector inner, int rowId,
- int i, String value) {
- ((LongColumnVector) inner.fields[0]).vector[rowId] = i;
- if (value != null) {
- ((BytesColumnVector) inner.fields[1]).setVal(rowId, value.getBytes());
- } else {
- inner.fields[1].isNull[rowId] = true;
- inner.fields[1].noNulls = false;
- }
- }
-
- private static void checkInner(StructColumnVector inner, int rowId,
- int rowInBatch, int i, String value) {
- assertEquals("row " + rowId, i,
- ((LongColumnVector) inner.fields[0]).vector[rowInBatch]);
- if (value != null) {
- assertEquals("row " + rowId, value,
- ((BytesColumnVector) inner.fields[1]).toString(rowInBatch));
- } else {
- assertEquals("row " + rowId, true, inner.fields[1].isNull[rowInBatch]);
- assertEquals("row " + rowId, false, inner.fields[1].noNulls);
- }
- }
-
- private static void setInnerList(ListColumnVector list, int rowId,
- List<InnerStruct> value) {
- if (value != null) {
- if (list.childCount + value.size() > list.child.isNull.length) {
- list.child.ensureSize(list.childCount * 2, true);
- }
- list.lengths[rowId] = value.size();
- list.offsets[rowId] = list.childCount;
- for (int i = 0; i < list.lengths[rowId]; ++i) {
- InnerStruct inner = value.get(i);
- setInner((StructColumnVector) list.child, i + list.childCount,
- inner.int1, inner.string1.toString());
- }
- list.childCount += value.size();
- } else {
- list.isNull[rowId] = true;
- list.noNulls = false;
- }
- }
-
- private static void checkInnerList(ListColumnVector list, int rowId,
- int rowInBatch, List<InnerStruct> value) {
- if (value != null) {
- assertEquals("row " + rowId, value.size(), list.lengths[rowInBatch]);
- int start = (int) list.offsets[rowInBatch];
- for (int i = 0; i < list.lengths[rowInBatch]; ++i) {
- InnerStruct inner = value.get(i);
- checkInner((StructColumnVector) list.child, rowId, i + start,
- inner.int1, inner.string1.toString());
- }
- // (note: unlike setInnerList, this check method must not mutate childCount)
- } else {
- assertEquals("row " + rowId, true, list.isNull[rowInBatch]);
- assertEquals("row " + rowId, false, list.noNulls);
- }
- }
-
- private static void setInnerMap(MapColumnVector map, int rowId,
- Map<String, InnerStruct> value) {
- if (value != null) {
- if (map.childCount + value.size() > map.keys.isNull.length) {
- map.keys.ensureSize(map.childCount * 2, true);
- map.values.ensureSize(map.childCount * 2, true);
- }
- map.lengths[rowId] = value.size();
- int offset = map.childCount;
- map.offsets[rowId] = offset;
-
- for (Map.Entry<String, InnerStruct> entry : value.entrySet()) {
- ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes());
- InnerStruct inner = entry.getValue();
- setInner((StructColumnVector) map.values, offset, inner.int1,
- inner.string1.toString());
- offset += 1;
- }
- map.childCount = offset;
- } else {
- map.isNull[rowId] = true;
- map.noNulls = false;
- }
- }
-
- private static void checkInnerMap(MapColumnVector map, int rowId,
- int rowInBatch,
- Map<String, InnerStruct> value) {
- if (value != null) {
- assertEquals("row " + rowId, value.size(), map.lengths[rowInBatch]);
- int offset = (int) map.offsets[rowInBatch];
- for(int i=0; i < value.size(); ++i) {
- String key = ((BytesColumnVector) map.keys).toString(offset + i);
- InnerStruct expected = value.get(key);
- checkInner((StructColumnVector) map.values, rowId, offset + i,
- expected.int1, expected.string1.toString());
- }
- } else {
- assertEquals("row " + rowId, true, map.isNull[rowId]);
- assertEquals("row " + rowId, false, map.noNulls);
- }
- }
-
- private static void setMiddleStruct(StructColumnVector middle, int rowId,
- MiddleStruct value) {
- if (value != null) {
- setInnerList((ListColumnVector) middle.fields[0], rowId, value.list);
- } else {
- middle.isNull[rowId] = true;
- middle.noNulls = false;
- }
- }
-
- private static void checkMiddleStruct(StructColumnVector middle, int rowId,
- int rowInBatch, MiddleStruct value) {
- if (value != null) {
- checkInnerList((ListColumnVector) middle.fields[0], rowId, rowInBatch,
- value.list);
- } else {
- assertEquals("row " + rowId, true, middle.isNull[rowInBatch]);
- assertEquals("row " + rowId, false, middle.noNulls);
- }
- }
-
- private static void setBigRow(VectorizedRowBatch batch, int rowId,
- Boolean b1, Byte b2, Short s1,
- Integer i1, Long l1, Float f1,
- Double d1, BytesWritable b3, String s2,
- MiddleStruct m1, List<InnerStruct> l2,
- Map<String, InnerStruct> m2) {
- ((LongColumnVector) batch.cols[0]).vector[rowId] = b1 ? 1 : 0;
- ((LongColumnVector) batch.cols[1]).vector[rowId] = b2;
- ((LongColumnVector) batch.cols[2]).vector[rowId] = s1;
- ((LongColumnVector) batch.cols[3]).vector[rowId] = i1;
- ((LongColumnVector) batch.cols[4]).vector[rowId] = l1;
- ((DoubleColumnVector) batch.cols[5]).vector[rowId] = f1;
- ((DoubleColumnVector) batch.cols[6]).vector[rowId] = d1;
- if (b3 != null) {
- ((BytesColumnVector) batch.cols[7]).setVal(rowId, b3.getBytes(), 0,
- b3.getLength());
- } else {
- batch.cols[7].isNull[rowId] = true;
- batch.cols[7].noNulls = false;
- }
- if (s2 != null) {
- ((BytesColumnVector) batch.cols[8]).setVal(rowId, s2.getBytes());
- } else {
- batch.cols[8].isNull[rowId] = true;
- batch.cols[8].noNulls = false;
- }
- setMiddleStruct((StructColumnVector) batch.cols[9], rowId, m1);
- setInnerList((ListColumnVector) batch.cols[10], rowId, l2);
- setInnerMap((MapColumnVector) batch.cols[11], rowId, m2);
- }
-
- private static void checkBigRow(VectorizedRowBatch batch,
- int rowInBatch,
- int rowId,
- boolean b1, byte b2, short s1,
- int i1, long l1, float f1,
- double d1, BytesWritable b3, String s2,
- MiddleStruct m1, List<InnerStruct> l2,
- Map<String, InnerStruct> m2) {
- assertEquals("row " + rowId, b1, getBoolean(batch, rowInBatch));
- assertEquals("row " + rowId, b2, getByte(batch, rowInBatch));
- assertEquals("row " + rowId, s1, getShort(batch, rowInBatch));
- assertEquals("row " + rowId, i1, getInt(batch, rowInBatch));
- assertEquals("row " + rowId, l1, getLong(batch, rowInBatch));
- assertEquals("row " + rowId, f1, getFloat(batch, rowInBatch), 0.0001);
- assertEquals("row " + rowId, d1, getDouble(batch, rowInBatch), 0.0001);
- if (b3 != null) {
- BytesColumnVector bytes = (BytesColumnVector) batch.cols[7];
- assertEquals("row " + rowId, b3.getLength(), bytes.length[rowInBatch]);
- for(int i=0; i < b3.getLength(); ++i) {
- assertEquals("row " + rowId + " byte " + i, b3.getBytes()[i],
- bytes.vector[rowInBatch][bytes.start[rowInBatch] + i]);
- }
- } else {
- assertEquals("row " + rowId, true, batch.cols[7].isNull[rowInBatch]);
- assertEquals("row " + rowId, false, batch.cols[7].noNulls);
- }
- if (s2 != null) {
- assertEquals("row " + rowId, s2, getText(batch, rowInBatch).toString());
- } else {
- assertEquals("row " + rowId, true, batch.cols[8].isNull[rowInBatch]);
- assertEquals("row " + rowId, false, batch.cols[8].noNulls);
- }
- checkMiddleStruct((StructColumnVector) batch.cols[9], rowId, rowInBatch,
- m1);
- checkInnerList((ListColumnVector) batch.cols[10], rowId, rowInBatch, l2);
- checkInnerMap((MapColumnVector) batch.cols[11], rowId, rowInBatch, m2);
- }
-
- private static boolean getBoolean(VectorizedRowBatch batch, int rowId) {
- return ((LongColumnVector) batch.cols[0]).vector[rowId] != 0;
- }
-
- private static byte getByte(VectorizedRowBatch batch, int rowId) {
- return (byte) ((LongColumnVector) batch.cols[1]).vector[rowId];
- }
-
- private static short getShort(VectorizedRowBatch batch, int rowId) {
- return (short) ((LongColumnVector) batch.cols[2]).vector[rowId];
- }
-
- private static int getInt(VectorizedRowBatch batch, int rowId) {
- return (int) ((LongColumnVector) batch.cols[3]).vector[rowId];
- }
-
- private static long getLong(VectorizedRowBatch batch, int rowId) {
- return ((LongColumnVector) batch.cols[4]).vector[rowId];
- }
-
- private static float getFloat(VectorizedRowBatch batch, int rowId) {
- return (float) ((DoubleColumnVector) batch.cols[5]).vector[rowId];
- }
-
- private static double getDouble(VectorizedRowBatch batch, int rowId) {
- return ((DoubleColumnVector) batch.cols[6]).vector[rowId];
- }
-
- private static BytesWritable getBinary(BytesColumnVector column, int rowId) {
- if (column.isRepeating) {
- rowId = 0;
- }
- if (column.noNulls || !column.isNull[rowId]) {
- return new BytesWritable(Arrays.copyOfRange(column.vector[rowId],
- column.start[rowId], column.start[rowId] + column.length[rowId]));
- } else {
- return null;
- }
- }
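-
- // By ColumnVector convention, when isRepeating is set only entry 0 is
- // populated, and isNull[i] is only meaningful once noNulls is false;
- // the accessors here rely on both rules.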
-
- private static BytesWritable getBinary(VectorizedRowBatch batch, int rowId) {
- return getBinary((BytesColumnVector) batch.cols[7], rowId);
- }
-
- private static Text getText(BytesColumnVector vector, int rowId) {
- if (vector.isRepeating) {
- rowId = 0;
- }
- if (vector.noNulls || !vector.isNull[rowId]) {
- return new Text(Arrays.copyOfRange(vector.vector[rowId],
- vector.start[rowId], vector.start[rowId] + vector.length[rowId]));
- } else {
- return null;
- }
- }
-
- private static Text getText(VectorizedRowBatch batch, int rowId) {
- return getText((BytesColumnVector) batch.cols[8], rowId);
- }
-
- private static InnerStruct getInner(StructColumnVector vector,
- int rowId) {
- return new InnerStruct(
- (int) ((LongColumnVector) vector.fields[0]).vector[rowId],
- getText((BytesColumnVector) vector.fields[1], rowId));
- }
-
- private static List<InnerStruct> getList(ListColumnVector cv,
- int rowId) {
- if (cv.isRepeating) {
- rowId = 0;
- }
- if (cv.noNulls || !cv.isNull[rowId]) {
- List<InnerStruct> result =
- new ArrayList<InnerStruct>((int) cv.lengths[rowId]);
- for(long i=cv.offsets[rowId];
- i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
- result.add(getInner((StructColumnVector) cv.child, (int) i));
- }
- return result;
- } else {
- return null;
- }
- }
-
- private static List<InnerStruct> getMidList(VectorizedRowBatch batch,
- int rowId) {
- return getList((ListColumnVector) ((StructColumnVector) batch.cols[9])
- .fields[0], rowId);
- }
-
- private static List<InnerStruct> getList(VectorizedRowBatch batch,
- int rowId) {
- return getList((ListColumnVector) batch.cols[10], rowId);
- }
-
- private static Map<Text, InnerStruct> getMap(VectorizedRowBatch batch,
- int rowId) {
- MapColumnVector cv = (MapColumnVector) batch.cols[11];
- if (cv.isRepeating) {
- rowId = 0;
- }
- if (cv.noNulls || !cv.isNull[rowId]) {
- Map<Text, InnerStruct> result =
- new HashMap<Text, InnerStruct>((int) cv.lengths[rowId]);
- for(long i=cv.offsets[rowId];
- i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
- result.put(getText((BytesColumnVector) cv.keys, (int) i),
- getInner((StructColumnVector) cv.values, (int) i));
- }
- return result;
- } else {
- return null;
- }
- }
-
- private static TypeDescription createInnerSchema() {
- return TypeDescription.createStruct()
- .addField("int1", TypeDescription.createInt())
- .addField("string1", TypeDescription.createString());
- }
-
- private static TypeDescription createBigRowSchema() {
- return TypeDescription.createStruct()
- .addField("boolean1", TypeDescription.createBoolean())
- .addField("byte1", TypeDescription.createByte())
- .addField("short1", TypeDescription.createShort())
- .addField("int1", TypeDescription.createInt())
- .addField("long1", TypeDescription.createLong())
- .addField("float1", TypeDescription.createFloat())
- .addField("double1", TypeDescription.createDouble())
- .addField("bytes1", TypeDescription.createBinary())
- .addField("string1", TypeDescription.createString())
- .addField("middle", TypeDescription.createStruct()
- .addField("list", TypeDescription.createList(createInnerSchema())))
- .addField("list", TypeDescription.createList(createInnerSchema()))
- .addField("map", TypeDescription.createMap(
- TypeDescription.createString(),
- createInnerSchema()));
- }
-
- static void assertArrayEquals(boolean[] expected, boolean[] actual) {
- assertEquals(expected.length, actual.length);
- boolean diff = false;
- for(int i=0; i < expected.length; ++i) {
- if (expected[i] != actual[i]) {
- System.out.println("Difference at " + i + " expected: " + expected[i] +
- " actual: " + actual[i]);
- diff = true;
- }
- }
- assertEquals(false, diff);
- }
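-
- // (JUnit's built-in assertArrayEquals only gained a boolean[] overload in
- // 4.12; this local helper also reports every mismatching index instead of
- // stopping at the first difference)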
-
- @Test
- public void test1() throws Exception {
- TypeDescription schema = createBigRowSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 2;
- setBigRow(batch, 0, false, (byte) 1, (short) 1024, 65536,
- Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
- new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
- list(inner(3, "good"), inner(4, "bad")),
- map());
- setBigRow(batch, 1, true, (byte) 100, (short) 2048, 65536,
- Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye",
- new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
- list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
- map(inner(5, "chani"), inner(1, "mauddib")));
- writer.addRowBatch(batch);
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- schema = writer.getSchema();
- assertEquals(23, schema.getMaximumId());
- boolean[] expected = new boolean[] {false, false, false, false, false,
- false, false, false, false, false,
- false, false, false, false, false,
- false, false, false, false, false,
- false, false, false, false};
- boolean[] included = OrcUtils.includeColumns("", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, true, false, false, false,
- false, false, false, false, true,
- true, true, true, true, true,
- false, false, false, false, true,
- true, true, true, true};
- included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema);
- assertArrayEquals(expected, included);
-
- expected = new boolean[] {false, true, true, true, true,
- true, true, true, true, true,
- true, true, true, true, true,
- true, true, true, true, true,
- true, true, true, true};
- included = OrcUtils.includeColumns(
- "boolean1,byte1,short1,int1,long1,float1,double1,bytes1,string1,middle,list,map",
- schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(2, stats[1].getNumberOfValues());
- assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
- assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
- assertEquals("count: 2 hasNull: false true: 1", stats[1].toString());
-
- assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
- assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
- assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
- assertEquals("count: 2 hasNull: false min: 1024 max: 2048 sum: 3072",
- stats[3].toString());
-
- StripeStatistics ss = reader.getStripeStatistics().get(0);
- assertEquals(2, ss.getColumnStatistics()[0].getNumberOfValues());
- assertEquals(1, ((BooleanColumnStatistics) ss.getColumnStatistics()[1]).getTrueCount());
- assertEquals(1024, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMinimum());
- assertEquals(2048, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMaximum());
- assertEquals(3072, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getSum());
- assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum());
- assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum());
- assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
- assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0",
- stats[7].toString());
-
- assertEquals("count: 2 hasNull: false min: bye max: hi sum: 5", stats[9].toString());
-
- // check the schema
- TypeDescription readerSchema = reader.getSchema();
- assertEquals(TypeDescription.Category.STRUCT, readerSchema.getCategory());
- assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint,"
- + "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
- + "binary,string1:string,middle:struct<list:array<struct<int1:int,"
- + "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
- + "map:map<string,struct<int1:int,string1:string>>>",
- readerSchema.toString());
- List<String> fieldNames = readerSchema.getFieldNames();
- List<TypeDescription> fieldTypes = readerSchema.getChildren();
- assertEquals("boolean1", fieldNames.get(0));
- assertEquals(TypeDescription.Category.BOOLEAN, fieldTypes.get(0).getCategory());
- assertEquals("byte1", fieldNames.get(1));
- assertEquals(TypeDescription.Category.BYTE, fieldTypes.get(1).getCategory());
- assertEquals("short1", fieldNames.get(2));
- assertEquals(TypeDescription.Category.SHORT, fieldTypes.get(2).getCategory());
- assertEquals("int1", fieldNames.get(3));
- assertEquals(TypeDescription.Category.INT, fieldTypes.get(3).getCategory());
- assertEquals("long1", fieldNames.get(4));
- assertEquals(TypeDescription.Category.LONG, fieldTypes.get(4).getCategory());
- assertEquals("float1", fieldNames.get(5));
- assertEquals(TypeDescription.Category.FLOAT, fieldTypes.get(5).getCategory());
- assertEquals("double1", fieldNames.get(6));
- assertEquals(TypeDescription.Category.DOUBLE, fieldTypes.get(6).getCategory());
- assertEquals("bytes1", fieldNames.get(7));
- assertEquals(TypeDescription.Category.BINARY, fieldTypes.get(7).getCategory());
- assertEquals("string1", fieldNames.get(8));
- assertEquals(TypeDescription.Category.STRING, fieldTypes.get(8).getCategory());
- assertEquals("middle", fieldNames.get(9));
- TypeDescription middle = fieldTypes.get(9);
- assertEquals(TypeDescription.Category.STRUCT, middle.getCategory());
- TypeDescription midList = middle.getChildren().get(0);
- assertEquals(TypeDescription.Category.LIST, midList.getCategory());
- TypeDescription inner = midList.getChildren().get(0);
- assertEquals(TypeDescription.Category.STRUCT, inner.getCategory());
- assertEquals("int1", inner.getFieldNames().get(0));
- assertEquals("string1", inner.getFieldNames().get(1));
-
- RecordReader rows = reader.rows();
- // create a new batch
- batch = readerSchema.createRowBatch();
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(2, batch.size);
- assertEquals(false, rows.hasNext());
-
- // check the contents of the first row
- assertEquals(false, getBoolean(batch, 0));
- assertEquals(1, getByte(batch, 0));
- assertEquals(1024, getShort(batch, 0));
- assertEquals(65536, getInt(batch, 0));
- assertEquals(Long.MAX_VALUE, getLong(batch, 0));
- assertEquals(1.0, getFloat(batch, 0), 0.00001);
- assertEquals(-15.0, getDouble(batch, 0), 0.00001);
- assertEquals(bytes(0,1,2,3,4), getBinary(batch, 0));
- assertEquals("hi", getText(batch, 0).toString());
- List<InnerStruct> midRow = getMidList(batch, 0);
- assertNotNull(midRow);
- assertEquals(2, midRow.size());
- assertEquals(1, midRow.get(0).int1);
- assertEquals("bye", midRow.get(0).string1.toString());
- assertEquals(2, midRow.get(1).int1);
- assertEquals("sigh", midRow.get(1).string1.toString());
- List<InnerStruct> list = getList(batch, 0);
- assertEquals(2, list.size());
- assertEquals(3, list.get(0).int1);
- assertEquals("good", list.get(0).string1.toString());
- assertEquals(4, list.get(1).int1);
- assertEquals("bad", list.get(1).string1.toString());
- Map<Text, InnerStruct> map = getMap(batch, 0);
- assertEquals(0, map.size());
-
- // check the contents of the second row
- assertEquals(true, getBoolean(batch, 1));
- assertEquals(100, getByte(batch, 1));
- assertEquals(2048, getShort(batch, 1));
- assertEquals(65536, getInt(batch, 1));
- assertEquals(Long.MAX_VALUE, getLong(batch, 1));
- assertEquals(2.0, getFloat(batch, 1), 0.00001);
- assertEquals(-5.0, getDouble(batch, 1), 0.00001);
- assertEquals(bytes(), getBinary(batch, 1));
- assertEquals("bye", getText(batch, 1).toString());
- midRow = getMidList(batch, 1);
- assertNotNull(midRow);
- assertEquals(2, midRow.size());
- assertEquals(1, midRow.get(0).int1);
- assertEquals("bye", midRow.get(0).string1.toString());
- assertEquals(2, midRow.get(1).int1);
- assertEquals("sigh", midRow.get(1).string1.toString());
- list = getList(batch, 1);
- assertEquals(3, list.size());
- assertEquals(100000000, list.get(0).int1);
- assertEquals("cat", list.get(0).string1.toString());
- assertEquals(-100000, list.get(1).int1);
- assertEquals("in", list.get(1).string1.toString());
- assertEquals(1234, list.get(2).int1);
- assertEquals("hat", list.get(2).string1.toString());
- map = getMap(batch, 1);
- assertEquals(2, map.size());
- InnerStruct value = map.get(new Text("chani"));
- assertEquals(5, value.int1);
- assertEquals("chani", value.string1.toString());
- value = map.get(new Text("mauddib"));
- assertEquals(1, value.int1);
- assertEquals("mauddib", value.string1.toString());
-
- // close the reader
- assertEquals(false, rows.nextBatch(batch));
- rows.close();
- }
-
- @Test
- public void testColumnProjection() throws Exception {
- TypeDescription schema = createInnerSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.NONE)
- .bufferSize(100)
- .rowIndexStride(1000));
- VectorizedRowBatch batch = schema.createRowBatch();
- Random r1 = new Random(1);
- Random r2 = new Random(2);
- int x;
- int minInt=0, maxInt=0;
- String y;
- String minStr = null, maxStr = null;
- batch.size = 1000;
- boolean first = true;
- for(int b=0; b < 21; ++b) {
- for(int r=0; r < 1000; ++r) {
- x = r1.nextInt();
- y = Long.toHexString(r2.nextLong());
- if (first || x < minInt) {
- minInt = x;
- }
- if (first || x > maxInt) {
- maxInt = x;
- }
- if (first || y.compareTo(minStr) < 0) {
- minStr = y;
- }
- if (first || y.compareTo(maxStr) > 0) {
- maxStr = y;
- }
- first = false;
- ((LongColumnVector) batch.cols[0]).vector[r] = x;
- ((BytesColumnVector) batch.cols[1]).setVal(r, y.getBytes());
- }
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- // check out the statistics
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(3, stats.length);
- for(ColumnStatistics s: stats) {
- assertEquals(21000, s.getNumberOfValues());
- if (s instanceof IntegerColumnStatistics) {
- assertEquals(minInt, ((IntegerColumnStatistics) s).getMinimum());
- assertEquals(maxInt, ((IntegerColumnStatistics) s).getMaximum());
- } else if (s instanceof StringColumnStatistics) {
- assertEquals(maxStr, ((StringColumnStatistics) s).getMaximum());
- assertEquals(minStr, ((StringColumnStatistics) s).getMinimum());
- }
- }
-
- // check out the types
- TypeDescription type = reader.getSchema();
- assertEquals(TypeDescription.Category.STRUCT, type.getCategory());
- assertEquals(2, type.getChildren().size());
- TypeDescription type1 = type.getChildren().get(0);
- TypeDescription type2 = type.getChildren().get(1);
- assertEquals(TypeDescription.Category.INT, type1.getCategory());
- assertEquals(TypeDescription.Category.STRING, type2.getCategory());
- assertEquals("struct<int1:int,string1:string>", type.toString());
-
- // read the contents and make sure they match
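- // (include arrays are indexed by column id: 0 = the root struct,
- // 1 = int1, 2 = string1)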
- RecordReader rows1 = reader.rows(new boolean[]{true, true, false});
- RecordReader rows2 = reader.rows(new boolean[]{true, false, true});
- r1 = new Random(1);
- r2 = new Random(2);
- VectorizedRowBatch batch1 = reader.getSchema().createRowBatch(1000);
- VectorizedRowBatch batch2 = reader.getSchema().createRowBatch(1000);
- for(int i = 0; i < 21000; i += 1000) {
- assertEquals(true, rows1.nextBatch(batch1));
- assertEquals(true, rows2.nextBatch(batch2));
- assertEquals(1000, batch1.size);
- assertEquals(1000, batch2.size);
- for(int j=0; j < 1000; ++j) {
- assertEquals(r1.nextInt(),
- ((LongColumnVector) batch1.cols[0]).vector[j]);
- assertEquals(Long.toHexString(r2.nextLong()),
- ((BytesColumnVector) batch2.cols[1]).toString(j));
- }
- }
- assertEquals(false, rows1.nextBatch(batch1));
- assertEquals(false, rows2.nextBatch(batch2));
- rows1.close();
- rows2.close();
- }
-
- @Test
- public void testEmptyFile() throws Exception {
- TypeDescription schema = createBigRowSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.NONE)
- .bufferSize(100));
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(false, reader.rows().hasNext());
- assertEquals(CompressionKind.NONE, reader.getCompressionKind());
- assertEquals(0, reader.getNumberOfRows());
- assertEquals(0, reader.getCompressionSize());
- assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
- assertEquals(3, reader.getContentLength());
- assertEquals(false, reader.getStripes().iterator().hasNext());
- }
-
- @Test
- public void metaData() throws Exception {
- TypeDescription schema = createBigRowSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.NONE)
- .bufferSize(100));
- writer.addUserMetadata("my.meta", byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127,
- -128));
- writer.addUserMetadata("clobber", byteBuf(1, 2, 3));
- writer.addUserMetadata("clobber", byteBuf(4, 3, 2, 1));
- ByteBuffer bigBuf = ByteBuffer.allocate(40000);
- Random random = new Random(0);
- random.nextBytes(bigBuf.array());
- writer.addUserMetadata("big", bigBuf);
- bigBuf.position(0);
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 1;
- setBigRow(batch, 0, true, (byte) 127, (short) 1024, 42,
- 42L * 1024 * 1024 * 1024, (float) 3.1415, -2.713, null,
- null, null, null, null);
- writer.addRowBatch(batch);
- writer.addUserMetadata("clobber", byteBuf(5,7,11,13,17,19));
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(byteBuf(5,7,11,13,17,19), reader.getMetadataValue("clobber"));
- assertEquals(byteBuf(1,2,3,4,5,6,7,-1,-2,127,-128),
- reader.getMetadataValue("my.meta"));
- assertEquals(bigBuf, reader.getMetadataValue("big"));
- try {
- reader.getMetadataValue("unknown");
- assertTrue(false);
- } catch (IllegalArgumentException iae) {
- // PASS
- }
- int i = 0;
- for(String key: reader.getMetadataKeys()) {
- if ("my.meta".equals(key) ||
- "clobber".equals(key) ||
- "big".equals(key)) {
- i += 1;
- } else {
- throw new IllegalArgumentException("unknown key " + key);
- }
- }
- assertEquals(3, i);
- int numStripes = reader.getStripeStatistics().size();
- assertEquals(1, numStripes);
- }
-
- /**
- * Generate an ORC file with a range of dates and times.
- */
- public void createOrcDateFile(Path file, int minYear, int maxYear
- ) throws IOException {
- TypeDescription schema = TypeDescription.createStruct()
- .addField("time", TypeDescription.createTimestamp())
- .addField("date", TypeDescription.createDate());
- Writer writer = OrcFile.createWriter(file,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .blockPadding(false));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 1000;
- for (int year = minYear; year < maxYear; ++year) {
- for (int ms = 1000; ms < 2000; ++ms) {
- TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
- timestampColVector.set(ms - 1000,
- Timestamp.valueOf(year +
- "-05-05 12:34:56." + ms));
- ((LongColumnVector) batch.cols[1]).vector[ms - 1000] =
- new DateWritable(new Date(year - 1900, 11, 25)).getDays();
- }
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(file,
- OrcFile.readerOptions(conf));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch(1000);
- TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
- LongColumnVector dates = (LongColumnVector) batch.cols[1];
- for (int year = minYear; year < maxYear; ++year) {
- rows.nextBatch(batch);
- assertEquals(1000, batch.size);
- for(int ms = 1000; ms < 2000; ++ms) {
- StringBuilder buffer = new StringBuilder();
- times.stringifyValue(buffer, ms - 1000);
- String expected = Integer.toString(year) + "-05-05 12:34:56.";
- // suppress trailing zeros by dividing by the largest power of 10
- // that divides ms evenly.
- int roundedMs = ms;
- for(int round = 1000; round > 0; round /= 10) {
- if (ms % round == 0) {
- roundedMs = ms / round;
- break;
- }
- }
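- // e.g. ms=1000 -> ".1", ms=1500 -> ".15", ms=1234 -> ".1234"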
- expected += roundedMs;
- assertEquals(expected, buffer.toString());
- assertEquals(Integer.toString(year) + "-12-25",
- new DateWritable((int) dates.vector[ms - 1000]).toString());
- }
- }
- rows.nextBatch(batch);
- assertEquals(0, batch.size);
- }
-
- @Test
- public void testDate1900() throws Exception {
- createOrcDateFile(testFilePath, 1900, 1970);
- }
-
- @Test
- public void testDate2038() throws Exception {
- createOrcDateFile(testFilePath, 2038, 2250);
- }
-
- private static void setUnion(VectorizedRowBatch batch, int rowId,
- Timestamp ts, Integer tag, Integer i, String s,
- HiveDecimalWritable dec) {
- UnionColumnVector union = (UnionColumnVector) batch.cols[1];
- if (ts != null) {
- TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
- timestampColVector.set(rowId, ts);
- } else {
- batch.cols[0].isNull[rowId] = true;
- batch.cols[0].noNulls = false;
- }
- if (tag != null) {
- union.tags[rowId] = tag;
- if (tag == 0) {
- if (i != null) {
- ((LongColumnVector) union.fields[tag]).vector[rowId] = i;
- } else {
- union.fields[tag].isNull[rowId] = true;
- union.fields[tag].noNulls = false;
- }
- } else if (tag == 1) {
- if (s != null) {
- ((BytesColumnVector) union.fields[tag]).setVal(rowId, s.getBytes());
- } else {
- union.fields[tag].isNull[rowId] = true;
- union.fields[tag].noNulls = false;
- }
- } else {
- throw new IllegalArgumentException("Bad tag " + tag);
- }
- } else {
- batch.cols[1].isNull[rowId] = true;
- batch.cols[1].noNulls = false;
- }
- if (dec != null) {
- ((DecimalColumnVector) batch.cols[2]).vector[rowId] = dec;
- } else {
- batch.cols[2].isNull[rowId] = true;
- batch.cols[2].noNulls = false;
- }
- }
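-
- // In a UnionColumnVector, tags[row] selects which child of fields[] carries
- // the value for that row; only the tagged child needs to be populated.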
-
- /**
- * We test union, timestamp, and decimal separately since we need to build the
- * object inspector manually. (The Hive reflection-based ObjectInspector
- * factory doesn't handle these types properly.)
- */
- @Test
- public void testUnionAndTimestamp() throws Exception {
- TypeDescription schema = TypeDescription.createStruct()
- .addField("time", TypeDescription.createTimestamp())
- .addField("union", TypeDescription.createUnion()
- .addUnionChild(TypeDescription.createInt())
- .addUnionChild(TypeDescription.createString()))
- .addField("decimal", TypeDescription.createDecimal()
- .withPrecision(38)
- .withScale(18));
- HiveDecimal maxValue = HiveDecimal.create("10000000000000000000");
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.NONE)
- .bufferSize(100)
- .blockPadding(false));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 6;
- setUnion(batch, 0, Timestamp.valueOf("2000-03-12 15:00:00"), 0, 42, null,
- new HiveDecimalWritable("12345678.6547456"));
- setUnion(batch, 1, Timestamp.valueOf("2000-03-20 12:00:00.123456789"),
- 1, null, "hello", new HiveDecimalWritable("-5643.234"));
-
- setUnion(batch, 2, null, null, null, null, null);
- setUnion(batch, 3, null, 0, null, null, null);
- setUnion(batch, 4, null, 1, null, null, null);
-
- setUnion(batch, 5, Timestamp.valueOf("1970-01-01 00:00:00"), 0, 200000,
- null, new HiveDecimalWritable("10000000000000000000"));
- writer.addRowBatch(batch);
-
- batch.reset();
- Random rand = new Random(42);
- for(int i=1970; i < 2038; ++i) {
- Timestamp ts = Timestamp.valueOf(i + "-05-05 12:34:56." + i);
- HiveDecimal dec =
- HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18));
- if ((i & 1) == 0) {
- setUnion(batch, batch.size++, ts, 0, i*i, null,
- new HiveDecimalWritable(dec));
- } else {
- setUnion(batch, batch.size++, ts, 1, null, Integer.toString(i*i),
- new HiveDecimalWritable(dec));
- }
- if (maxValue.compareTo(dec) < 0) {
- maxValue = dec;
- }
- }
- writer.addRowBatch(batch);
- batch.reset();
-
- // let's add a lot of constant rows to test the RLE
- batch.size = 1000;
- for(int c=0; c < batch.cols.length; ++c) {
- batch.cols[c].setRepeating(true);
- }
- ((UnionColumnVector) batch.cols[1]).fields[0].isRepeating = true;
- setUnion(batch, 0, null, 0, 1732050807, null, null);
- for(int i=0; i < 5; ++i) {
- writer.addRowBatch(batch);
- }
-
- batch.reset();
- batch.size = 3;
- setUnion(batch, 0, null, 0, 0, null, null);
- setUnion(batch, 1, null, 0, 10, null, null);
- setUnion(batch, 2, null, 0, 138, null, null);
- writer.addRowBatch(batch);
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- schema = writer.getSchema();
- assertEquals(5, schema.getMaximumId());
- boolean[] expected = new boolean[] {false, false, false, false, false, false};
- boolean[] included = OrcUtils.includeColumns("", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, true, false, false, false, true};
- included = OrcUtils.includeColumns("time,decimal", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, false, true, true, true, false};
- included = OrcUtils.includeColumns("union", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
- assertEquals(5077, reader.getNumberOfRows());
- DecimalColumnStatistics stats =
- (DecimalColumnStatistics) reader.getStatistics()[5];
- assertEquals(71, stats.getNumberOfValues());
- assertEquals(HiveDecimal.create("-5643.234"), stats.getMinimum());
- assertEquals(maxValue, stats.getMaximum());
- // TODO: fix this
-// assertEquals(null,stats.getSum());
- int stripeCount = 0;
- int rowCount = 0;
- long currentOffset = -1;
- for(StripeInformation stripe: reader.getStripes()) {
- stripeCount += 1;
- rowCount += stripe.getNumberOfRows();
- if (currentOffset < 0) {
- currentOffset = stripe.getOffset() + stripe.getLength();
- } else {
- assertEquals(currentOffset, stripe.getOffset());
- currentOffset += stripe.getLength();
- }
- }
- assertEquals(reader.getNumberOfRows(), rowCount);
- assertEquals(2, stripeCount);
- assertEquals(reader.getContentLength(), currentOffset);
- RecordReader rows = reader.rows();
- assertEquals(0, rows.getRowNumber());
- assertEquals(0.0, rows.getProgress(), 0.000001);
-
- schema = reader.getSchema();
- batch = schema.createRowBatch(74);
- assertEquals(0, rows.getRowNumber());
- rows.nextBatch(batch);
- assertEquals(74, batch.size);
- assertEquals(74, rows.getRowNumber());
- TimestampColumnVector ts = (TimestampColumnVector) batch.cols[0];
- UnionColumnVector union = (UnionColumnVector) batch.cols[1];
- LongColumnVector longs = (LongColumnVector) union.fields[0];
- BytesColumnVector strs = (BytesColumnVector) union.fields[1];
- DecimalColumnVector decs = (DecimalColumnVector) batch.cols[2];
-
- assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>",
- schema.toString());
- assertEquals("2000-03-12 15:00:00.0", ts.asScratchTimestamp(0).toString());
- assertEquals(0, union.tags[0]);
- assertEquals(42, longs.vector[0]);
- assertEquals("12345678.6547456", decs.vector[0].toString());
-
- assertEquals("2000-03-20 12:00:00.123456789", ts.asScratchTimestamp(1).toString());
- assertEquals(1, union.tags[1]);
- assertEquals("hello", strs.toString(1));
- assertEquals("-5643.234", decs.vector[1].toString());
-
- assertEquals(false, ts.noNulls);
- assertEquals(false, union.noNulls);
- assertEquals(false, decs.noNulls);
- assertEquals(true, ts.isNull[2]);
- assertEquals(true, union.isNull[2]);
- assertEquals(true, decs.isNull[2]);
-
- assertEquals(true, ts.isNull[3]);
- assertEquals(false, union.isNull[3]);
- assertEquals(0, union.tags[3]);
- assertEquals(true, longs.isNull[3]);
- assertEquals(true, decs.isNull[3]);
-
- assertEquals(true, ts.isNull[4]);
- assertEquals(false, union.isNull[4]);
- assertEquals(1, union.tags[4]);
- assertEquals(true, strs.isNull[4]);
- assertEquals(true, decs.isNull[4]);
-
- assertEquals(false, ts.isNull[5]);
- assertEquals("1970-01-01 00:00:00.0", ts.asScratchTimestamp(5).toString());
- assertEquals(false, union.isNull[5]);
- assertEquals(0, union.tags[5]);
- assertEquals(false, longs.isNull[5]);
- assertEquals(200000, longs.vector[5]);
- assertEquals(false, decs.isNull[5]);
- assertEquals("10000000000000000000", decs.vector[5].toString());
-
- rand = new Random(42);
- for(int i=1970; i < 2038; ++i) {
- int row = 6 + i - 1970;
- assertEquals(Timestamp.valueOf(i + "-05-05 12:34:56." + i),
- ts.asScratchTimestamp(row));
- if ((i & 1) == 0) {
- assertEquals(0, union.tags[row]);
- assertEquals(i*i, longs.vector[row]);
- } else {
- assertEquals(1, union.tags[row]);
- assertEquals(Integer.toString(i * i), strs.toString(row));
- }
- assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand),
- rand.nextInt(18))), decs.vector[row]);
- }
-
- // rebuild the row batch so that we can read 1000 rows at a time
- batch = schema.createRowBatch(1000);
- ts = (TimestampColumnVector) batch.cols[0];
- union = (UnionColumnVector) batch.cols[1];
- longs = (LongColumnVector) union.fields[0];
- strs = (BytesColumnVector) union.fields[1];
- decs = (DecimalColumnVector) batch.cols[2];
-
- for(int i=0; i < 5; ++i) {
- rows.nextBatch(batch);
- assertEquals("batch " + i, 1000, batch.size);
- assertEquals("batch " + i, false, union.isRepeating);
- assertEquals("batch " + i, true, union.noNulls);
- for(int r=0; r < batch.size; ++r) {
- assertEquals("bad tag at " + i + "." +r, 0, union.tags[r]);
- }
- assertEquals("batch " + i, true, longs.isRepeating);
- assertEquals("batch " + i, 1732050807, longs.vector[0]);
- }
-
- rows.nextBatch(batch);
- assertEquals(3, batch.size);
- assertEquals(0, union.tags[0]);
- assertEquals(0, longs.vector[0]);
- assertEquals(0, union.tags[1]);
- assertEquals(10, longs.vector[1]);
- assertEquals(0, union.tags[2]);
- assertEquals(138, longs.vector[2]);
-
- rows.nextBatch(batch);
- assertEquals(0, batch.size);
- assertEquals(1.0, rows.getProgress(), 0.00001);
- assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
- rows.seekToRow(1);
- rows.nextBatch(batch);
- assertEquals(1000, batch.size);
- assertEquals(Timestamp.valueOf("2000-03-20 12:00:00.123456789"), ts.asScratchTimestamp(0));
- assertEquals(1, union.tags[0]);
- assertEquals("hello", strs.toString(0));
- assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), decs.vector[0]);
- rows.close();
- }
-
- /**
- * Read and write a randomly generated snappy file.
- * @throws Exception
- */
- @Test
- public void testSnappy() throws Exception {
- TypeDescription schema = createInnerSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.SNAPPY)
- .bufferSize(100));
- VectorizedRowBatch batch = schema.createRowBatch();
- Random rand = new Random(12);
- batch.size = 1000;
- for(int b=0; b < 10; ++b) {
- for (int r=0; r < 1000; ++r) {
- ((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
- ((BytesColumnVector) batch.cols[1]).setVal(r,
- Integer.toHexString(rand.nextInt()).getBytes());
- }
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind());
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch(1000);
- rand = new Random(12);
- LongColumnVector longs = (LongColumnVector) batch.cols[0];
- BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
- for(int b=0; b < 10; ++b) {
- rows.nextBatch(batch);
- assertEquals(1000, batch.size);
- for(int r=0; r < batch.size; ++r) {
- assertEquals(rand.nextInt(), longs.vector[r]);
- assertEquals(Integer.toHexString(rand.nextInt()), strs.toString(r));
- }
- }
- rows.nextBatch(batch);
- assertEquals(0, batch.size);
- rows.close();
- }
-
- /**
- * Read and write a randomly generated snappy file with no row index
- * (rowIndexStride = 0).
- * @throws Exception
- */
- @Test
- public void testWithoutIndex() throws Exception {
- TypeDescription schema = createInnerSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(5000)
- .compress(CompressionKind.SNAPPY)
- .bufferSize(1000)
- .rowIndexStride(0));
- VectorizedRowBatch batch = schema.createRowBatch();
- Random rand = new Random(24);
- batch.size = 5;
- for(int c=0; c < batch.cols.length; ++c) {
- batch.cols[c].setRepeating(true);
- }
- for(int i=0; i < 10000; ++i) {
- ((LongColumnVector) batch.cols[0]).vector[0] = rand.nextInt();
- ((BytesColumnVector) batch.cols[1])
- .setVal(0, Integer.toBinaryString(rand.nextInt()).getBytes());
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(50000, reader.getNumberOfRows());
- assertEquals(0, reader.getRowIndexStride());
- StripeInformation stripe = reader.getStripes().iterator().next();
- assertEquals(true, stripe.getDataLength() != 0);
- assertEquals(0, stripe.getIndexLength());
- RecordReader rows = reader.rows();
- rand = new Random(24);
- batch = reader.getSchema().createRowBatch(1000);
- LongColumnVector longs = (LongColumnVector) batch.cols[0];
- BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
- for(int i=0; i < 50; ++i) {
- rows.nextBatch(batch);
- assertEquals("batch " + i, 1000, batch.size);
- for(int j=0; j < 200; ++j) {
- int intVal = rand.nextInt();
- String strVal = Integer.toBinaryString(rand.nextInt());
- for (int k = 0; k < 5; ++k) {
- assertEquals(intVal, longs.vector[j * 5 + k]);
- assertEquals(strVal, strs.toString(j * 5 + k));
- }
- }
- }
- rows.nextBatch(batch);
- assertEquals(0, batch.size);
- rows.close();
- }
-
- @Test
- public void testSeek() throws Exception {
- TypeDescription schema = createBigRowSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(200000)
- .bufferSize(65536)
- .rowIndexStride(1000));
- VectorizedRowBatch batch = schema.createRowBatch();
- Random rand = new Random(42);
- final int COUNT=32768;
- long[] intValues= new long[COUNT];
- double[] doubleValues = new double[COUNT];
- String[] stringValues = new String[COUNT];
- BytesWritable[] byteValues = new BytesWritable[COUNT];
- String[] words = new String[128];
- for(int i=0; i < words.length; ++i) {
- words[i] = Integer.toHexString(rand.nextInt());
- }
- for(int i=0; i < COUNT/2; ++i) {
- intValues[2*i] = rand.nextLong();
- intValues[2*i+1] = intValues[2*i];
- stringValues[2*i] = words[rand.nextInt(words.length)];
- stringValues[2*i+1] = stringValues[2*i];
- }
- for(int i=0; i < COUNT; ++i) {
- doubleValues[i] = rand.nextDouble();
- byte[] buf = new byte[20];
- rand.nextBytes(buf);
- byteValues[i] = new BytesWritable(buf);
- }
- for(int i=0; i < COUNT; ++i) {
- appendRandomRow(batch, intValues, doubleValues, stringValues,
- byteValues, words, i);
- if (batch.size == 1024) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- }
- if (batch.size != 0) {
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(COUNT, reader.getNumberOfRows());
- RecordReader rows = reader.rows();
- // get the row index
- DataReader meta = RecordReaderUtils.createDefaultDataReader(
- DataReaderProperties.builder()
- .withBufferSize(reader.getCompressionSize())
- .withFileSystem(fs)
- .withPath(testFilePath)
- .withCompression(reader.getCompressionKind())
- .withTypeCount(reader.getSchema().getMaximumId() + 1)
- .withZeroCopy(false)
- .build());
- OrcIndex index =
- meta.readRowIndex(reader.getStripes().get(0), null, null, null, null,
- null);
- // check the primitive columns to make sure they have the right number of
- // items in the first row group
- for(int c=1; c < 9; ++c) {
- OrcProto.RowIndex colIndex = index.getRowGroupIndex()[c];
- assertEquals(1000,
- colIndex.getEntry(0).getStatistics().getNumberOfValues());
- }
- batch = reader.getSchema().createRowBatch();
- int nextRowInBatch = -1;
- for(int i=COUNT-1; i >= 0; --i, --nextRowInBatch) {
- // if we have consumed the previous batch, read a new one
- if (nextRowInBatch < 0) {
- long base = Math.max(i - 1023, 0);
- rows.seekToRow(base);
- assertEquals("row " + i, true, rows.nextBatch(batch));
- nextRowInBatch = batch.size - 1;
- }
- checkRandomRow(batch, intValues, doubleValues,
- stringValues, byteValues, words, i, nextRowInBatch);
- }
- rows.close();
- Iterator<StripeInformation> stripeIterator =
- reader.getStripes().iterator();
- long offsetOfStripe2 = 0;
- long offsetOfStripe4 = 0;
- long lastRowOfStripe2 = 0;
- for(int i = 0; i < 5; ++i) {
- StripeInformation stripe = stripeIterator.next();
- if (i < 2) {
- lastRowOfStripe2 += stripe.getNumberOfRows();
- } else if (i == 2) {
- offsetOfStripe2 = stripe.getOffset();
- lastRowOfStripe2 += stripe.getNumberOfRows() - 1;
- } else if (i == 4) {
- offsetOfStripe4 = stripe.getOffset();
- }
- }
- boolean[] columns = new boolean[reader.getStatistics().length];
- columns[5] = true; // long column
- columns[9] = true; // text column
- rows = reader.rowsOptions(new Reader.Options()
- .range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2)
- .include(columns));
- rows.seekToRow(lastRowOfStripe2);
- // we only want two rows
- batch = reader.getSchema().createRowBatch(2);
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(1, batch.size);
- assertEquals(intValues[(int) lastRowOfStripe2], getLong(batch, 0));
- assertEquals(stringValues[(int) lastRowOfStripe2],
- getText(batch, 0).toString());
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(intValues[(int) lastRowOfStripe2 + 1], getLong(batch, 0));
- assertEquals(stringValues[(int) lastRowOfStripe2 + 1],
- getText(batch, 0).toString());
- rows.close();
- }
-
- private void appendRandomRow(VectorizedRowBatch batch,
- long[] intValues, double[] doubleValues,
- String[] stringValues,
- BytesWritable[] byteValues,
- String[] words, int i) {
- InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
- InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
- words[i % words.length] + "-x");
- setBigRow(batch, batch.size++, (intValues[i] & 1) == 0, (byte) intValues[i],
- (short) intValues[i], (int) intValues[i], intValues[i],
- (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
- new MiddleStruct(inner, inner2), list(), map(inner, inner2));
- }
-
- private void checkRandomRow(VectorizedRowBatch batch,
- long[] intValues, double[] doubleValues,
- String[] stringValues,
- BytesWritable[] byteValues,
- String[] words, int i, int rowInBatch) {
- InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
- InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
- words[i % words.length] + "-x");
- checkBigRow(batch, rowInBatch, i, (intValues[i] & 1) == 0, (byte) intValues[i],
- (short) intValues[i], (int) intValues[i], intValues[i],
- (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
- new MiddleStruct(inner, inner2), list(), map(inner, inner2));
- }
-
- private static class MyMemoryManager extends MemoryManager {
- final long totalSpace;
- double rate;
- Path path = null;
- long lastAllocation = 0;
- int rows = 0;
- Callback callback;
-
- MyMemoryManager(Configuration conf, long totalSpace, double rate) {
- super(conf);
- this.totalSpace = totalSpace;
- this.rate = rate;
- }
-
- @Override
- public void addWriter(Path path, long requestedAllocation,
- Callback callback) {
- this.path = path;
- this.lastAllocation = requestedAllocation;
- this.callback = callback;
- }
-
- @Override
- public synchronized void removeWriter(Path path) {
- this.path = null;
- this.lastAllocation = 0;
- }
-
- @Override
- public long getTotalMemoryPool() {
- return totalSpace;
- }
-
- @Override
- public double getAllocationScale() {
- return rate;
- }
-
- @Override
- public void addedRow(int count) throws IOException {
- rows += count;
- if (rows % 100 == 0) {
- callback.checkMemory(rate);
- }
- }
- }
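-
- // (the stub above checks memory every 100 rows against a tiny 10,000-byte
- // pool, so the writer flushes stripes long before the configured 50,000-byte
- // stripe size; the two tests below rely on that behavior)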
-
- @Test
- public void testMemoryManagementV11() throws Exception {
- TypeDescription schema = createInnerSchema();
- MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .compress(CompressionKind.NONE)
- .stripeSize(50000)
- .bufferSize(100)
- .rowIndexStride(0)
- .memory(memory)
- .version(OrcFile.Version.V_0_11));
- assertEquals(testFilePath, memory.path);
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 1;
- for(int i=0; i < 2500; ++i) {
- ((LongColumnVector) batch.cols[0]).vector[0] = i * 300;
- ((BytesColumnVector) batch.cols[1]).setVal(0,
- Integer.toHexString(10*i).getBytes());
- writer.addRowBatch(batch);
- }
- writer.close();
- assertEquals(null, memory.path);
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- int i = 0;
- for(StripeInformation stripe: reader.getStripes()) {
- i += 1;
- assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
- stripe.getDataLength() < 5000);
- }
- assertEquals(25, i);
- assertEquals(2500, reader.getNumberOfRows());
- }
-
- @Test
- public void testMemoryManagementV12() throws Exception {
- TypeDescription schema = createInnerSchema();
- MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .compress(CompressionKind.NONE)
- .stripeSize(50000)
- .bufferSize(100)
- .rowIndexStride(0)
- .memory(memory)
- .version(OrcFile.Version.V_0_12));
- VectorizedRowBatch batch = schema.createRowBatch();
- assertEquals(testFilePath, memory.path);
- batch.size = 1;
- for(int i=0; i < 2500; ++i) {
- ((LongColumnVector) batch.cols[0]).vector[0] = i * 300;
- ((BytesColumnVector) batch.cols[1]).setVal(0,
- Integer.toHexString(10*i).getBytes());
- writer.addRowBatch(batch);
- }
- writer.close();
- assertEquals(null, memory.path);
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- int i = 0;
- for(StripeInformation stripe: reader.getStripes()) {
- i += 1;
- assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
- stripe.getDataLength() < 5000);
- }
- // with HIVE-7832, the dictionaries will be disabled after writing the first
- // stripe as there are too many distinct values. Hence only 3 stripes as
- // compared to 25 stripes in version 0.11 (above test case)
- assertEquals(3, i);
- assertEquals(2500, reader.getNumberOfRows());
- }
-
- @Test
- public void testPredicatePushdown() throws Exception {
- TypeDescription schema = createInnerSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(400000L)
- .compress(CompressionKind.NONE)
- .bufferSize(500)
- .rowIndexStride(1000));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.ensureSize(3500);
- batch.size = 3500;
- for(int i=0; i < 3500; ++i) {
- ((LongColumnVector) batch.cols[0]).vector[i] = i * 300;
- ((BytesColumnVector) batch.cols[1]).setVal(i,
- Integer.toHexString(10*i).getBytes());
- }
- writer.addRowBatch(batch);
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(3500, reader.getNumberOfRows());
-
- SearchArgument sarg = SearchArgumentFactory.newBuilder()
- .startAnd()
- .startNot()
- .lessThan("int1", PredicateLeaf.Type.LONG, 300000L)
- .end()
- .lessThan("int1", PredicateLeaf.Type.LONG, 600000L)
- .end()
- .build();
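- // int1 == 300 * i, so 300,000 <= int1 < 600,000 matches rows 1000..1999;
- // with a 1,000-row index stride only the second row group is read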
- RecordReader rows = reader.rowsOptions(new Reader.Options()
- .range(0L, Long.MAX_VALUE)
- .include(new boolean[]{true, true, true})
- .searchArgument(sarg, new String[]{null, "int1", "string1"}));
- batch = reader.getSchema().createRowBatch(2000);
- LongColumnVector ints = (LongColumnVector) batch.cols[0];
- BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
-
- assertEquals(1000L, rows.getRowNumber());
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(1000, batch.size);
-
- for(int i=1000; i < 2000; ++i) {
- assertEquals(300 * i, ints.vector[i - 1000]);
- assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000));
- }
- assertEquals(false, rows.nextBatch(batch));
- assertEquals(3500, rows.getRowNumber());
-
- // look through the file with no rows selected
- sarg = SearchArgumentFactory.newBuilder()
- .startAnd()
- .lessThan("int1", PredicateLeaf.Type.LONG, 0L)
- .end()
- .build();
- rows = reader.rowsOptions(new Reader.Options()
- .range(0L, Long.MAX_VALUE)
- .include(new boolean[]{true, true, true})
- .searchArgument(sarg, new String[]{null, "int1", "string1"}));
- assertEquals(3500L, rows.getRowNumber());
- assertTrue(!rows.hasNext());
-
- // select first 100 and last 100 rows
- sarg = SearchArgumentFactory.newBuilder()
- .startOr()
- .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 100)
- .startNot()
- .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 3400)
- .end()
- .end()
- .build();
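- // row groups are the skipping granularity, so this reads rows 0..999 and
- // 3000..3499 (1,500 rows) even though only 200 rows satisfy the predicate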
- rows = reader.rowsOptions(new Reader.Options()
- .range(0L, Long.MAX_VALUE)
- .include(new boolean[]{true, true, true})
- .searchArgument(sarg, new String[]{null, "int1", "string1"}));
- assertEquals(0, rows.getRowNumber());
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(1000, batch.size);
- assertEquals(3000, rows.getRowNumber());
- for(int i=0; i < 1000; ++i) {
- assertEquals(300 * i, ints.vector[i]);
- assertEquals(Integer.toHexString(10*i), strs.toString(i));
- }
-
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(500, batch.size);
- assertEquals(3500, rows.getRowNumber());
- for(int i=3000; i < 3500; ++i) {
- assertEquals(300 * i, ints.vector[i - 3000]);
- assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000));
- }
- assertEquals(false, rows.nextBatch(batch));
- assertEquals(3500, rows.getRowNumber());
- }
-
- /**
- * Test all of the types that have distinct ORC writers using the vectorized
- * writer with different combinations of repeating and null values.
-
<TRUNCATED>
[19/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestVectorOrcFile.java b/orc/src/test/org/apache/orc/TestVectorOrcFile.java
new file mode 100644
index 0000000..112edb9
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -0,0 +1,2782 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import com.google.common.collect.Lists;
+
+import org.junit.Assert;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.orc.impl.DataReaderProperties;
+import org.apache.orc.impl.MemoryManager;
+import org.apache.orc.impl.OrcIndex;
+import org.apache.orc.impl.RecordReaderImpl;
+import org.apache.orc.impl.RecordReaderUtils;
+import org.apache.orc.tools.TestJsonFileDump;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import java.io.File;
+import java.io.IOException;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests for the vectorized reader and writer for ORC files.
+ */
+public class TestVectorOrcFile {
+
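+ /** A simple int/string pair used as the element type for nested lists and maps. */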
+ public static class InnerStruct {
+ int int1;
+ Text string1 = new Text();
+ InnerStruct(int int1, Text string1) {
+ this.int1 = int1;
+ this.string1.set(string1);
+ }
+ InnerStruct(int int1, String string1) {
+ this.int1 = int1;
+ this.string1.set(string1);
+ }
+
+ public String toString() {
+ return "{" + int1 + ", " + string1 + "}";
+ }
+ }
+
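+ /** Holds the list of InnerStruct values behind the "middle" field of the big row. */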
+ public static class MiddleStruct {
+ List<InnerStruct> list = new ArrayList<InnerStruct>();
+
+ MiddleStruct(InnerStruct... items) {
+ list.clear();
+ list.addAll(Arrays.asList(items));
+ }
+ }
+
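+ // Factory helpers that keep the nested test data concise.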
+ private static InnerStruct inner(int i, String s) {
+ return new InnerStruct(i, s);
+ }
+
+ private static Map<String, InnerStruct> map(InnerStruct... items) {
+ Map<String, InnerStruct> result = new HashMap<String, InnerStruct>();
+ for(InnerStruct i: items) {
+ result.put(i.string1.toString(), i);
+ }
+ return result;
+ }
+
+ private static List<InnerStruct> list(InnerStruct... items) {
+ List<InnerStruct> result = new ArrayList<InnerStruct>();
+ result.addAll(Arrays.asList(items));
+ return result;
+ }
+
+ private static BytesWritable bytes(int... items) {
+ BytesWritable result = new BytesWritable();
+ result.setSize(items.length);
+ for(int i=0; i < items.length; ++i) {
+ result.getBytes()[i] = (byte) items[i];
+ }
+ return result;
+ }
+
+ private static byte[] bytesArray(int... items) {
+ byte[] result = new byte[items.length];
+ for(int i=0; i < items.length; ++i) {
+ result[i] = (byte) items[i];
+ }
+ return result;
+ }
+
+ private static ByteBuffer byteBuf(int... items) {
+ ByteBuffer result = ByteBuffer.allocate(items.length);
+ for(int item: items) {
+ result.put((byte) item);
+ }
+ result.flip();
+ return result;
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem () throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestVectorOrcFile." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testReadFormat_0_11() throws Exception {
+ Path oldFilePath =
+ new Path(TestJsonFileDump.getFileFromClasspath("orc-file-11-format.orc"));
+ Reader reader = OrcFile.createReader(oldFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ int stripeCount = 0;
+ int rowCount = 0;
+ long currentOffset = -1;
+ for(StripeInformation stripe : reader.getStripes()) {
+ stripeCount += 1;
+ rowCount += stripe.getNumberOfRows();
+ if (currentOffset < 0) {
+ currentOffset = stripe.getOffset() + stripe.getIndexLength()
+ + stripe.getDataLength() + stripe.getFooterLength();
+ } else {
+ assertEquals(currentOffset, stripe.getOffset());
+ currentOffset += stripe.getIndexLength() + stripe.getDataLength()
+ + stripe.getFooterLength();
+ }
+ }
+ Assert.assertEquals(reader.getNumberOfRows(), rowCount);
+ assertEquals(2, stripeCount);
+
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(7500, stats[1].getNumberOfValues());
+ assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
+ assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
+ assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
+
+ assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
+ assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
+ assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
+ assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000",
+ stats[3].toString());
+
+ assertEquals(Long.MAX_VALUE,
+ ((IntegerColumnStatistics) stats[5]).getMaximum());
+ assertEquals(Long.MAX_VALUE,
+ ((IntegerColumnStatistics) stats[5]).getMinimum());
+ assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
+ assertEquals(
+ "count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807",
+ stats[5].toString());
+
+ assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
+ assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
+ assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(),
+ 0.00001);
+ assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0",
+ stats[7].toString());
+
+ assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
+
+ // check the inspectors
+ TypeDescription schema = reader.getSchema();
+ assertEquals(TypeDescription.Category.STRUCT, schema.getCategory());
+ assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint,"
+ + "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
+ + "binary,string1:string,middle:struct<list:array<struct<int1:int,"
+ + "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
+ + "map:map<string,struct<int1:int,string1:string>>,ts:timestamp,"
+ + "decimal1:decimal(38,10)>", schema.toString());
+ VectorizedRowBatch batch = schema.createRowBatch();
+
+ RecordReader rows = reader.rows();
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1024, batch.size);
+
+ // check the contents of the first row
+ assertEquals(false, getBoolean(batch, 0));
+ assertEquals(1, getByte(batch, 0));
+ assertEquals(1024, getShort(batch, 0));
+ assertEquals(65536, getInt(batch, 0));
+ assertEquals(Long.MAX_VALUE, getLong(batch, 0));
+ assertEquals(1.0, getFloat(batch, 0), 0.00001);
+ assertEquals(-15.0, getDouble(batch, 0), 0.00001);
+ assertEquals(bytes(0, 1, 2, 3, 4), getBinary(batch, 0));
+ assertEquals("hi", getText(batch, 0).toString());
+
+ StructColumnVector middle = (StructColumnVector) batch.cols[9];
+ ListColumnVector midList = (ListColumnVector) middle.fields[0];
+ StructColumnVector midListStruct = (StructColumnVector) midList.child;
+ LongColumnVector midListInt = (LongColumnVector) midListStruct.fields[0];
+ BytesColumnVector midListStr = (BytesColumnVector) midListStruct.fields[1];
+ ListColumnVector list = (ListColumnVector) batch.cols[10];
+ StructColumnVector listStruct = (StructColumnVector) list.child;
+ LongColumnVector listInts = (LongColumnVector) listStruct.fields[0];
+ BytesColumnVector listStrs = (BytesColumnVector) listStruct.fields[1];
+ MapColumnVector map = (MapColumnVector) batch.cols[11];
+ BytesColumnVector mapKey = (BytesColumnVector) map.keys;
+ StructColumnVector mapValue = (StructColumnVector) map.values;
+ LongColumnVector mapValueInts = (LongColumnVector) mapValue.fields[0];
+ BytesColumnVector mapValueStrs = (BytesColumnVector) mapValue.fields[1];
+ TimestampColumnVector timestamp = (TimestampColumnVector) batch.cols[12];
+ DecimalColumnVector decs = (DecimalColumnVector) batch.cols[13];
+
+ assertEquals(false, middle.isNull[0]);
+ assertEquals(2, midList.lengths[0]);
+ int start = (int) midList.offsets[0];
+ assertEquals(1, midListInt.vector[start]);
+ assertEquals("bye", midListStr.toString(start));
+ assertEquals(2, midListInt.vector[start + 1]);
+ assertEquals("sigh", midListStr.toString(start + 1));
+
+ assertEquals(2, list.lengths[0]);
+ start = (int) list.offsets[0];
+ assertEquals(3, listInts.vector[start]);
+ assertEquals("good", listStrs.toString(start));
+ assertEquals(4, listInts.vector[start + 1]);
+ assertEquals("bad", listStrs.toString(start + 1));
+ assertEquals(0, map.lengths[0]);
+ assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
+ timestamp.asScratchTimestamp(0));
+ assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")),
+ decs.vector[0]);
+
+ // check the contents of row 7499
+ rows.seekToRow(7499);
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(true, getBoolean(batch, 0));
+ assertEquals(100, getByte(batch, 0));
+ assertEquals(2048, getShort(batch, 0));
+ assertEquals(65536, getInt(batch, 0));
+ assertEquals(Long.MAX_VALUE, getLong(batch, 0));
+ assertEquals(2.0, getFloat(batch, 0), 0.00001);
+ assertEquals(-5.0, getDouble(batch, 0), 0.00001);
+ assertEquals(bytes(), getBinary(batch, 0));
+ assertEquals("bye", getText(batch, 0).toString());
+ assertEquals(false, middle.isNull[0]);
+ assertEquals(2, midList.lengths[0]);
+ start = (int) midList.offsets[0];
+ assertEquals(1, midListInt.vector[start]);
+ assertEquals("bye", midListStr.toString(start));
+ assertEquals(2, midListInt.vector[start + 1]);
+ assertEquals("sigh", midListStr.toString(start + 1));
+ assertEquals(3, list.lengths[0]);
+ start = (int) list.offsets[0];
+ assertEquals(100000000, listInts.vector[start]);
+ assertEquals("cat", listStrs.toString(start));
+ assertEquals(-100000, listInts.vector[start + 1]);
+ assertEquals("in", listStrs.toString(start + 1));
+ assertEquals(1234, listInts.vector[start + 2]);
+ assertEquals("hat", listStrs.toString(start + 2));
+ assertEquals(2, map.lengths[0]);
+ start = (int) map.offsets[0];
+ assertEquals("chani", mapKey.toString(start));
+ assertEquals(5, mapValueInts.vector[start]);
+ assertEquals("chani", mapValueStrs.toString(start));
+ assertEquals("mauddib", mapKey.toString(start + 1));
+ assertEquals(1, mapValueInts.vector[start + 1]);
+ assertEquals("mauddib", mapValueStrs.toString(start + 1));
+ assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"),
+ timestamp.asScratchTimestamp(0));
+ assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547457")),
+ decs.vector[0]);
+
+ // handle the close up
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+
+ @Test
+ public void testTimestamp() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .bufferSize(10000).version(org.apache.orc.OrcFile.Version.V_0_11));
+ List<Timestamp> tslist = Lists.newArrayList();
+ tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.000999"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.000000222"));
+ tslist.add(Timestamp.valueOf("1999-01-01 00:00:00.999999999"));
+ tslist.add(Timestamp.valueOf("1995-01-01 00:00:00.688888888"));
+ tslist.add(Timestamp.valueOf("2002-01-01 00:00:00.1"));
+ tslist.add(Timestamp.valueOf("2010-03-02 00:00:00.000009001"));
+ tslist.add(Timestamp.valueOf("2005-01-01 00:00:00.000002229"));
+ tslist.add(Timestamp.valueOf("2006-01-01 00:00:00.900203003"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.800000007"));
+ tslist.add(Timestamp.valueOf("1996-08-02 00:00:00.723100809"));
+ tslist.add(Timestamp.valueOf("1998-11-02 00:00:00.857340643"));
+ tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
+
+ VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024);
+ TimestampColumnVector vec = new TimestampColumnVector(1024);
+ batch.cols[0] = vec;
+ batch.reset();
+ batch.size = tslist.size();
+ for (int i=0; i < tslist.size(); ++i) {
+ Timestamp ts = tslist.get(i);
+ vec.set(i, ts);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ TimestampColumnVector timestamps = (TimestampColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(tslist.get(idx++).getNanos(),
+ timestamps.asScratchTimestamp(r).getNanos());
+ }
+ }
+ Assert.assertEquals(tslist.size(), rows.getRowNumber());
+ assertEquals(0, writer.getSchema().getMaximumId());
+ boolean[] expected = new boolean[] {false};
+ boolean[] included = OrcUtils.includeColumns("", writer.getSchema());
+ assertEquals(true, Arrays.equals(expected, included));
+ }
+
+ @Test
+ public void testStringAndBinaryStatistics() throws Exception {
+
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("bytes1", TypeDescription.createBinary())
+ .addField("string1", TypeDescription.createString());
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 4;
+ BytesColumnVector field1 = (BytesColumnVector) batch.cols[0];
+ BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
+ field1.setVal(0, bytesArray(0, 1, 2, 3, 4));
+ field1.setVal(1, bytesArray(0, 1, 2, 3));
+ field1.setVal(2, bytesArray(0, 1, 2, 3, 4, 5));
+ field1.noNulls = false;
+ field1.isNull[3] = true;
+ field2.setVal(0, "foo".getBytes());
+ field2.setVal(1, "bar".getBytes());
+ field2.noNulls = false;
+ field2.isNull[2] = true;
+ field2.setVal(3, "hi".getBytes());
+ writer.addRowBatch(batch);
+ writer.close();
+ schema = writer.getSchema();
+ assertEquals(2, schema.getMaximumId());
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ boolean[] expected = new boolean[] {false, false, true};
+ boolean[] included = OrcUtils.includeColumns("string1", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, false, false};
+ included = OrcUtils.includeColumns("", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, false, false};
+ included = OrcUtils.includeColumns(null, schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(4, stats[0].getNumberOfValues());
+ assertEquals("count: 4 hasNull: false", stats[0].toString());
+
+ assertEquals(3, stats[1].getNumberOfValues());
+ assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 3 hasNull: true sum: 15", stats[1].toString());
+
+ assertEquals(3, stats[2].getNumberOfValues());
+ assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals(8, ((StringColumnStatistics) stats[2]).getSum());
+ assertEquals("count: 3 hasNull: true min: bar max: hi sum: 8",
+ stats[2].toString());
+
+ // check the inspectors
+ batch = reader.getSchema().createRowBatch();
+ BytesColumnVector bytes = (BytesColumnVector) batch.cols[0];
+ BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
+ RecordReader rows = reader.rows();
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(4, batch.size);
+
+ // check the contents of the first row
+ assertEquals(bytes(0,1,2,3,4), getBinary(bytes, 0));
+ assertEquals("foo", strs.toString(0));
+
+ // check the contents of second row
+ assertEquals(bytes(0,1,2,3), getBinary(bytes, 1));
+ assertEquals("bar", strs.toString(1));
+
+ // check the contents of third row
+ assertEquals(bytes(0,1,2,3,4,5), getBinary(bytes, 2));
+ assertNull(strs.toString(2));
+
+ // check the contents of fourth row
+ assertNull(getBinary(bytes, 3));
+ assertEquals("hi", strs.toString(3));
+
+ // handle the close up
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+
+ @Test
+ public void testStripeLevelStats() throws Exception {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("int1", TypeDescription.createInt())
+ .addField("string1", TypeDescription.createString());
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1000;
+ LongColumnVector field1 = (LongColumnVector) batch.cols[0];
+ BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
+ field1.isRepeating = true;
+ field2.isRepeating = true;
+ for (int b = 0; b < 11; b++) {
+ if (b >= 5) {
+ if (b >= 10) {
+ field1.vector[0] = 3;
+ field2.setVal(0, "three".getBytes());
+ } else {
+ field1.vector[0] = 2;
+ field2.setVal(0, "two".getBytes());
+ }
+ } else {
+ field1.vector[0] = 1;
+ field2.setVal(0, "one".getBytes());
+ }
+ writer.addRowBatch(batch);
+ }
+
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ schema = writer.getSchema();
+ assertEquals(2, schema.getMaximumId());
+ boolean[] expected = new boolean[] {false, true, false};
+ boolean[] included = OrcUtils.includeColumns("int1", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ List<StripeStatistics> stats = reader.getStripeStatistics();
+ int numStripes = stats.size();
+ assertEquals(3, numStripes);
+ StripeStatistics ss1 = stats.get(0);
+ StripeStatistics ss2 = stats.get(1);
+ StripeStatistics ss3 = stats.get(2);
+
+ assertEquals(5000, ss1.getColumnStatistics()[0].getNumberOfValues());
+ assertEquals(5000, ss2.getColumnStatistics()[0].getNumberOfValues());
+ assertEquals(1000, ss3.getColumnStatistics()[0].getNumberOfValues());
+
+ assertEquals(5000, (ss1.getColumnStatistics()[1]).getNumberOfValues());
+ assertEquals(5000, (ss2.getColumnStatistics()[1]).getNumberOfValues());
+ assertEquals(1000, (ss3.getColumnStatistics()[1]).getNumberOfValues());
+ assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMinimum());
+ assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMinimum());
+ assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMinimum());
+ assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMaximum());
+ assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMaximum());
+ assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMaximum());
+ assertEquals(5000, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getSum());
+ assertEquals(10000, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getSum());
+ assertEquals(3000, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getSum());
+
+ assertEquals(5000, (ss1.getColumnStatistics()[2]).getNumberOfValues());
+ assertEquals(5000, (ss2.getColumnStatistics()[2]).getNumberOfValues());
+ assertEquals(1000, (ss3.getColumnStatistics()[2]).getNumberOfValues());
+ assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMinimum());
+ assertEquals("two", ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getMinimum());
+ assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMinimum());
+ assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMaximum());
+ assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMaximum());
+ assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMaximum());
+ assertEquals(15000, ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getSum());
+ assertEquals(15000, ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getSum());
+ assertEquals(5000, ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getSum());
+
+ RecordReaderImpl recordReader = (RecordReaderImpl) reader.rows();
+ OrcProto.RowIndex[] index = recordReader.readRowIndex(0, null, null).getRowGroupIndex();
+ assertEquals(3, index.length);
+ List<OrcProto.RowIndexEntry> items = index[1].getEntryList();
+ assertEquals(1, items.size());
+ assertEquals(3, items.get(0).getPositionsCount());
+ assertEquals(0, items.get(0).getPositions(0));
+ assertEquals(0, items.get(0).getPositions(1));
+ assertEquals(0, items.get(0).getPositions(2));
+ assertEquals(1,
+ items.get(0).getStatistics().getIntStatistics().getMinimum());
+ index = recordReader.readRowIndex(1, null, null).getRowGroupIndex();
+ assertEquals(3, index.length);
+ items = index[1].getEntryList();
+ assertEquals(2,
+ items.get(0).getStatistics().getIntStatistics().getMaximum());
+ }
+
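+ /** Writes an int and string into the struct vector at rowId; a null string marks the entry null. */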
+ private static void setInner(StructColumnVector inner, int rowId,
+ int i, String value) {
+ ((LongColumnVector) inner.fields[0]).vector[rowId] = i;
+ if (value != null) {
+ ((BytesColumnVector) inner.fields[1]).setVal(rowId, value.getBytes());
+ } else {
+ inner.fields[1].isNull[rowId] = true;
+ inner.fields[1].noNulls = false;
+ }
+ }
+
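+ /** Asserts that the struct vector holds the given int and string at rowInBatch. */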
+ private static void checkInner(StructColumnVector inner, int rowId,
+ int rowInBatch, int i, String value) {
+ assertEquals("row " + rowId, i,
+ ((LongColumnVector) inner.fields[0]).vector[rowInBatch]);
+ if (value != null) {
+ assertEquals("row " + rowId, value,
+ ((BytesColumnVector) inner.fields[1]).toString(rowInBatch));
+ } else {
+ assertEquals("row " + rowId, true, inner.fields[1].isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, inner.fields[1].noNulls);
+ }
+ }
+
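+ /** Records the list's offset and length for rowId and appends its elements to the child vector. */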
+ private static void setInnerList(ListColumnVector list, int rowId,
+ List<InnerStruct> value) {
+ if (value != null) {
+ if (list.childCount + value.size() > list.child.isNull.length) {
+ list.child.ensureSize(list.childCount * 2, true);
+ }
+ list.lengths[rowId] = value.size();
+ list.offsets[rowId] = list.childCount;
+ for (int i = 0; i < list.lengths[rowId]; ++i) {
+ InnerStruct inner = value.get(i);
+ setInner((StructColumnVector) list.child, i + list.childCount,
+ inner.int1, inner.string1.toString());
+ }
+ list.childCount += value.size();
+ } else {
+ list.isNull[rowId] = true;
+ list.noNulls = false;
+ }
+ }
+
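+ /** Asserts that the list vector's entries at rowInBatch match the expected list. */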
+ private static void checkInnerList(ListColumnVector list, int rowId,
+ int rowInBatch, List<InnerStruct> value) {
+ if (value != null) {
+ assertEquals("row " + rowId, value.size(), list.lengths[rowInBatch]);
+ int start = (int) list.offsets[rowInBatch];
+ for (int i = 0; i < list.lengths[rowInBatch]; ++i) {
+ InnerStruct inner = value.get(i);
+ checkInner((StructColumnVector) list.child, rowId, i + start,
+ inner.int1, inner.string1.toString());
+ }
+ } else {
+ assertEquals("row " + rowId, true, list.isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, list.noNulls);
+ }
+ }
+
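+ /** Records the map's offset and length for rowId and appends its entries to the key and value vectors. */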
+ private static void setInnerMap(MapColumnVector map, int rowId,
+ Map<String, InnerStruct> value) {
+ if (value != null) {
+ if (map.childCount + value.size() > map.keys.isNull.length) {
+ map.keys.ensureSize(map.childCount * 2, true);
+ map.values.ensureSize(map.childCount * 2, true);
+ }
+ map.lengths[rowId] = value.size();
+ int offset = map.childCount;
+ map.offsets[rowId] = offset;
+
+ for (Map.Entry<String, InnerStruct> entry : value.entrySet()) {
+ ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes());
+ InnerStruct inner = entry.getValue();
+ setInner((StructColumnVector) map.values, offset, inner.int1,
+ inner.string1.toString());
+ offset += 1;
+ }
+ map.childCount = offset;
+ } else {
+ map.isNull[rowId] = true;
+ map.noNulls = false;
+ }
+ }
+
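+ /** Asserts that the map vector's entries at rowInBatch match the expected map. */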
+ private static void checkInnerMap(MapColumnVector map, int rowId,
+ int rowInBatch,
+ Map<String, InnerStruct> value) {
+ if (value != null) {
+ assertEquals("row " + rowId, value.size(), map.lengths[rowInBatch]);
+ int offset = (int) map.offsets[rowInBatch];
+ for(int i=0; i < value.size(); ++i) {
+ String key = ((BytesColumnVector) map.keys).toString(offset + i);
+ InnerStruct expected = value.get(key);
+ checkInner((StructColumnVector) map.values, rowId, offset + i,
+ expected.int1, expected.string1.toString());
+ }
+ } else {
+ assertEquals("row " + rowId, true, map.isNull[rowId]);
+ assertEquals("row " + rowId, false, map.noNulls);
+ }
+ }
+
+ private static void setMiddleStruct(StructColumnVector middle, int rowId,
+ MiddleStruct value) {
+ if (value != null) {
+ setInnerList((ListColumnVector) middle.fields[0], rowId, value.list);
+ } else {
+ middle.isNull[rowId] = true;
+ middle.noNulls = false;
+ }
+ }
+
+ private static void checkMiddleStruct(StructColumnVector middle, int rowId,
+ int rowInBatch, MiddleStruct value) {
+ if (value != null) {
+ checkInnerList((ListColumnVector) middle.fields[0], rowId, rowInBatch,
+ value.list);
+ } else {
+ assertEquals("row " + rowId, true, middle.isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, middle.noNulls);
+ }
+ }
+
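+ /** Fills one row of the big-row batch; null arguments mark the matching column entry null. */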
+ private static void setBigRow(VectorizedRowBatch batch, int rowId,
+ Boolean b1, Byte b2, Short s1,
+ Integer i1, Long l1, Float f1,
+ Double d1, BytesWritable b3, String s2,
+ MiddleStruct m1, List<InnerStruct> l2,
+ Map<String, InnerStruct> m2) {
+ ((LongColumnVector) batch.cols[0]).vector[rowId] = b1 ? 1 : 0;
+ ((LongColumnVector) batch.cols[1]).vector[rowId] = b2;
+ ((LongColumnVector) batch.cols[2]).vector[rowId] = s1;
+ ((LongColumnVector) batch.cols[3]).vector[rowId] = i1;
+ ((LongColumnVector) batch.cols[4]).vector[rowId] = l1;
+ ((DoubleColumnVector) batch.cols[5]).vector[rowId] = f1;
+ ((DoubleColumnVector) batch.cols[6]).vector[rowId] = d1;
+ if (b3 != null) {
+ ((BytesColumnVector) batch.cols[7]).setVal(rowId, b3.getBytes(), 0,
+ b3.getLength());
+ } else {
+ batch.cols[7].isNull[rowId] = true;
+ batch.cols[7].noNulls = false;
+ }
+ if (s2 != null) {
+ ((BytesColumnVector) batch.cols[8]).setVal(rowId, s2.getBytes());
+ } else {
+ batch.cols[8].isNull[rowId] = true;
+ batch.cols[8].noNulls = false;
+ }
+ setMiddleStruct((StructColumnVector) batch.cols[9], rowId, m1);
+ setInnerList((ListColumnVector) batch.cols[10], rowId, l2);
+ setInnerMap((MapColumnVector) batch.cols[11], rowId, m2);
+ }
+
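+ /** Asserts every field of one row of the big-row batch against the expected values. */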
+ private static void checkBigRow(VectorizedRowBatch batch,
+ int rowInBatch,
+ int rowId,
+ boolean b1, byte b2, short s1,
+ int i1, long l1, float f1,
+ double d1, BytesWritable b3, String s2,
+ MiddleStruct m1, List<InnerStruct> l2,
+ Map<String, InnerStruct> m2) {
+ assertEquals("row " + rowId, b1, getBoolean(batch, rowInBatch));
+ assertEquals("row " + rowId, b2, getByte(batch, rowInBatch));
+ assertEquals("row " + rowId, s1, getShort(batch, rowInBatch));
+ assertEquals("row " + rowId, i1, getInt(batch, rowInBatch));
+ assertEquals("row " + rowId, l1, getLong(batch, rowInBatch));
+ assertEquals("row " + rowId, f1, getFloat(batch, rowInBatch), 0.0001);
+ assertEquals("row " + rowId, d1, getDouble(batch, rowInBatch), 0.0001);
+ if (b3 != null) {
+ BytesColumnVector bytes = (BytesColumnVector) batch.cols[7];
+ assertEquals("row " + rowId, b3.getLength(), bytes.length[rowInBatch]);
+ for(int i=0; i < b3.getLength(); ++i) {
+ assertEquals("row " + rowId + " byte " + i, b3.getBytes()[i],
+ bytes.vector[rowInBatch][bytes.start[rowInBatch] + i]);
+ }
+ } else {
+ assertEquals("row " + rowId, true, batch.cols[7].isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, batch.cols[7].noNulls);
+ }
+ if (s2 != null) {
+ assertEquals("row " + rowId, s2, getText(batch, rowInBatch).toString());
+ } else {
+ assertEquals("row " + rowId, true, batch.cols[8].isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, batch.cols[8].noNulls);
+ }
+ checkMiddleStruct((StructColumnVector) batch.cols[9], rowId, rowInBatch,
+ m1);
+ checkInnerList((ListColumnVector) batch.cols[10], rowId, rowInBatch, l2);
+ checkInnerMap((MapColumnVector) batch.cols[11], rowId, rowInBatch, m2);
+ }
+
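+ // Typed accessors that unpack primitive values from the batch's column vectors.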
+ private static boolean getBoolean(VectorizedRowBatch batch, int rowId) {
+ return ((LongColumnVector) batch.cols[0]).vector[rowId] != 0;
+ }
+
+ private static byte getByte(VectorizedRowBatch batch, int rowId) {
+ return (byte) ((LongColumnVector) batch.cols[1]).vector[rowId];
+ }
+
+ private static short getShort(VectorizedRowBatch batch, int rowId) {
+ return (short) ((LongColumnVector) batch.cols[2]).vector[rowId];
+ }
+
+ private static int getInt(VectorizedRowBatch batch, int rowId) {
+ return (int) ((LongColumnVector) batch.cols[3]).vector[rowId];
+ }
+
+ private static long getLong(VectorizedRowBatch batch, int rowId) {
+ return ((LongColumnVector) batch.cols[4]).vector[rowId];
+ }
+
+ private static float getFloat(VectorizedRowBatch batch, int rowId) {
+ return (float) ((DoubleColumnVector) batch.cols[5]).vector[rowId];
+ }
+
+ private static double getDouble(VectorizedRowBatch batch, int rowId) {
+ return ((DoubleColumnVector) batch.cols[6]).vector[rowId];
+ }
+
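+ // Readers that copy values back out of the column vectors, returning null for null entries.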
+ private static BytesWritable getBinary(BytesColumnVector column, int rowId) {
+ if (column.isRepeating) {
+ rowId = 0;
+ }
+ if (column.noNulls || !column.isNull[rowId]) {
+ return new BytesWritable(Arrays.copyOfRange(column.vector[rowId],
+ column.start[rowId], column.start[rowId] + column.length[rowId]));
+ } else {
+ return null;
+ }
+ }
+
+ private static BytesWritable getBinary(VectorizedRowBatch batch, int rowId) {
+ return getBinary((BytesColumnVector) batch.cols[7], rowId);
+ }
+
+ private static Text getText(BytesColumnVector vector, int rowId) {
+ if (vector.isRepeating) {
+ rowId = 0;
+ }
+ if (vector.noNulls || !vector.isNull[rowId]) {
+ return new Text(Arrays.copyOfRange(vector.vector[rowId],
+ vector.start[rowId], vector.start[rowId] + vector.length[rowId]));
+ } else {
+ return null;
+ }
+ }
+
+ private static Text getText(VectorizedRowBatch batch, int rowId) {
+ return getText((BytesColumnVector) batch.cols[8], rowId);
+ }
+
+ private static InnerStruct getInner(StructColumnVector vector,
+ int rowId) {
+ return new InnerStruct(
+ (int) ((LongColumnVector) vector.fields[0]).vector[rowId],
+ getText((BytesColumnVector) vector.fields[1], rowId));
+ }
+
+ private static List<InnerStruct> getList(ListColumnVector cv,
+ int rowId) {
+ if (cv.isRepeating) {
+ rowId = 0;
+ }
+ if (cv.noNulls || !cv.isNull[rowId]) {
+ List<InnerStruct> result =
+ new ArrayList<InnerStruct>((int) cv.lengths[rowId]);
+ for(long i=cv.offsets[rowId];
+ i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
+ result.add(getInner((StructColumnVector) cv.child, (int) i));
+ }
+ return result;
+ } else {
+ return null;
+ }
+ }
+
+ private static List<InnerStruct> getMidList(VectorizedRowBatch batch,
+ int rowId) {
+ return getList((ListColumnVector) ((StructColumnVector) batch.cols[9])
+ .fields[0], rowId);
+ }
+
+ private static List<InnerStruct> getList(VectorizedRowBatch batch,
+ int rowId) {
+ return getList((ListColumnVector) batch.cols[10], rowId);
+ }
+
+ private static Map<Text, InnerStruct> getMap(VectorizedRowBatch batch,
+ int rowId) {
+ MapColumnVector cv = (MapColumnVector) batch.cols[11];
+ if (cv.isRepeating) {
+ rowId = 0;
+ }
+ if (cv.noNulls || !cv.isNull[rowId]) {
+ Map<Text, InnerStruct> result =
+ new HashMap<Text, InnerStruct>((int) cv.lengths[rowId]);
+ for(long i=cv.offsets[rowId];
+ i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
+ result.put(getText((BytesColumnVector) cv.keys, (int) i),
+ getInner((StructColumnVector) cv.values, (int) i));
+ }
+ return result;
+ } else {
+ return null;
+ }
+ }
+
+ private static TypeDescription createInnerSchema() {
+ return TypeDescription.createStruct()
+ .addField("int1", TypeDescription.createInt())
+ .addField("string1", TypeDescription.createString());
+ }
+
+ private static TypeDescription createBigRowSchema() {
+ return TypeDescription.createStruct()
+ .addField("boolean1", TypeDescription.createBoolean())
+ .addField("byte1", TypeDescription.createByte())
+ .addField("short1", TypeDescription.createShort())
+ .addField("int1", TypeDescription.createInt())
+ .addField("long1", TypeDescription.createLong())
+ .addField("float1", TypeDescription.createFloat())
+ .addField("double1", TypeDescription.createDouble())
+ .addField("bytes1", TypeDescription.createBinary())
+ .addField("string1", TypeDescription.createString())
+ .addField("middle", TypeDescription.createStruct()
+ .addField("list", TypeDescription.createList(createInnerSchema())))
+ .addField("list", TypeDescription.createList(createInnerSchema()))
+ .addField("map", TypeDescription.createMap(
+ TypeDescription.createString(),
+ createInnerSchema()));
+ }
+
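+ /** Array comparison that prints every mismatched index before failing. */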
+ static void assertArrayEquals(boolean[] expected, boolean[] actual) {
+ assertEquals(expected.length, actual.length);
+ boolean diff = false;
+ for(int i=0; i < expected.length; ++i) {
+ if (expected[i] != actual[i]) {
+ System.out.println("Difference at " + i + " expected: " + expected[i] +
+ " actual: " + actual[i]);
+ diff = true;
+ }
+ }
+ assertEquals(false, diff);
+ }
+
+ @Test
+ public void test1() throws Exception {
+ TypeDescription schema = createBigRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 2;
+ setBigRow(batch, 0, false, (byte) 1, (short) 1024, 65536,
+ Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
+ new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
+ list(inner(3, "good"), inner(4, "bad")),
+ map());
+ setBigRow(batch, 1, true, (byte) 100, (short) 2048, 65536,
+ Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye",
+ new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
+ list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
+ map(inner(5, "chani"), inner(1, "mauddib")));
+ writer.addRowBatch(batch);
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ schema = writer.getSchema();
+ assertEquals(23, schema.getMaximumId());
+ boolean[] expected = new boolean[] {false, false, false, false, false,
+ false, false, false, false, false,
+ false, false, false, false, false,
+ false, false, false, false, false,
+ false, false, false, false};
+ boolean[] included = OrcUtils.includeColumns("", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, true, false, false, false,
+ false, false, false, false, true,
+ true, true, true, true, true,
+ false, false, false, false, true,
+ true, true, true, true};
+ included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema);
+ assertArrayEquals(expected, included);
+
+ expected = new boolean[] {false, true, true, true, true,
+ true, true, true, true, true,
+ true, true, true, true, true,
+ true, true, true, true, true,
+ true, true, true, true};
+ included = OrcUtils.includeColumns(
+ "boolean1,byte1,short1,int1,long1,float1,double1,bytes1,string1,middle,list,map",
+ schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(2, stats[1].getNumberOfValues());
+ assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
+ assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
+ assertEquals("count: 2 hasNull: false true: 1", stats[1].toString());
+
+ assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
+ assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
+ assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
+ assertEquals("count: 2 hasNull: false min: 1024 max: 2048 sum: 3072",
+ stats[3].toString());
+
+ StripeStatistics ss = reader.getStripeStatistics().get(0);
+ assertEquals(2, ss.getColumnStatistics()[0].getNumberOfValues());
+ assertEquals(1, ((BooleanColumnStatistics) ss.getColumnStatistics()[1]).getTrueCount());
+ assertEquals(1024, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMinimum());
+ assertEquals(2048, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMaximum());
+ assertEquals(3072, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getSum());
+ assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
+ assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
+ assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
+ assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0",
+ stats[7].toString());
+
+ assertEquals("count: 2 hasNull: false min: bye max: hi sum: 5", stats[9].toString());
+
+ // check the schema
+ TypeDescription readerSchema = reader.getSchema();
+ assertEquals(TypeDescription.Category.STRUCT, readerSchema.getCategory());
+ assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint,"
+ + "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
+ + "binary,string1:string,middle:struct<list:array<struct<int1:int,"
+ + "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
+ + "map:map<string,struct<int1:int,string1:string>>>",
+ readerSchema.toString());
+ List<String> fieldNames = readerSchema.getFieldNames();
+ List<TypeDescription> fieldTypes = readerSchema.getChildren();
+ assertEquals("boolean1", fieldNames.get(0));
+ assertEquals(TypeDescription.Category.BOOLEAN, fieldTypes.get(0).getCategory());
+ assertEquals("byte1", fieldNames.get(1));
+ assertEquals(TypeDescription.Category.BYTE, fieldTypes.get(1).getCategory());
+ assertEquals("short1", fieldNames.get(2));
+ assertEquals(TypeDescription.Category.SHORT, fieldTypes.get(2).getCategory());
+ assertEquals("int1", fieldNames.get(3));
+ assertEquals(TypeDescription.Category.INT, fieldTypes.get(3).getCategory());
+ assertEquals("long1", fieldNames.get(4));
+ assertEquals(TypeDescription.Category.LONG, fieldTypes.get(4).getCategory());
+ assertEquals("float1", fieldNames.get(5));
+ assertEquals(TypeDescription.Category.FLOAT, fieldTypes.get(5).getCategory());
+ assertEquals("double1", fieldNames.get(6));
+ assertEquals(TypeDescription.Category.DOUBLE, fieldTypes.get(6).getCategory());
+ assertEquals("bytes1", fieldNames.get(7));
+ assertEquals(TypeDescription.Category.BINARY, fieldTypes.get(7).getCategory());
+ assertEquals("string1", fieldNames.get(8));
+ assertEquals(TypeDescription.Category.STRING, fieldTypes.get(8).getCategory());
+ assertEquals("middle", fieldNames.get(9));
+ TypeDescription middle = fieldTypes.get(9);
+ assertEquals(TypeDescription.Category.STRUCT, middle.getCategory());
+ TypeDescription midList = middle.getChildren().get(0);
+ assertEquals(TypeDescription.Category.LIST, midList.getCategory());
+ TypeDescription inner = midList.getChildren().get(0);
+ assertEquals(TypeDescription.Category.STRUCT, inner.getCategory());
+ assertEquals("int1", inner.getFieldNames().get(0));
+ assertEquals("string1", inner.getFieldNames().get(1));
+
+ RecordReader rows = reader.rows();
+ // create a new batch
+ batch = readerSchema.createRowBatch();
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(2, batch.size);
+ Assert.assertEquals(false, rows.nextBatch(batch));
+
+ // check the contents of the first row
+ assertEquals(false, getBoolean(batch, 0));
+ assertEquals(1, getByte(batch, 0));
+ assertEquals(1024, getShort(batch, 0));
+ assertEquals(65536, getInt(batch, 0));
+ assertEquals(Long.MAX_VALUE, getLong(batch, 0));
+ assertEquals(1.0, getFloat(batch, 0), 0.00001);
+ assertEquals(-15.0, getDouble(batch, 0), 0.00001);
+ assertEquals(bytes(0,1,2,3,4), getBinary(batch, 0));
+ assertEquals("hi", getText(batch, 0).toString());
+ List<InnerStruct> midRow = getMidList(batch, 0);
+ assertNotNull(midRow);
+ assertEquals(2, midRow.size());
+ assertEquals(1, midRow.get(0).int1);
+ assertEquals("bye", midRow.get(0).string1.toString());
+ assertEquals(2, midRow.get(1).int1);
+ assertEquals("sigh", midRow.get(1).string1.toString());
+ List<InnerStruct> list = getList(batch, 0);
+ assertEquals(2, list.size());
+ assertEquals(3, list.get(0).int1);
+ assertEquals("good", list.get(0).string1.toString());
+ assertEquals(4, list.get(1).int1);
+ assertEquals("bad", list.get(1).string1.toString());
+ Map<Text, InnerStruct> map = getMap(batch, 0);
+ assertEquals(0, map.size());
+
+ // check the contents of second row
+ assertEquals(true, getBoolean(batch, 1));
+ assertEquals(100, getByte(batch, 1));
+ assertEquals(2048, getShort(batch, 1));
+ assertEquals(65536, getInt(batch, 1));
+ assertEquals(Long.MAX_VALUE, getLong(batch, 1));
+ assertEquals(2.0, getFloat(batch, 1), 0.00001);
+ assertEquals(-5.0, getDouble(batch, 1), 0.00001);
+ assertEquals(bytes(), getBinary(batch, 1));
+ assertEquals("bye", getText(batch, 1).toString());
+ midRow = getMidList(batch, 1);
+ assertNotNull(midRow);
+ assertEquals(2, midRow.size());
+ assertEquals(1, midRow.get(0).int1);
+ assertEquals("bye", midRow.get(0).string1.toString());
+ assertEquals(2, midRow.get(1).int1);
+ assertEquals("sigh", midRow.get(1).string1.toString());
+ list = getList(batch, 1);
+ assertEquals(3, list.size());
+ assertEquals(100000000, list.get(0).int1);
+ assertEquals("cat", list.get(0).string1.toString());
+ assertEquals(-100000, list.get(1).int1);
+ assertEquals("in", list.get(1).string1.toString());
+ assertEquals(1234, list.get(2).int1);
+ assertEquals("hat", list.get(2).string1.toString());
+ map = getMap(batch, 1);
+ assertEquals(2, map.size());
+ InnerStruct value = map.get(new Text("chani"));
+ assertEquals(5, value.int1);
+ assertEquals("chani", value.string1.toString());
+ value = map.get(new Text("mauddib"));
+ assertEquals(1, value.int1);
+ assertEquals("mauddib", value.string1.toString());
+
+ // handle the close up
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+
+ @Test
+ public void testColumnProjection() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(100)
+ .rowIndexStride(1000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ Random r1 = new Random(1);
+ Random r2 = new Random(2);
+ int x;
+ int minInt=0, maxInt=0;
+ String y;
+ String minStr = null, maxStr = null;
+ batch.size = 1000;
+ boolean first = true;
+ for(int b=0; b < 21; ++b) {
+ for(int r=0; r < 1000; ++r) {
+ x = r1.nextInt();
+ y = Long.toHexString(r2.nextLong());
+ if (first || x < minInt) {
+ minInt = x;
+ }
+ if (first || x > maxInt) {
+ maxInt = x;
+ }
+ if (first || y.compareTo(minStr) < 0) {
+ minStr = y;
+ }
+ if (first || y.compareTo(maxStr) > 0) {
+ maxStr = y;
+ }
+ first = false;
+ ((LongColumnVector) batch.cols[0]).vector[r] = x;
+ ((BytesColumnVector) batch.cols[1]).setVal(r, y.getBytes());
+ }
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ // check out the statistics
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(3, stats.length);
+ for(ColumnStatistics s: stats) {
+ assertEquals(21000, s.getNumberOfValues());
+ if (s instanceof IntegerColumnStatistics) {
+ assertEquals(minInt, ((IntegerColumnStatistics) s).getMinimum());
+ assertEquals(maxInt, ((IntegerColumnStatistics) s).getMaximum());
+ } else if (s instanceof StringColumnStatistics) {
+ assertEquals(maxStr, ((StringColumnStatistics) s).getMaximum());
+ assertEquals(minStr, ((StringColumnStatistics) s).getMinimum());
+ }
+ }
+
+ // check out the types
+ TypeDescription type = reader.getSchema();
+ assertEquals(TypeDescription.Category.STRUCT, type.getCategory());
+ assertEquals(2, type.getChildren().size());
+ TypeDescription type1 = type.getChildren().get(0);
+ TypeDescription type2 = type.getChildren().get(1);
+ assertEquals(TypeDescription.Category.INT, type1.getCategory());
+ assertEquals(TypeDescription.Category.STRING, type2.getCategory());
+ assertEquals("struct<int1:int,string1:string>", type.toString());
+
+ // read the contents and make sure they match
+ RecordReader rows1 = reader.rows(
+ new Reader.Options().include(new boolean[]{true, true, false}));
+ RecordReader rows2 = reader.rows(
+ new Reader.Options().include(new boolean[]{true, false, true}));
+ r1 = new Random(1);
+ r2 = new Random(2);
+ VectorizedRowBatch batch1 = reader.getSchema().createRowBatch(1000);
+ VectorizedRowBatch batch2 = reader.getSchema().createRowBatch(1000);
+ for(int i = 0; i < 21000; i += 1000) {
+ Assert.assertEquals(true, rows1.nextBatch(batch1));
+ Assert.assertEquals(true, rows2.nextBatch(batch2));
+ assertEquals(1000, batch1.size);
+ assertEquals(1000, batch2.size);
+ for(int j=0; j < 1000; ++j) {
+ assertEquals(r1.nextInt(),
+ ((LongColumnVector) batch1.cols[0]).vector[j]);
+ assertEquals(Long.toHexString(r2.nextLong()),
+ ((BytesColumnVector) batch2.cols[1]).toString(j));
+ }
+ }
+ Assert.assertEquals(false, rows1.nextBatch(batch1));
+ Assert.assertEquals(false, rows2.nextBatch(batch2));
+ rows1.close();
+ rows2.close();
+ }
+
+ @Test
+ public void testEmptyFile() throws Exception {
+ TypeDescription schema = createBigRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(100));
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+ Assert.assertEquals(false, reader.rows().nextBatch(batch));
+ Assert.assertEquals(CompressionKind.NONE, reader.getCompressionKind());
+ Assert.assertEquals(0, reader.getNumberOfRows());
+ Assert.assertEquals(0, reader.getCompressionSize());
+ Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
+ Assert.assertEquals(3, reader.getContentLength());
+ Assert.assertEquals(false, reader.getStripes().iterator().hasNext());
+ }
+
+ @Test
+ public void metaData() throws Exception {
+ TypeDescription schema = createBigRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(100));
+ writer.addUserMetadata("my.meta", byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127,
+ -128));
+ writer.addUserMetadata("clobber", byteBuf(1, 2, 3));
+ writer.addUserMetadata("clobber", byteBuf(4, 3, 2, 1));
+ ByteBuffer bigBuf = ByteBuffer.allocate(40000);
+ Random random = new Random(0);
+ random.nextBytes(bigBuf.array());
+ writer.addUserMetadata("big", bigBuf);
+ bigBuf.position(0);
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1;
+ setBigRow(batch, 0, true, (byte) 127, (short) 1024, 42,
+ 42L * 1024 * 1024 * 1024, (float) 3.1415, -2.713, null,
+ null, null, null, null);
+ writer.addRowBatch(batch);
+ writer.addUserMetadata("clobber", byteBuf(5,7,11,13,17,19));
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ Assert.assertEquals(byteBuf(5, 7, 11, 13, 17, 19), reader.getMetadataValue("clobber"));
+ Assert.assertEquals(byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, -128),
+ reader.getMetadataValue("my.meta"));
+ Assert.assertEquals(bigBuf, reader.getMetadataValue("big"));
+ try {
+ reader.getMetadataValue("unknown");
+ assertTrue(false);
+ } catch (IllegalArgumentException iae) {
+ // PASS
+ }
+ int i = 0;
+ for(String key: reader.getMetadataKeys()) {
+ if ("my.meta".equals(key) ||
+ "clobber".equals(key) ||
+ "big".equals(key)) {
+ i += 1;
+ } else {
+ throw new IllegalArgumentException("unknown key " + key);
+ }
+ }
+ assertEquals(3, i);
+ int numStripes = reader.getStripeStatistics().size();
+ assertEquals(1, numStripes);
+ }
+
+ /**
+ * Generate an ORC file with a range of dates and times, then read it back
+ * and verify that the values round-trip.
+ */
+ public void createOrcDateFile(Path file, int minYear, int maxYear
+ ) throws IOException {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("time", TypeDescription.createTimestamp())
+ .addField("date", TypeDescription.createDate());
+ Writer writer = OrcFile.createWriter(file,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .blockPadding(false));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1000;
+ for (int year = minYear; year < maxYear; ++year) {
+ for (int ms = 1000; ms < 2000; ++ms) {
+ TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
+ timestampColVector.set(ms - 1000,
+ Timestamp.valueOf(year +
+ "-05-05 12:34:56." + ms));
+ ((LongColumnVector) batch.cols[1]).vector[ms - 1000] =
+ new DateWritable(new Date(year - 1900, 11, 25)).getDays();
+ }
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(file,
+ OrcFile.readerOptions(conf));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch(1000);
+ TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+ LongColumnVector dates = (LongColumnVector) batch.cols[1];
+ for (int year = minYear; year < maxYear; ++year) {
+ rows.nextBatch(batch);
+ assertEquals(1000, batch.size);
+ for(int ms = 1000; ms < 2000; ++ms) {
+ StringBuilder buffer = new StringBuilder();
+ times.stringifyValue(buffer, ms - 1000);
+ String expected = Integer.toString(year) + "-05-05 12:34:56.";
+ // stringifyValue drops trailing zeros from the fractional seconds, so
+ // compute the expected digits by dividing ms by the largest power of 10
+ // that divides it evenly.
+ int roundedMs = ms;
+ for(int round = 1000; round > 0; round /= 10) {
+ if (ms % round == 0) {
+ roundedMs = ms / round;
+ break;
+ }
+ }
+ expected += roundedMs;
+ assertEquals(expected, buffer.toString());
+ assertEquals(Integer.toString(year) + "-12-25",
+ new DateWritable((int) dates.vector[ms - 1000]).toString());
+ }
+ }
+ rows.nextBatch(batch);
+ assertEquals(0, batch.size);
+ }
+
+ @Test
+ public void testDate1900() throws Exception {
+ createOrcDateFile(testFilePath, 1900, 1970);
+ }
+
+ @Test
+ public void testDate2038() throws Exception {
+ createOrcDateFile(testFilePath, 2038, 2250);
+ }
+
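+ /** Populates one row of the time/union/decimal batch; null arguments mark the matching entry null. */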
+ private static void setUnion(VectorizedRowBatch batch, int rowId,
+ Timestamp ts, Integer tag, Integer i, String s,
+ HiveDecimalWritable dec) {
+ UnionColumnVector union = (UnionColumnVector) batch.cols[1];
+ if (ts != null) {
+ TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
+ timestampColVector.set(rowId, ts);
+ } else {
+ batch.cols[0].isNull[rowId] = true;
+ batch.cols[0].noNulls = false;
+ }
+ if (tag != null) {
+ union.tags[rowId] = tag;
+ if (tag == 0) {
+ if (i != null) {
+ ((LongColumnVector) union.fields[tag]).vector[rowId] = i;
+ } else {
+ union.fields[tag].isNull[rowId] = true;
+ union.fields[tag].noNulls = false;
+ }
+ } else if (tag == 1) {
+ if (s != null) {
+ ((BytesColumnVector) union.fields[tag]).setVal(rowId, s.getBytes());
+ } else {
+ union.fields[tag].isNull[rowId] = true;
+ union.fields[tag].noNulls = false;
+ }
+ } else {
+ throw new IllegalArgumentException("Bad tag " + tag);
+ }
+ } else {
+ batch.cols[1].isNull[rowId] = true;
+ batch.cols[1].noNulls = false;
+ }
+ if (dec != null) {
+ ((DecimalColumnVector) batch.cols[2]).vector[rowId] = dec;
+ } else {
+ batch.cols[2].isNull[rowId] = true;
+ batch.cols[2].noNulls = false;
+ }
+ }
+
+ /**
+ * We test union, timestamp, and decimal separately since we need to make
+ * the object inspector manually. (The Hive reflection-based object
+ * inspector doesn't handle them properly.)
+ */
+ @Test
+ public void testUnionAndTimestamp() throws Exception {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("time", TypeDescription.createTimestamp())
+ .addField("union", TypeDescription.createUnion()
+ .addUnionChild(TypeDescription.createInt())
+ .addUnionChild(TypeDescription.createString()))
+ .addField("decimal", TypeDescription.createDecimal()
+ .withPrecision(38)
+ .withScale(18));
+ HiveDecimal maxValue = HiveDecimal.create("10000000000000000000");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(100)
+ .blockPadding(false));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 6;
+ setUnion(batch, 0, Timestamp.valueOf("2000-03-12 15:00:00"), 0, 42, null,
+ new HiveDecimalWritable("12345678.6547456"));
+ setUnion(batch, 1, Timestamp.valueOf("2000-03-20 12:00:00.123456789"),
+ 1, null, "hello", new HiveDecimalWritable("-5643.234"));
+
+ setUnion(batch, 2, null, null, null, null, null);
+ setUnion(batch, 3, null, 0, null, null, null);
+ setUnion(batch, 4, null, 1, null, null, null);
+
+ setUnion(batch, 5, Timestamp.valueOf("1970-01-01 00:00:00"), 0, 200000,
+ null, new HiveDecimalWritable("10000000000000000000"));
+ writer.addRowBatch(batch);
+
+ batch.reset();
+ Random rand = new Random(42);
+ for(int i=1970; i < 2038; ++i) {
+ Timestamp ts = Timestamp.valueOf(i + "-05-05 12:34:56." + i);
+ HiveDecimal dec =
+ HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18));
+ if ((i & 1) == 0) {
+ setUnion(batch, batch.size++, ts, 0, i*i, null,
+ new HiveDecimalWritable(dec));
+ } else {
+ setUnion(batch, batch.size++, ts, 1, null, Integer.toString(i*i),
+ new HiveDecimalWritable(dec));
+ }
+ if (maxValue.compareTo(dec) < 0) {
+ maxValue = dec;
+ }
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+
+ // add a lot of repeating rows to exercise the run-length encoding (RLE)
+ batch.size = 1000;
+ for(int c=0; c < batch.cols.length; ++c) {
+ batch.cols[c].setRepeating(true);
+ }
+ ((UnionColumnVector) batch.cols[1]).fields[0].isRepeating = true;
+ setUnion(batch, 0, null, 0, 1732050807, null, null);
+ for(int i=0; i < 5; ++i) {
+ writer.addRowBatch(batch);
+ }
+
+ batch.reset();
+ batch.size = 3;
+ setUnion(batch, 0, null, 0, 0, null, null);
+ setUnion(batch, 1, null, 0, 10, null, null);
+ setUnion(batch, 2, null, 0, 138, null, null);
+ writer.addRowBatch(batch);
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ schema = writer.getSchema();
+ assertEquals(5, schema.getMaximumId());
+ boolean[] expected = new boolean[] {false, false, false, false, false, false};
+ boolean[] included = OrcUtils.includeColumns("", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, true, false, false, false, true};
+ included = OrcUtils.includeColumns("time,decimal", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, false, true, true, true, false};
+ included = OrcUtils.includeColumns("union", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
+ Assert.assertEquals(5077, reader.getNumberOfRows());
+ DecimalColumnStatistics stats =
+ (DecimalColumnStatistics) reader.getStatistics()[5];
+ assertEquals(71, stats.getNumberOfValues());
+ assertEquals(HiveDecimal.create("-5643.234"), stats.getMinimum());
+ assertEquals(maxValue, stats.getMaximum());
+ // TODO: fix this
+// assertEquals(null,stats.getSum());
+ int stripeCount = 0;
+ int rowCount = 0;
+ long currentOffset = -1;
+ for(StripeInformation stripe: reader.getStripes()) {
+ stripeCount += 1;
+ rowCount += stripe.getNumberOfRows();
+ if (currentOffset < 0) {
+ currentOffset = stripe.getOffset() + stripe.getLength();
+ } else {
+ assertEquals(currentOffset, stripe.getOffset());
+ currentOffset += stripe.getLength();
+ }
+ }
+ Assert.assertEquals(reader.getNumberOfRows(), rowCount);
+ assertEquals(2, stripeCount);
+ Assert.assertEquals(reader.getContentLength(), currentOffset);
+ RecordReader rows = reader.rows();
+ Assert.assertEquals(0, rows.getRowNumber());
+ Assert.assertEquals(0.0, rows.getProgress(), 0.000001);
+
+ schema = reader.getSchema();
+ batch = schema.createRowBatch(74);
+ Assert.assertEquals(0, rows.getRowNumber());
+ rows.nextBatch(batch);
+ assertEquals(74, batch.size);
+ Assert.assertEquals(74, rows.getRowNumber());
+ TimestampColumnVector ts = (TimestampColumnVector) batch.cols[0];
+ UnionColumnVector union = (UnionColumnVector) batch.cols[1];
+ LongColumnVector longs = (LongColumnVector) union.fields[0];
+ BytesColumnVector strs = (BytesColumnVector) union.fields[1];
+ DecimalColumnVector decs = (DecimalColumnVector) batch.cols[2];
+
+ assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>",
+ schema.toString());
+ assertEquals("2000-03-12 15:00:00.0", ts.asScratchTimestamp(0).toString());
+ assertEquals(0, union.tags[0]);
+ assertEquals(42, longs.vector[0]);
+ assertEquals("12345678.6547456", decs.vector[0].toString());
+
+ assertEquals("2000-03-20 12:00:00.123456789", ts.asScratchTimestamp(1).toString());
+ assertEquals(1, union.tags[1]);
+ assertEquals("hello", strs.toString(1));
+ assertEquals("-5643.234", decs.vector[1].toString());
+
+ assertEquals(false, ts.noNulls);
+ assertEquals(false, union.noNulls);
+ assertEquals(false, decs.noNulls);
+ assertEquals(true, ts.isNull[2]);
+ assertEquals(true, union.isNull[2]);
+ assertEquals(true, decs.isNull[2]);
+
+ assertEquals(true, ts.isNull[3]);
+ assertEquals(false, union.isNull[3]);
+ assertEquals(0, union.tags[3]);
+ assertEquals(true, longs.isNull[3]);
+ assertEquals(true, decs.isNull[3]);
+
+ assertEquals(true, ts.isNull[4]);
+ assertEquals(false, union.isNull[4]);
+ assertEquals(1, union.tags[4]);
+ assertEquals(true, strs.isNull[4]);
+ assertEquals(true, decs.isNull[4]);
+
+ assertEquals(false, ts.isNull[5]);
+ assertEquals("1970-01-01 00:00:00.0", ts.asScratchTimestamp(5).toString());
+ assertEquals(false, union.isNull[5]);
+ assertEquals(0, union.tags[5]);
+ assertEquals(false, longs.isNull[5]);
+ assertEquals(200000, longs.vector[5]);
+ assertEquals(false, decs.isNull[5]);
+ assertEquals("10000000000000000000", decs.vector[5].toString());
+
+ rand = new Random(42);
+ for(int i=1970; i < 2038; ++i) {
+ int row = 6 + i - 1970;
+ assertEquals(Timestamp.valueOf(i + "-05-05 12:34:56." + i),
+ ts.asScratchTimestamp(row));
+ if ((i & 1) == 0) {
+ assertEquals(0, union.tags[row]);
+ assertEquals(i*i, longs.vector[row]);
+ } else {
+ assertEquals(1, union.tags[row]);
+ assertEquals(Integer.toString(i * i), strs.toString(row));
+ }
+ assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand),
+ rand.nextInt(18))), decs.vector[row]);
+ }
+
+ // rebuild the row batch, so that we can read by 1000 rows
+ batch = schema.createRowBatch(1000);
+ ts = (TimestampColumnVector) batch.cols[0];
+ union = (UnionColumnVector) batch.cols[1];
+ longs = (LongColumnVector) union.fields[0];
+ strs = (BytesColumnVector) union.fields[1];
+ decs = (DecimalColumnVector) batch.cols[2];
+
+ for(int i=0; i < 5; ++i) {
+ rows.nextBatch(batch);
+ assertEquals("batch " + i, 1000, batch.size);
+ assertEquals("batch " + i, false, union.isRepeating);
+ assertEquals("batch " + i, true, union.noNulls);
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals("bad tag at " + i + "." +r, 0, union.tags[r]);
+ }
+ assertEquals("batch " + i, true, longs.isRepeating);
+ assertEquals("batch " + i, 1732050807, longs.vector[0]);
+ }
+
+ rows.nextBatch(batch);
+ assertEquals(3, batch.size);
+ assertEquals(0, union.tags[0]);
+ assertEquals(0, longs.vector[0]);
+ assertEquals(0, union.tags[1]);
+ assertEquals(10, longs.vector[1]);
+ assertEquals(0, union.tags[2]);
+ assertEquals(138, longs.vector[2]);
+
+ rows.nextBatch(batch);
+ assertEquals(0, batch.size);
+ Assert.assertEquals(1.0, rows.getProgress(), 0.00001);
+ Assert.assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
+ rows.seekToRow(1);
+ rows.nextBatch(batch);
+ assertEquals(1000, batch.size);
+ assertEquals(Timestamp.valueOf("2000-03-20 12:00:00.123456789"), ts.asScratchTimestamp(0));
+ assertEquals(1, union.tags[0]);
+ assertEquals("hello", strs.toString(0));
+ assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), decs.vector[0]);
+ rows.close();
+ }
+
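A note on the union layout exercised above: each row of a UnionColumnVector
carries a tag that selects the active child in fields[], and only that
child's entry for the row is meaningful. A minimal sketch of populating one
union column, reusing only classes the test already depends on (import paths
assume the post-HIVE-11417 module layout):

    import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.TypeDescription;

    // Build a one-column batch whose column is uniontype<int,string>.
    TypeDescription schema = TypeDescription.createStruct()
        .addField("u", TypeDescription.createUnion()
            .addUnionChild(TypeDescription.createInt())
            .addUnionChild(TypeDescription.createString()));
    VectorizedRowBatch batch = schema.createRowBatch();
    UnionColumnVector u = (UnionColumnVector) batch.cols[0];
    u.tags[0] = 0;                                    // row 0 uses the int child
    ((LongColumnVector) u.fields[0]).vector[0] = 7;
    u.tags[1] = 1;                                    // row 1 uses the string child
    ((BytesColumnVector) u.fields[1]).setVal(1, "x".getBytes());
    batch.size = 2;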
+ /**
+ * Read and write a randomly generated snappy file.
+ * @throws Exception
+ */
+ @Test
+ public void testSnappy() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.SNAPPY)
+ .bufferSize(100));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ Random rand = new Random(12);
+ batch.size = 1000;
+ for(int b=0; b < 10; ++b) {
+ for (int r=0; r < 1000; ++r) {
+ ((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
+ ((BytesColumnVector) batch.cols[1]).setVal(r,
+ Integer.toHexString(rand.nextInt()).getBytes());
+ }
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind());
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch(1000);
+ rand = new Random(12);
+ LongColumnVector longs = (LongColumnVector) batch.cols[0];
+ BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
+ for(int b=0; b < 10; ++b) {
+ rows.nextBatch(batch);
+ assertEquals(1000, batch.size);
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(rand.nextInt(), longs.vector[r]);
+ assertEquals(Integer.toHexString(rand.nextInt()), strs.toString(r));
+ }
+ }
+ rows.nextBatch(batch);
+ assertEquals(0, batch.size);
+ rows.close();
+ }
+
+  /**
+   * Read and write a randomly generated snappy file that is written without
+   * row indexes (rowIndexStride of 0).
+   * @throws Exception
+   */
+ @Test
+ public void testWithoutIndex() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(5000)
+ .compress(CompressionKind.SNAPPY)
+ .bufferSize(1000)
+ .rowIndexStride(0));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ Random rand = new Random(24);
+ batch.size = 5;
+ for(int c=0; c < batch.cols.length; ++c) {
+ batch.cols[c].setRepeating(true);
+ }
+ for(int i=0; i < 10000; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[0] = rand.nextInt();
+ ((BytesColumnVector) batch.cols[1])
+ .setVal(0, Integer.toBinaryString(rand.nextInt()).getBytes());
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ Assert.assertEquals(50000, reader.getNumberOfRows());
+ Assert.assertEquals(0, reader.getRowIndexStride());
+ StripeInformation stripe = reader.getStripes().iterator().next();
+ assertEquals(true, stripe.getDataLength() != 0);
+ assertEquals(0, stripe.getIndexLength());
+ RecordReader rows = reader.rows();
+ rand = new Random(24);
+ batch = reader.getSchema().createRowBatch(1000);
+ LongColumnVector longs = (LongColumnVector) batch.cols[0];
+ BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
+ for(int i=0; i < 50; ++i) {
+ rows.nextBatch(batch);
+ assertEquals("batch " + i, 1000, batch.size);
+ for(int j=0; j < 200; ++j) {
+ int intVal = rand.nextInt();
+ String strVal = Integer.toBinaryString(rand.nextInt());
+ for (int k = 0; k < 5; ++k) {
+ assertEquals(intVal, longs.vector[j * 5 + k]);
+ assertEquals(strVal, strs.toString(j * 5 + k));
+ }
+ }
+ }
+ rows.nextBatch(batch);
+ assertEquals(0, batch.size);
+ rows.close();
+ }
+
+ @Test
+ public void testSeek() throws Exception {
+ TypeDescription schema = createBigRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(200000)
+ .bufferSize(65536)
+ .rowIndexStride(1000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ Random rand = new Random(42);
+ final int COUNT=32768;
+ long[] intValues= new long[COUNT];
+ double[] doubleValues = new double[COUNT];
+ String[] stringValues = new String[COUNT];
+ BytesWritable[] byteValues = new BytesWritable[COUNT];
+ String[] words = new String[128];
+ for(int i=0; i < words.length; ++i) {
+ words[i] = Integer.toHexString(rand.nextInt());
+ }
+ for(int i=0; i < COUNT/2; ++i) {
+ intValues[2*i] = rand.nextLong();
+ intValues[2*i+1] = intValues[2*i];
+ stringValues[2*i] = words[rand.nextInt(words.length)];
+ stringValues[2*i+1] = stringValues[2*i];
+ }
+ for(int i=0; i < COUNT; ++i) {
+ doubleValues[i] = rand.nextDouble();
+ byte[] buf = new byte[20];
+ rand.nextBytes(buf);
+ byteValues[i] = new BytesWritable(buf);
+ }
+ for(int i=0; i < COUNT; ++i) {
+ appendRandomRow(batch, intValues, doubleValues, stringValues,
+ byteValues, words, i);
+ if (batch.size == 1024) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size != 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ Assert.assertEquals(COUNT, reader.getNumberOfRows());
+ RecordReader rows = reader.rows();
+ // get the row index
+ DataReader meta = RecordReaderUtils.createDefaultDataReader(
+ DataReaderProperties.builder()
+ .withBufferSize(reader.getCompressionSize())
+ .withFileSystem(fs)
+ .withPath(testFilePath)
+ .withCompression(reader.getCompressionKind())
+ .withTypeCount(reader.getSchema().getMaximumId() + 1)
+ .withZeroCopy(false)
+ .build());
+ OrcIndex index =
+ meta.readRowIndex(reader.getStripes().get(0), null, null, null, null,
+ null);
+ // check the primitive columns to make sure they have the right number of
+ // items in the first row group
+ for(int c=1; c < 9; ++c) {
+ OrcProto.RowIndex colIndex = index.getRowGroupIndex()[c];
+ assertEquals(1000,
+ colIndex.getEntry(0).getStatistics().getNumberOfValues());
+ }
+ batch = reader.getSchema().createRowBatch();
+ int nextRowInBatch = -1;
+ for(int i=COUNT-1; i >= 0; --i, --nextRowInBatch) {
+ // if we have consumed the previous batch read a new one
+ if (nextRowInBatch < 0) {
+ long base = Math.max(i - 1023, 0);
+ rows.seekToRow(base);
+ Assert.assertEquals("row " + i, true, rows.nextBatch(batch));
+ nextRowInBatch = batch.size - 1;
+ }
+ checkRandomRow(batch, intValues, doubleValues,
+ stringValues, byteValues, words, i, nextRowInBatch);
+ }
+ rows.close();
+ Iterator<StripeInformation> stripeIterator =
+ reader.getStripes().iterator();
+ long offsetOfStripe2 = 0;
+ long offsetOfStripe4 = 0;
+ long lastRowOfStripe2 = 0;
+ for(int i = 0; i < 5; ++i) {
+ StripeInformation stripe = stripeIterator.next();
+ if (i < 2) {
+ lastRowOfStripe2 += stripe.getNumberOfRows();
+ } else if (i == 2) {
+ offsetOfStripe2 = stripe.getOffset();
+ lastRowOfStripe2 += stripe.getNumberOfRows() - 1;
+ } else if (i == 4) {
+ offsetOfStripe4 = stripe.getOffset();
+ }
+ }
+ boolean[] columns = new boolean[reader.getStatistics().length];
+    columns[5] = true; // long column
+ columns[9] = true; // text column
+ rows = reader.rows(new Reader.Options()
+ .range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2)
+ .include(columns));
+ rows.seekToRow(lastRowOfStripe2);
+ // we only want two rows
+ batch = reader.getSchema().createRowBatch(2);
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1, batch.size);
+ assertEquals(intValues[(int) lastRowOfStripe2], getLong(batch, 0));
+ assertEquals(stringValues[(int) lastRowOfStripe2],
+ getText(batch, 0).toString());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(intValues[(int) lastRowOfStripe2 + 1], getLong(batch, 0));
+ assertEquals(stringValues[(int) lastRowOfStripe2 + 1],
+ getText(batch, 0).toString());
+ rows.close();
+ }
+
+ private void appendRandomRow(VectorizedRowBatch batch,
+ long[] intValues, double[] doubleValues,
+ String[] stringValues,
+ BytesWritable[] byteValues,
+ String[] words, int i) {
+ InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
+ InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
+ words[i % words.length] + "-x");
+ setBigRow(batch, batch.size++, (intValues[i] & 1) == 0, (byte) intValues[i],
+ (short) intValues[i], (int) intValues[i], intValues[i],
+ (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
+ new MiddleStruct(inner, inner2), list(), map(inner, inner2));
+ }
+
+ private void checkRandomRow(VectorizedRowBatch batch,
+ long[] intValues, double[] doubleValues,
+ String[] stringValues,
+ BytesWritable[] byteValues,
+ String[] words, int i, int rowInBatch) {
+ InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
+ InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
+ words[i % words.length] + "-x");
+ checkBigRow(batch, rowInBatch, i, (intValues[i] & 1) == 0, (byte) intValues[i],
+ (short) intValues[i], (int) intValues[i], intValues[i],
+ (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
+ new MiddleStruct(inner, inner2), list(), map(inner, inner2));
+ }
+
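+  /**
+   * A stub MemoryManager that records the single writer registered with it
+   * and fires the low-memory callback every 100 rows, so the tests below can
+   * force frequent stripe-flush checks against a small fixed memory pool.
+   */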
+ private static class MyMemoryManager extends MemoryManager {
+ final long totalSpace;
+ double rate;
+ Path path = null;
+ long lastAllocation = 0;
+ int rows = 0;
+ Callback callback;
+
+ MyMemoryManager(Configuration conf, long totalSpace, double rate) {
+ super(conf);
+ this.totalSpace = totalSpace;
+ this.rate = rate;
+ }
+
+ @Override
+ public void addWriter(Path path, long requestedAllocation,
+ Callback callback) {
+ this.path = path;
+ this.lastAllocation = requestedAllocation;
+ this.callback = callback;
+ }
+
+ @Override
+ public synchronized void removeWriter(Path path) {
+ this.path = null;
+ this.lastAllocation = 0;
+ }
+
+ @Override
+ public long getTotalMemoryPool() {
+ return totalSpace;
+ }
+
+ @Override
+ public double getAllocationScale() {
+ return rate;
+ }
+
+ @Override
+ public void addedRow(int count) throws IOException {
+ rows += count;
+ if (rows % 100 == 0) {
+ callback.checkMemory(rate);
+ }
+ }
+ }
+
+ @Test
+ public void testMemoryManagementV11() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .stripeSize(50000)
+ .bufferSize(100)
+ .rowIndexStride(0)
+ .memory(memory)
+ .version(OrcFile.Version.V_0_11));
+ assertEquals(testFilePath, memory.path);
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1;
+ for(int i=0; i < 2500; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[0] = i * 300;
+ ((BytesColumnVector) batch.cols[1]).setVal(0,
+ Integer.toHexString(10*i).getBytes());
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ assertEquals(null, memory.path);
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ int i = 0;
+ for(StripeInformation stripe: reader.getStripes()) {
+ i += 1;
+ assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
+ stripe.getDataLength() < 5000);
+ }
+ assertEquals(25, i);
+ assertEquals(2500, reader.getNumberOfRows());
+ }
+
+ @Test
+ public void testMemoryManagementV12() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .stripeSize(50000)
+ .bufferSize(100)
+ .rowIndexStride(0)
+ .memory(memory)
+ .version(OrcFile.Version.V_0_12));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ assertEquals(testFilePath, memory.path);
+ batch.size = 1;
+ for(int i=0; i < 2500; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[0] = i * 300;
+ ((BytesColumnVector) batch.cols[1]).setVal(0,
+ Integer.toHexString(10*i).getBytes());
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ assertEquals(null, memory.path);
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ int i = 0;
+ for(StripeInformation stripe: reader.getStripes()) {
+ i += 1;
+ assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
+ stripe.getDataLength() < 5000);
+ }
+    // With HIVE-7832, dictionary encoding is disabled after the first stripe
+    // because there are too many distinct values, so only 3 stripes are
+    // written here compared to 25 stripes with version 0.11 (previous test).
+ assertEquals(3, i);
+ assertEquals(2500, reader.getNumberOfRows());
+ }
+
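The dictionary cutoff described in the comment above is tunable; a minimal
sketch, assuming the hive.exec.orc.dictionary.key.size.threshold property
(the HiveConf knob of this era, default 0.8; the writer drops the dictionary
once the ratio of distinct keys to non-null values exceeds it). The
testFilePath and schema names below are the test fixture's own fields:

    import org.apache.hadoop.conf.Configuration;

    Configuration conf = new Configuration();
    // Give up on dictionary encoding once more than half the values are distinct.
    conf.setDouble("hive.exec.orc.dictionary.key.size.threshold", 0.5);
    Writer writer = OrcFile.createWriter(testFilePath,
        OrcFile.writerOptions(conf).setSchema(schema));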
+ @Test
+ public void testPredicatePushdown() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(400000L)
+ .compress(CompressionKind.NONE)
+ .bufferSize(500)
+ .rowIndexStride(1000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.ensureSize(3500);
+ batch.size = 3500;
+ for(int i=0; i < 3500; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[i] = i * 300;
+ ((BytesColumnVector) batch.cols[1]).setVal(i,
+ Integer.toHexString(10*i).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(3500, reader.getNumberOfRows());
+
+ SearchArgument sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .startNot()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 300000L)
+ .end()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 600000L)
+ .end()
+ .build();
+ RecordReader rows = reader.rows(new Reader.Options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true})
+ .searchArgument(sarg, new String[]{null, "int1", "string1"}));
+ batch = reader.getSchema().createRowBatch(2000);
+ LongColumnVector ints = (LongColumnVector) batch.cols[0];
+ BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
+
+ Assert.assertEquals(1000L, rows.getRowNumber());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+
+ for(int i=1000; i < 2000; ++i) {
+ assertEquals(300 * i, ints.vector[i - 1000]);
+ assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000));
+ }
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ Assert.assertEquals(3500, rows.getRowNumber());
+
+ // look through the file with no rows selected
+ sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 0L)
+ .end()
+ .build();
+ rows = reader.rows(new Reader.Options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true})
+ .searchArgument(sarg, new String[]{null, "int1", "string1"}));
+ Assert.assertEquals(3500L, rows.getRowNumber());
+ assertTrue(!rows.nextBatch(batch));
+
+ // select first 100 and last 100 rows
+ sarg = SearchArgumentFactory.newBuilder()
+ .startOr()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 100)
+ .startNot()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 3400)
+ .end()
+ .end()
+ .build();
+ rows = reader.rows(new Reader.Options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true})
+ .searchArgument(sarg, new String[]{null, "int1", "string1"}));
+ Assert.assertEquals(0, rows.getRowNumber());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+ Assert.assertEquals(3000, rows.getRowNumber());
+ for(int i=0; i < 1000; ++i) {
+ assertEquals(300 * i, ints.vector[i]);
+ assertEquals(Integer.toHexString(10*i), strs.toString(i));
+ }
+
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(500, batch.size);
+ Assert.assertEquals(3500, rows.getRowNumber());
+ for(int i=3000; i < 3500; ++i) {
+ assertEquals(300 * i, ints.vector[i - 3000]);
+ assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000));
+ }
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ Assert.assertEquals(3500, rows.getRowNumber());
+ }
+
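As the cases above demonstrate, a SearchArgument prunes at row-group
granularity (the 1000-row index stride here), never per row: asking for the
first 100 and last 100 rows still surfaces whole groups of 1000 and 500 rows
that the caller must filter further. A minimal annotated sketch of the
builder pattern the test uses:

    // Keep rows with 300000 <= int1 < 600000, expressed as
    // NOT(int1 < 300000) AND (int1 < 600000).
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
          .startNot()
            .lessThan("int1", PredicateLeaf.Type.LONG, 300000L)
          .end()
          .lessThan("int1", PredicateLeaf.Type.LONG, 600000L)
        .end()
        .build();
    // The String[] maps column ids to names; entry 0 is the root struct.
    RecordReader rows = reader.rows(new Reader.Options()
        .searchArgument(sarg, new String[]{null, "int1", "string1"}));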
+ /**
+ * Test all of the types that have distinct ORC writers using the vectorized
+ * writer with different combinations of repeating and null values.
+ * @throws Exception
+ */
+ @Test
<TRUNCATED>
[15/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/orc/src/test/resources/orc-file-dump.json b/orc/src/test/resources/orc-file-dump.json
new file mode 100644
index 0000000..bf654a1
--- /dev/null
+++ b/orc/src/test/resources/orc-file-dump.json
@@ -0,0 +1,1355 @@
+{
+ "fileName": "TestFileDump.testDump.orc",
+ "fileVersion": "0.12",
+ "writerVersion": "HIVE_13083",
+ "numberOfRows": 21000,
+ "compression": "ZLIB",
+ "compressionBufferSize": 4096,
+ "schemaString": "struct<i:int,l:bigint,s:string>",
+ "schema": [
+ {
+ "columnId": 0,
+ "columnType": "STRUCT",
+ "childColumnNames": [
+ "i",
+ "l",
+ "s"
+ ],
+ "childColumnIds": [
+ 1,
+ 2,
+ 3
+ ]
+ },
+ {
+ "columnId": 1,
+ "columnType": "INT"
+ },
+ {
+ "columnId": 2,
+ "columnType": "LONG"
+ },
+ {
+ "columnId": 3,
+ "columnType": "STRING"
+ }
+ ],
+ "stripeStatistics": [
+ {
+ "stripeNumber": 1,
+ "columnStatistics": [
+ {
+ "columnId": 0,
+ "count": 5000,
+ "hasNull": false
+ },
+ {
+ "columnId": 1,
+ "count": 5000,
+ "hasNull": false,
+ "min": -2147115959,
+ "max": 2145210552,
+ "sum": 50111854553,
+ "type": "LONG"
+ },
+ {
+ "columnId": 2,
+ "count": 5000,
+ "hasNull": false,
+ "min": -9223180583305557329,
+ "max": 9221614132680747961,
+ "type": "LONG"
+ },
+ {
+ "columnId": 3,
+ "count": 4950,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 19283,
+ "type": "STRING"
+ }
+ ]
+ },
+ {
+ "stripeNumber": 2,
+ "columnStatistics": [
+ {
+ "columnId": 0,
+ "count": 5000,
+ "hasNull": false
+ },
+ {
+ "columnId": 1,
+ "count": 5000,
+ "hasNull": false,
+ "min": -2147390285,
+ "max": 2147224606,
+ "sum": -22290798217,
+ "type": "LONG"
+ },
+ {
+ "columnId": 2,
+ "count": 5000,
+ "hasNull": false,
+ "min": -9219295160509160427,
+ "max": 9217571024994660020,
+ "type": "LONG"
+ },
+ {
+ "columnId": 3,
+ "count": 4950,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 19397,
+ "type": "STRING"
+ }
+ ]
+ },
+ {
+ "stripeNumber": 3,
+ "columnStatistics": [
+ {
+ "columnId": 0,
+ "count": 5000,
+ "hasNull": false
+ },
+ {
+ "columnId": 1,
+ "count": 5000,
+ "hasNull": false,
+ "min": -2146954065,
+ "max": 2146722468,
+ "sum": 20639652136,
+ "type": "LONG"
+ },
+ {
+ "columnId": 2,
+ "count": 5000,
+ "hasNull": false,
+ "min": -9214076359988107846,
+ "max": 9222919052987871506,
+ "type": "LONG"
+ },
+ {
+ "columnId": 3,
+ "count": 4950,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 19031,
+ "type": "STRING"
+ }
+ ]
+ },
+ {
+ "stripeNumber": 4,
+ "columnStatistics": [
+ {
+ "columnId": 0,
+ "count": 5000,
+ "hasNull": false
+ },
+ {
+ "columnId": 1,
+ "count": 5000,
+ "hasNull": false,
+ "min": -2146969085,
+ "max": 2146025044,
+ "sum": -5156814387,
+ "type": "LONG"
+ },
+ {
+ "columnId": 2,
+ "count": 5000,
+ "hasNull": false,
+ "min": -9222731174895935707,
+ "max": 9220625004936875965,
+ "type": "LONG"
+ },
+ {
+ "columnId": 3,
+ "count": 4950,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 19459,
+ "type": "STRING"
+ }
+ ]
+ },
+ {
+ "stripeNumber": 5,
+ "columnStatistics": [
+ {
+ "columnId": 0,
+ "count": 1000,
+ "hasNull": false
+ },
+ {
+ "columnId": 1,
+ "count": 1000,
+ "hasNull": false,
+ "min": -2144303438,
+ "max": 2127599049,
+ "sum": 62841564778,
+ "type": "LONG"
+ },
+ {
+ "columnId": 2,
+ "count": 1000,
+ "hasNull": false,
+ "min": -9195133638801798919,
+ "max": 9218626063131504414,
+ "type": "LONG"
+ },
+ {
+ "columnId": 3,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3963,
+ "type": "STRING"
+ }
+ ]
+ }
+ ],
+ "fileStatistics": [
+ {
+ "columnId": 0,
+ "count": 21000,
+ "hasNull": false
+ },
+ {
+ "columnId": 1,
+ "count": 21000,
+ "hasNull": false,
+ "min": -2147390285,
+ "max": 2147224606,
+ "sum": 106145458863,
+ "type": "LONG"
+ },
+ {
+ "columnId": 2,
+ "count": 21000,
+ "hasNull": false,
+ "min": -9223180583305557329,
+ "max": 9222919052987871506,
+ "type": "LONG"
+ },
+ {
+ "columnId": 3,
+ "count": 20790,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 81133,
+ "type": "STRING"
+ }
+ ],
+ "stripes": [
+ {
+ "stripeNumber": 1,
+ "stripeInformation": {
+ "offset": 3,
+ "indexLength": 970,
+ "dataLength": 63770,
+ "footerLength": 90,
+ "rowCount": 5000
+ },
+ "streams": [
+ {
+ "columnId": 0,
+ "section": "ROW_INDEX",
+ "startOffset": 3,
+ "length": 17
+ },
+ {
+ "columnId": 1,
+ "section": "ROW_INDEX",
+ "startOffset": 20,
+ "length": 167
+ },
+ {
+ "columnId": 2,
+ "section": "ROW_INDEX",
+ "startOffset": 187,
+ "length": 171
+ },
+ {
+ "columnId": 3,
+ "section": "ROW_INDEX",
+ "startOffset": 358,
+ "length": 103
+ },
+ {
+ "columnId": 3,
+ "section": "BLOOM_FILTER",
+ "startOffset": 461,
+ "length": 512
+ },
+ {
+ "columnId": 1,
+ "section": "DATA",
+ "startOffset": 973,
+ "length": 20035
+ },
+ {
+ "columnId": 2,
+ "section": "DATA",
+ "startOffset": 21008,
+ "length": 40050
+ },
+ {
+ "columnId": 3,
+ "section": "PRESENT",
+ "startOffset": 61058,
+ "length": 17
+ },
+ {
+ "columnId": 3,
+ "section": "DATA",
+ "startOffset": 61075,
+ "length": 3510
+ },
+ {
+ "columnId": 3,
+ "section": "LENGTH",
+ "startOffset": 64585,
+ "length": 25
+ },
+ {
+ "columnId": 3,
+ "section": "DICTIONARY_DATA",
+ "startOffset": 64610,
+ "length": 133
+ }
+ ],
+ "encodings": [
+ {
+ "columnId": 0,
+ "kind": "DIRECT"
+ },
+ {
+ "columnId": 1,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 2,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 3,
+ "kind": "DICTIONARY_V2",
+ "dictionarySize": 35
+ }
+ ],
+ "indexes": [{
+ "columnId": 3,
+ "rowGroupIndexes": [
+ {
+ "entryId": 0,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3873,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ]
+ },
+ {
+ "entryId": 1,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3861,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 38,
+ 12,
+ 0,
+ 0,
+ 736,
+ 23
+ ]
+ },
+ {
+ "entryId": 2,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3946,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 78,
+ 12,
+ 0,
+ 0,
+ 1473,
+ 43
+ ]
+ },
+ {
+ "entryId": 3,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3774,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 118,
+ 12,
+ 0,
+ 0,
+ 2067,
+ 261
+ ]
+ },
+ {
+ "entryId": 4,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3829,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 158,
+ 12,
+ 0,
+ 0,
+ 2992,
+ 35
+ ]
+ }
+ ],
+ "bloomFilterIndexes": [
+ {
+ "entryId": 0,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 1,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 2,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 3,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 4,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }
+ ],
+ "stripeLevelBloomFilter": {
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }
+ }]
+ },
+ {
+ "stripeNumber": 2,
+ "stripeInformation": {
+ "offset": 64833,
+ "indexLength": 961,
+ "dataLength": 63763,
+ "footerLength": 88,
+ "rowCount": 5000
+ },
+ "streams": [
+ {
+ "columnId": 0,
+ "section": "ROW_INDEX",
+ "startOffset": 64833,
+ "length": 17
+ },
+ {
+ "columnId": 1,
+ "section": "ROW_INDEX",
+ "startOffset": 64850,
+ "length": 166
+ },
+ {
+ "columnId": 2,
+ "section": "ROW_INDEX",
+ "startOffset": 65016,
+ "length": 166
+ },
+ {
+ "columnId": 3,
+ "section": "ROW_INDEX",
+ "startOffset": 65182,
+ "length": 100
+ },
+ {
+ "columnId": 3,
+ "section": "BLOOM_FILTER",
+ "startOffset": 65282,
+ "length": 512
+ },
+ {
+ "columnId": 1,
+ "section": "DATA",
+ "startOffset": 65794,
+ "length": 20035
+ },
+ {
+ "columnId": 2,
+ "section": "DATA",
+ "startOffset": 85829,
+ "length": 40050
+ },
+ {
+ "columnId": 3,
+ "section": "PRESENT",
+ "startOffset": 125879,
+ "length": 17
+ },
+ {
+ "columnId": 3,
+ "section": "DATA",
+ "startOffset": 125896,
+ "length": 3503
+ },
+ {
+ "columnId": 3,
+ "section": "LENGTH",
+ "startOffset": 129399,
+ "length": 25
+ },
+ {
+ "columnId": 3,
+ "section": "DICTIONARY_DATA",
+ "startOffset": 129424,
+ "length": 133
+ }
+ ],
+ "encodings": [
+ {
+ "columnId": 0,
+ "kind": "DIRECT"
+ },
+ {
+ "columnId": 1,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 2,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 3,
+ "kind": "DICTIONARY_V2",
+ "dictionarySize": 35
+ }
+ ],
+ "indexes": [{
+ "columnId": 3,
+ "rowGroupIndexes": [
+ {
+ "entryId": 0,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3946,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ]
+ },
+ {
+ "entryId": 1,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3836,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 38,
+ 12,
+ 0,
+ 0,
+ 746,
+ 11
+ ]
+ },
+ {
+ "entryId": 2,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3791,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 78,
+ 12,
+ 0,
+ 0,
+ 1430,
+ 95
+ ]
+ },
+ {
+ "entryId": 3,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3904,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 118,
+ 12,
+ 0,
+ 0,
+ 2239,
+ 23
+ ]
+ },
+ {
+ "entryId": 4,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3920,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 158,
+ 12,
+ 0,
+ 0,
+ 2994,
+ 17
+ ]
+ }
+ ],
+ "bloomFilterIndexes": [
+ {
+ "entryId": 0,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 1,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 2,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 3,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 4,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }
+ ],
+ "stripeLevelBloomFilter": {
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }
+ }]
+ },
+ {
+ "stripeNumber": 3,
+ "stripeInformation": {
+ "offset": 129645,
+ "indexLength": 962,
+ "dataLength": 63770,
+ "footerLength": 91,
+ "rowCount": 5000
+ },
+ "streams": [
+ {
+ "columnId": 0,
+ "section": "ROW_INDEX",
+ "startOffset": 129645,
+ "length": 17
+ },
+ {
+ "columnId": 1,
+ "section": "ROW_INDEX",
+ "startOffset": 129662,
+ "length": 164
+ },
+ {
+ "columnId": 2,
+ "section": "ROW_INDEX",
+ "startOffset": 129826,
+ "length": 167
+ },
+ {
+ "columnId": 3,
+ "section": "ROW_INDEX",
+ "startOffset": 129993,
+ "length": 102
+ },
+ {
+ "columnId": 3,
+ "section": "BLOOM_FILTER",
+ "startOffset": 130095,
+ "length": 512
+ },
+ {
+ "columnId": 1,
+ "section": "DATA",
+ "startOffset": 130607,
+ "length": 20035
+ },
+ {
+ "columnId": 2,
+ "section": "DATA",
+ "startOffset": 150642,
+ "length": 40050
+ },
+ {
+ "columnId": 3,
+ "section": "PRESENT",
+ "startOffset": 190692,
+ "length": 17
+ },
+ {
+ "columnId": 3,
+ "section": "DATA",
+ "startOffset": 190709,
+ "length": 3510
+ },
+ {
+ "columnId": 3,
+ "section": "LENGTH",
+ "startOffset": 194219,
+ "length": 25
+ },
+ {
+ "columnId": 3,
+ "section": "DICTIONARY_DATA",
+ "startOffset": 194244,
+ "length": 133
+ }
+ ],
+ "encodings": [
+ {
+ "columnId": 0,
+ "kind": "DIRECT"
+ },
+ {
+ "columnId": 1,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 2,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 3,
+ "kind": "DICTIONARY_V2",
+ "dictionarySize": 35
+ }
+ ],
+ "indexes": [{
+ "columnId": 3,
+ "rowGroupIndexes": [
+ {
+ "entryId": 0,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3829,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ]
+ },
+ {
+ "entryId": 1,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3853,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 38,
+ 12,
+ 0,
+ 0,
+ 698,
+ 74
+ ]
+ },
+ {
+ "entryId": 2,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3796,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 78,
+ 12,
+ 0,
+ 0,
+ 1483,
+ 39
+ ]
+ },
+ {
+ "entryId": 3,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3736,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 118,
+ 12,
+ 0,
+ 0,
+ 2148,
+ 155
+ ]
+ },
+ {
+ "entryId": 4,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3817,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 158,
+ 12,
+ 0,
+ 0,
+ 3018,
+ 8
+ ]
+ }
+ ],
+ "bloomFilterIndexes": [
+ {
+ "entryId": 0,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 1,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 2,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 3,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 4,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }
+ ],
+ "stripeLevelBloomFilter": {
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }
+ }]
+ },
+ {
+ "stripeNumber": 4,
+ "stripeInformation": {
+ "offset": 194468,
+ "indexLength": 973,
+ "dataLength": 63756,
+ "footerLength": 91,
+ "rowCount": 5000
+ },
+ "streams": [
+ {
+ "columnId": 0,
+ "section": "ROW_INDEX",
+ "startOffset": 194468,
+ "length": 17
+ },
+ {
+ "columnId": 1,
+ "section": "ROW_INDEX",
+ "startOffset": 194485,
+ "length": 166
+ },
+ {
+ "columnId": 2,
+ "section": "ROW_INDEX",
+ "startOffset": 194651,
+ "length": 171
+ },
+ {
+ "columnId": 3,
+ "section": "ROW_INDEX",
+ "startOffset": 194822,
+ "length": 107
+ },
+ {
+ "columnId": 3,
+ "section": "BLOOM_FILTER",
+ "startOffset": 194929,
+ "length": 512
+ },
+ {
+ "columnId": 1,
+ "section": "DATA",
+ "startOffset": 195441,
+ "length": 20035
+ },
+ {
+ "columnId": 2,
+ "section": "DATA",
+ "startOffset": 215476,
+ "length": 40050
+ },
+ {
+ "columnId": 3,
+ "section": "PRESENT",
+ "startOffset": 255526,
+ "length": 17
+ },
+ {
+ "columnId": 3,
+ "section": "DATA",
+ "startOffset": 255543,
+ "length": 3496
+ },
+ {
+ "columnId": 3,
+ "section": "LENGTH",
+ "startOffset": 259039,
+ "length": 25
+ },
+ {
+ "columnId": 3,
+ "section": "DICTIONARY_DATA",
+ "startOffset": 259064,
+ "length": 133
+ }
+ ],
+ "encodings": [
+ {
+ "columnId": 0,
+ "kind": "DIRECT"
+ },
+ {
+ "columnId": 1,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 2,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 3,
+ "kind": "DICTIONARY_V2",
+ "dictionarySize": 35
+ }
+ ],
+ "indexes": [{
+ "columnId": 3,
+ "rowGroupIndexes": [
+ {
+ "entryId": 0,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3959,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ]
+ },
+ {
+ "entryId": 1,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3816,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 38,
+ 12,
+ 0,
+ 0,
+ 495,
+ 338
+ ]
+ },
+ {
+ "entryId": 2,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3883,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 78,
+ 12,
+ 0,
+ 0,
+ 1449,
+ 71
+ ]
+ },
+ {
+ "entryId": 3,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3938,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 118,
+ 12,
+ 0,
+ 0,
+ 2207,
+ 59
+ ]
+ },
+ {
+ "entryId": 4,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3863,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 158,
+ 12,
+ 0,
+ 0,
+ 2838,
+ 223
+ ]
+ }
+ ],
+ "bloomFilterIndexes": [
+ {
+ "entryId": 0,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 1,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 2,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 3,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ },
+ {
+ "entryId": 4,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }
+ ],
+ "stripeLevelBloomFilter": {
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }
+ }]
+ },
+ {
+ "stripeNumber": 5,
+ "stripeInformation": {
+ "offset": 259288,
+ "indexLength": 433,
+ "dataLength": 12943,
+ "footerLength": 83,
+ "rowCount": 1000
+ },
+ "streams": [
+ {
+ "columnId": 0,
+ "section": "ROW_INDEX",
+ "startOffset": 259288,
+ "length": 12
+ },
+ {
+ "columnId": 1,
+ "section": "ROW_INDEX",
+ "startOffset": 259300,
+ "length": 38
+ },
+ {
+ "columnId": 2,
+ "section": "ROW_INDEX",
+ "startOffset": 259338,
+ "length": 41
+ },
+ {
+ "columnId": 3,
+ "section": "ROW_INDEX",
+ "startOffset": 259379,
+ "length": 41
+ },
+ {
+ "columnId": 3,
+ "section": "BLOOM_FILTER",
+ "startOffset": 259420,
+ "length": 301
+ },
+ {
+ "columnId": 1,
+ "section": "DATA",
+ "startOffset": 259721,
+ "length": 4007
+ },
+ {
+ "columnId": 2,
+ "section": "DATA",
+ "startOffset": 263728,
+ "length": 8010
+ },
+ {
+ "columnId": 3,
+ "section": "PRESENT",
+ "startOffset": 271738,
+ "length": 16
+ },
+ {
+ "columnId": 3,
+ "section": "DATA",
+ "startOffset": 271754,
+ "length": 752
+ },
+ {
+ "columnId": 3,
+ "section": "LENGTH",
+ "startOffset": 272506,
+ "length": 25
+ },
+ {
+ "columnId": 3,
+ "section": "DICTIONARY_DATA",
+ "startOffset": 272531,
+ "length": 133
+ }
+ ],
+ "encodings": [
+ {
+ "columnId": 0,
+ "kind": "DIRECT"
+ },
+ {
+ "columnId": 1,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 2,
+ "kind": "DIRECT_V2"
+ },
+ {
+ "columnId": 3,
+ "kind": "DICTIONARY_V2",
+ "dictionarySize": 35
+ }
+ ],
+ "indexes": [{
+ "columnId": 3,
+ "rowGroupIndexes": [{
+ "entryId": 0,
+ "count": 990,
+ "hasNull": true,
+ "min": "Darkness,",
+ "max": "worst",
+ "totalLength": 3963,
+ "type": "STRING",
+ "positions": [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ]
+ }],
+ "bloomFilterIndexes": [{
+ "entryId": 0,
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }],
+ "stripeLevelBloomFilter": {
+ "numHashFunctions": 4,
+ "bitCount": 6272,
+ "popCount": 138,
+ "loadFactor": 0.022002551704645157,
+ "expectedFpp": 2.3436470542037569E-7
+ }
+ }]
+ }
+ ],
+ "fileLength": 273300,
+ "paddingLength": 0,
+ "paddingRatio": 0,
+ "status": "OK"
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/resources/orc-file-dump.out
----------------------------------------------------------------------
diff --git a/orc/src/test/resources/orc-file-dump.out b/orc/src/test/resources/orc-file-dump.out
new file mode 100644
index 0000000..70f7fbd
--- /dev/null
+++ b/orc/src/test/resources/orc-file-dump.out
@@ -0,0 +1,195 @@
+Structure for TestFileDump.testDump.orc
+File Version: 0.12 with HIVE_13083
+Rows: 21000
+Compression: ZLIB
+Compression size: 4096
+Type: struct<i:int,l:bigint,s:string>
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146021688 max: 2147223299 sum: 515792826
+ Column 2: count: 5000 hasNull: false min: -9218592812243954469 max: 9221614132680747961
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19280
+ Stripe 2:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146733128 max: 2147001622 sum: 7673427
+ Column 2: count: 5000 hasNull: false min: -9220818777591257749 max: 9222259462014003839
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19504
+ Stripe 3:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146993718 max: 2147378179 sum: 132660742551
+ Column 2: count: 5000 hasNull: false min: -9218342074710552826 max: 9222303228623055266
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19641
+ Stripe 4:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146658006 max: 2145520931 sum: 8533549236
+ Column 2: count: 5000 hasNull: false min: -9222758097219661129 max: 9221043130193737406
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19470
+ Stripe 5:
+ Column 0: count: 1000 hasNull: false
+ Column 1: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363
+ Column 2: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476
+ Column 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866
+
+File Statistics:
+ Column 0: count: 21000 hasNull: false
+ Column 1: count: 21000 hasNull: false min: -2146993718 max: 2147378179 sum: 193017464403
+ Column 2: count: 21000 hasNull: false min: -9222758097219661129 max: 9222303228623055266
+ Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
+
+Stripes:
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 79 index: 439
+ Stream: column 0 section ROW_INDEX start: 3 length 17
+ Stream: column 1 section ROW_INDEX start: 20 length 166
+ Stream: column 2 section ROW_INDEX start: 186 length 169
+ Stream: column 3 section ROW_INDEX start: 355 length 87
+ Stream: column 1 section DATA start: 442 length 20035
+ Stream: column 2 section DATA start: 20477 length 40050
+ Stream: column 3 section DATA start: 60527 length 3543
+ Stream: column 3 section LENGTH start: 64070 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 64095 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2145365268 max: 2135491313 sum: 7521792925 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -2139452528 max: 2147223299 sum: -12923774313 positions: 0,2050,488
+ Entry 2: count: 1000 hasNull: false min: -2142420586 max: 2143898386 sum: -25521983511 positions: 4099,2054,464
+ Entry 3: count: 1000 hasNull: false min: -2137233441 max: 2144267163 sum: 40993386199 positions: 8198,2058,440
+ Entry 4: count: 1000 hasNull: false min: -2146021688 max: 2146838901 sum: -9553628474 positions: 12297,2062,416
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9200577545527640566 max: 9175500305011173751 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9203618157670445774 max: 9208123824411178101 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9218592812243954469 max: 9221351515892923972 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9206585617947511272 max: 9167703224425685487 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9206645795733282496 max: 9221614132680747961 positions: 28693,14,416
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3862 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3884 positions: 0,659,149
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3893 positions: 0,1531,3
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3798 positions: 0,2281,32
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3843 positions: 0,3033,45
+ Stripe: offset: 64307 data: 63775 rows: 5000 tail: 79 index: 432
+ Stream: column 0 section ROW_INDEX start: 64307 length 17
+ Stream: column 1 section ROW_INDEX start: 64324 length 164
+ Stream: column 2 section ROW_INDEX start: 64488 length 168
+ Stream: column 3 section ROW_INDEX start: 64656 length 83
+ Stream: column 1 section DATA start: 64739 length 20035
+ Stream: column 2 section DATA start: 84774 length 40050
+ Stream: column 3 section DATA start: 124824 length 3532
+ Stream: column 3 section LENGTH start: 128356 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 128381 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2143799121 max: 2145249879 sum: -6966266181 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -2146733128 max: 2147001622 sum: -35930106333 positions: 0,2050,488
+ Entry 2: count: 1000 hasNull: false min: -2144302712 max: 2146299933 sum: 6944230435 positions: 4099,2054,464
+ Entry 3: count: 1000 hasNull: false min: -2145172948 max: 2144335014 sum: -29624404959 positions: 8198,2058,440
+ Entry 4: count: 1000 hasNull: false min: -2146428427 max: 2144067253 sum: 65584220465 positions: 12297,2062,416
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9218450653857701562 max: 9189819526332228512 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9220818777591257749 max: 9178821722829648113 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9220031433030423388 max: 9210838931786956852 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9208195729739635607 max: 9222259462014003839 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9174271499932339698 max: 9212277876771676916 positions: 28693,14,416
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3923 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3869 positions: 0,761,12
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,1472,70
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3931 positions: 0,2250,43
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3964 positions: 0,2978,88
+ Stripe: offset: 128593 data: 63787 rows: 5000 tail: 79 index: 438
+ Stream: column 0 section ROW_INDEX start: 128593 length 17
+ Stream: column 1 section ROW_INDEX start: 128610 length 163
+ Stream: column 2 section ROW_INDEX start: 128773 length 168
+ Stream: column 3 section ROW_INDEX start: 128941 length 90
+ Stream: column 1 section DATA start: 129031 length 20035
+ Stream: column 2 section DATA start: 149066 length 40050
+ Stream: column 3 section DATA start: 189116 length 3544
+ Stream: column 3 section LENGTH start: 192660 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 192685 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2146993718 max: 2144179881 sum: -7829543271 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -2144095505 max: 2144883384 sum: 51623839692 positions: 0,2050,488
+ Entry 2: count: 1000 hasNull: false min: -2144113995 max: 2143773575 sum: 56574412741 positions: 4099,2054,464
+ Entry 3: count: 1000 hasNull: false min: -2146954065 max: 2146794873 sum: 4336083432 positions: 8198,2058,440
+ Entry 4: count: 1000 hasNull: false min: -2135511523 max: 2147378179 sum: 27955949957 positions: 12297,2062,416
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9211978436552246208 max: 9179058898902097152 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9195645160817780503 max: 9189147759444307708 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9202888157616520823 max: 9193561362676960747 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9216318198067839390 max: 9221286760675829363 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9218342074710552826 max: 9222303228623055266 positions: 28693,14,416
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 4008 positions: 0,634,174
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3999 positions: 0,1469,69
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,2133,194
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 4000 positions: 0,3005,43
+ Stripe: offset: 192897 data: 63817 rows: 5000 tail: 79 index: 440
+ Stream: column 0 section ROW_INDEX start: 192897 length 17
+ Stream: column 1 section ROW_INDEX start: 192914 length 165
+ Stream: column 2 section ROW_INDEX start: 193079 length 167
+ Stream: column 3 section ROW_INDEX start: 193246 length 91
+ Stream: column 1 section DATA start: 193337 length 20035
+ Stream: column 2 section DATA start: 213372 length 40050
+ Stream: column 3 section DATA start: 253422 length 3574
+ Stream: column 3 section LENGTH start: 256996 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 257021 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2141355639 max: 2145520931 sum: 2726719912 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -2138324170 max: 2140167376 sum: -23606674002 positions: 0,2050,488
+ Entry 2: count: 1000 hasNull: false min: -2146658006 max: 2144329742 sum: -41530109703 positions: 4099,2054,464
+ Entry 3: count: 1000 hasNull: false min: -2144207593 max: 2139456355 sum: 13559842458 positions: 8198,2058,440
+ Entry 4: count: 1000 hasNull: false min: -2145744719 max: 2145417153 sum: 57383770571 positions: 12297,2062,416
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9222731174895935707 max: 9214167447015056056 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9222758097219661129 max: 9221043130193737406 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9174483776261243438 max: 9208134757538374043 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9174329712613510612 max: 9197412874152820822 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9221162005892422758 max: 9220625004936875965 positions: 28693,14,416
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3901 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3900 positions: 0,431,431
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3909 positions: 0,1485,52
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3947 positions: 0,2196,104
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3813 positions: 0,2934,131
+ Stripe: offset: 257233 data: 12943 rows: 1000 tail: 71 index: 131
+ Stream: column 0 section ROW_INDEX start: 257233 length 12
+ Stream: column 1 section ROW_INDEX start: 257245 length 38
+ Stream: column 2 section ROW_INDEX start: 257283 length 41
+ Stream: column 3 section ROW_INDEX start: 257324 length 40
+ Stream: column 1 section DATA start: 257364 length 4007
+ Stream: column 2 section DATA start: 261371 length 8010
+ Stream: column 3 section DATA start: 269381 length 768
+ Stream: column 3 section LENGTH start: 270149 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 270174 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 1:
+ Entry 0: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363 positions: 0,0,0
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476 positions: 0,0,0
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866 positions: 0,0,0
+
+File length: 270923 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+________________________________________________________________________________________________________________________
+
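Both resource files above are captured output of ORC's file dump tool; as a
sketch, the orcfiledump CLI of this era produces them roughly as follows
(treat the -j/-p flags as an assumption to verify against your build):

    hive --orcfiledump /path/to/TestFileDump.testDump.orc        (text dump, the .out format)
    hive --orcfiledump -j -p /path/to/TestFileDump.testDump.orc  (pretty JSON, the .json format)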
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/resources/orc-file-has-null.out
----------------------------------------------------------------------
diff --git a/orc/src/test/resources/orc-file-has-null.out b/orc/src/test/resources/orc-file-has-null.out
new file mode 100644
index 0000000..e98a73f
--- /dev/null
+++ b/orc/src/test/resources/orc-file-has-null.out
@@ -0,0 +1,112 @@
+Structure for TestOrcFile.testHasNull.orc
+File Version: 0.12 with HIVE_13083
+Rows: 20000
+Compression: ZLIB
+Compression size: 4096
+Type: struct<bytes1:binary,string1:string>
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false sum: 15000
+ Column 2: count: 2000 hasNull: true min: RG1 max: RG3 sum: 6000
+ Stripe 2:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false sum: 15000
+ Column 2: count: 0 hasNull: true
+ Stripe 3:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false sum: 15000
+ Column 2: count: 5000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 40000
+ Stripe 4:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false sum: 15000
+ Column 2: count: 0 hasNull: true
+
+File Statistics:
+ Column 0: count: 20000 hasNull: false
+ Column 1: count: 20000 hasNull: false sum: 60000
+ Column 2: count: 7000 hasNull: true min: RG1 max: STRIPE-3 sum: 46000
+
+Stripes:
+ Stripe: offset: 3 data: 220 rows: 5000 tail: 65 index: 154
+ Stream: column 0 section ROW_INDEX start: 3 length 17
+ Stream: column 1 section ROW_INDEX start: 20 length 60
+ Stream: column 2 section ROW_INDEX start: 80 length 77
+ Stream: column 1 section DATA start: 157 length 159
+ Stream: column 1 section LENGTH start: 316 length 15
+ Stream: column 2 section PRESENT start: 331 length 13
+ Stream: column 2 section DATA start: 344 length 18
+ Stream: column 2 section LENGTH start: 362 length 6
+ Stream: column 2 section DICTIONARY_DATA start: 368 length 9
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DICTIONARY_V2[2]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: RG1 max: RG1 sum: 3000 positions: 0,0,0,0,0,0,0
+ Entry 1: count: 0 hasNull: true positions: 0,0,125,0,0,4,488
+ Entry 2: count: 1000 hasNull: false min: RG3 max: RG3 sum: 3000 positions: 0,2,125,0,0,4,488
+ Entry 3: count: 0 hasNull: true positions: 0,4,125,0,0,12,488
+ Entry 4: count: 0 hasNull: true positions: 0,6,125,0,0,12,488
+ Stripe: offset: 442 data: 185 rows: 5000 tail: 64 index: 116
+ Stream: column 0 section ROW_INDEX start: 442 length 17
+ Stream: column 1 section ROW_INDEX start: 459 length 60
+ Stream: column 2 section ROW_INDEX start: 519 length 39
+ Stream: column 1 section DATA start: 558 length 159
+ Stream: column 1 section LENGTH start: 717 length 15
+ Stream: column 2 section PRESENT start: 732 length 11
+ Stream: column 2 section DATA start: 743 length 0
+ Stream: column 2 section LENGTH start: 743 length 0
+ Stream: column 2 section DICTIONARY_DATA start: 743 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DICTIONARY_V2[0]
+ Row group indices for column 2:
+ Entry 0: count: 0 hasNull: true positions: 0,0,0,0,0,0,0
+ Entry 1: count: 0 hasNull: true positions: 0,0,125,0,0,0,0
+ Entry 2: count: 0 hasNull: true positions: 0,2,120,0,0,0,0
+ Entry 3: count: 0 hasNull: true positions: 0,4,115,0,0,0,0
+ Entry 4: count: 0 hasNull: true positions: 0,6,110,0,0,0,0
+ Stripe: offset: 807 data: 206 rows: 5000 tail: 60 index: 137
+ Stream: column 0 section ROW_INDEX start: 807 length 17
+ Stream: column 1 section ROW_INDEX start: 824 length 60
+ Stream: column 2 section ROW_INDEX start: 884 length 60
+ Stream: column 1 section DATA start: 944 length 159
+ Stream: column 1 section LENGTH start: 1103 length 15
+ Stream: column 2 section DATA start: 1118 length 15
+ Stream: column 2 section LENGTH start: 1133 length 6
+ Stream: column 2 section DICTIONARY_DATA start: 1139 length 11
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DICTIONARY_V2[1]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,4,488
+ Entry 2: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,12,464
+ Entry 3: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,20,440
+ Entry 4: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,28,416
+ Stripe: offset: 1210 data: 185 rows: 5000 tail: 64 index: 116
+ Stream: column 0 section ROW_INDEX start: 1210 length 17
+ Stream: column 1 section ROW_INDEX start: 1227 length 60
+ Stream: column 2 section ROW_INDEX start: 1287 length 39
+ Stream: column 1 section DATA start: 1326 length 159
+ Stream: column 1 section LENGTH start: 1485 length 15
+ Stream: column 2 section PRESENT start: 1500 length 11
+ Stream: column 2 section DATA start: 1511 length 0
+ Stream: column 2 section LENGTH start: 1511 length 0
+ Stream: column 2 section DICTIONARY_DATA start: 1511 length 0
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DICTIONARY_V2[0]
+ Row group indices for column 2:
+ Entry 0: count: 0 hasNull: true positions: 0,0,0,0,0,0,0
+ Entry 1: count: 0 hasNull: true positions: 0,0,125,0,0,0,0
+ Entry 2: count: 0 hasNull: true positions: 0,2,120,0,0,0,0
+ Entry 3: count: 0 hasNull: true positions: 0,4,115,0,0,0,0
+ Entry 4: count: 0 hasNull: true positions: 0,6,110,0,0,0,0
+
+File length: 1823 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+________________________________________________________________________________________________________________________
+
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDecimalToTimestamp.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDecimalToTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDecimalToTimestamp.java
index 6225ade..8963449 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDecimalToTimestamp.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDecimalToTimestamp.java
@@ -20,12 +20,9 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions;
import java.sql.Timestamp;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampUtils;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
/**
* Type cast decimal to timestamp. The decimal value is interpreted
@@ -44,6 +41,7 @@ public class CastDecimalToTimestamp extends FuncDecimalToTimestamp {
@Override
protected void func(TimestampColumnVector outV, DecimalColumnVector inV, int i) {
- outV.set(i, TimestampWritable.decimalToTimestamp(inV.vector[i].getHiveDecimal()));
+ Timestamp timestamp = TimestampUtils.decimalToTimestamp(inV.vector[i].getHiveDecimal());
+ outV.set(i, timestamp);
}
}
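For reference, a minimal self-contained sketch of a decimal-seconds-to-Timestamp conversion in the spirit of the TimestampUtils.decimalToTimestamp call above -- using java.math.BigDecimal in place of HiveDecimal so it runs standalone; the real utility's rounding and overflow handling may differ:

  import java.math.BigDecimal;
  import java.sql.Timestamp;

  public class DecimalToTimestampSketch {
    // Interpret the decimal as seconds since the epoch with a fractional part.
    static Timestamp decimalToTimestamp(BigDecimal secondsWithFraction) {
      long seconds = secondsWithFraction.longValue();       // whole seconds, truncated toward zero
      int nanos = secondsWithFraction
          .subtract(BigDecimal.valueOf(seconds))
          .movePointRight(9)
          .intValue();                                      // fractional part as nanoseconds
      if (nanos < 0) {                                      // normalize negative fractions
        seconds -= 1;
        nanos += 1_000_000_000;
      }
      Timestamp ts = new Timestamp(seconds * 1000);         // millisecond precision first
      ts.setNanos(nanos);                                   // then the full nanosecond field
      return ts;
    }

    public static void main(String[] args) {
      System.out.println(decimalToTimestamp(new BigDecimal("1463779359.123456789")));
    }
  }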
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDoubleToTimestamp.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDoubleToTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDoubleToTimestamp.java
index 31d2f78..07f94f5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDoubleToTimestamp.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastDoubleToTimestamp.java
@@ -18,9 +18,11 @@
package org.apache.hadoop.hive.ql.exec.vector.expressions;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
-import org.apache.hadoop.hive.ql.exec.vector.*;
-import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
public class CastDoubleToTimestamp extends VectorExpression {
private static final long serialVersionUID = 1L;
@@ -40,9 +42,8 @@ public class CastDoubleToTimestamp extends VectorExpression {
private void setDouble(TimestampColumnVector timestampColVector,
double[] vector, int elementNum) {
- TimestampWritable.setTimestampFromDouble(
- timestampColVector.getScratchTimestamp(), vector[elementNum]);
- timestampColVector.setFromScratchTimestamp(elementNum);
+ timestampColVector.set(elementNum,
+ TimestampUtils.doubleToTimestamp(vector[elementNum]));
}
@Override
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToTimestamp.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToTimestamp.java
index a2ee52d..4de95a5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToTimestamp.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastLongToTimestamp.java
@@ -39,9 +39,7 @@ public class CastLongToTimestamp extends VectorExpression {
}
private void setSeconds(TimestampColumnVector timestampColVector, long[] vector, int elementNum) {
- TimestampWritable.setTimestampFromLong(
- timestampColVector.getScratchTimestamp(), vector[elementNum],
- /* intToTimestampInSeconds */ true);
+ timestampColVector.getScratchTimestamp().setTime(vector[elementNum] * 1000);
timestampColVector.setFromScratchTimestamp(elementNum);
}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastMillisecondsLongToTimestamp.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastMillisecondsLongToTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastMillisecondsLongToTimestamp.java
index 01c8810..b1c6b2d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastMillisecondsLongToTimestamp.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CastMillisecondsLongToTimestamp.java
@@ -38,10 +38,9 @@ public class CastMillisecondsLongToTimestamp extends VectorExpression {
super();
}
- private void setMilliseconds(TimestampColumnVector timestampColVector, long[] vector, int elementNum) {
- TimestampWritable.setTimestampFromLong(
- timestampColVector.getScratchTimestamp(), vector[elementNum],
- /* intToTimestampInSeconds */ false);
+ private void setMilliseconds(TimestampColumnVector timestampColVector,
+ long[] vector, int elementNum) {
+ timestampColVector.getScratchTimestamp().setTime(vector[elementNum]);
timestampColVector.setFromScratchTimestamp(elementNum);
}
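These two hunks reduce both casts to a plain setTime call on the scratch Timestamp; the only difference is the unit of the input long. A minimal sketch of the two interpretations:

  import java.sql.Timestamp;

  public class LongToTimestampSketch {
    public static void main(String[] args) {
      long value = 1463779359L;
      // CastLongToTimestamp treats the long as seconds, so it scales by 1000.
      Timestamp fromSeconds = new Timestamp(value * 1000);
      // CastMillisecondsLongToTimestamp passes the value straight through.
      Timestamp fromMillis = new Timestamp(value);
      System.out.println(fromSeconds + " vs " + fromMillis);
    }
  }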
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
deleted file mode 100644
index 90817a5..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
+++ /dev/null
@@ -1,354 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.exec.vector.expressions;
-
-import java.util.Arrays;
-
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-
-/**
- * String expression evaluation helper functions.
- */
-public class StringExpr {
-
- /* Compare two strings from two byte arrays each
- * with their own start position and length.
- * Use lexicographic unsigned byte value order.
- * This is what's used for UTF-8 sort order.
- * Return negative value if arg1 < arg2, 0 if arg1 = arg2,
- * positive if arg1 > arg2.
- */
- public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) {
- for (int i = 0; i < len1 && i < len2; i++) {
- // Note the "& 0xff" is just a way to convert unsigned bytes to signed integer.
- int b1 = arg1[i + start1] & 0xff;
- int b2 = arg2[i + start2] & 0xff;
- if (b1 != b2) {
- return b1 - b2;
- }
- }
- return len1 - len2;
- }
-
- /* Determine if two strings are equal from two byte arrays each
- * with their own start position and length.
- * Use lexicographic unsigned byte value order.
- * This is what's used for UTF-8 sort order.
- */
- public static boolean equal(byte[] arg1, final int start1, final int len1,
- byte[] arg2, final int start2, final int len2) {
- if (len1 != len2) {
- return false;
- }
- if (len1 == 0) {
- return true;
- }
-
- // do bounds check for OOB exception
- if (arg1[start1] != arg2[start2]
- || arg1[start1 + len1 - 1] != arg2[start2 + len2 - 1]) {
- return false;
- }
-
- if (len1 == len2) {
- // prove invariant to the compiler: len1 = len2
- // all array access between (start1, start1+len1)
- // and (start2, start2+len2) are valid
- // no more OOB exceptions are possible
- final int step = 8;
- final int remainder = len1 % step;
- final int wlen = len1 - remainder;
- // suffix first
- for (int i = wlen; i < len1; i++) {
- if (arg1[start1 + i] != arg2[start2 + i]) {
- return false;
- }
- }
- // SIMD loop
- for (int i = 0; i < wlen; i += step) {
- final int s1 = start1 + i;
- final int s2 = start2 + i;
- boolean neq = false;
- for (int j = 0; j < step; j++) {
- neq = (arg1[s1 + j] != arg2[s2 + j]) || neq;
- }
- if (neq) {
- return false;
- }
- }
- }
-
- return true;
- }
-
- public static int characterCount(byte[] bytes) {
- int end = bytes.length;
-
- // count characters
- int j = 0;
- int charCount = 0;
- while(j < end) {
- // UTF-8 continuation bytes have 2 high bits equal to 0x80.
- if ((bytes[j] & 0xc0) != 0x80) {
- ++charCount;
- }
- j++;
- }
- return charCount;
- }
-
- public static int characterCount(byte[] bytes, int start, int length) {
- int end = start + length;
-
- // count characters
- int j = start;
- int charCount = 0;
- while(j < end) {
- // UTF-8 continuation bytes have 2 high bits equal to 0x80.
- if ((bytes[j] & 0xc0) != 0x80) {
- ++charCount;
- }
- j++;
- }
- return charCount;
- }
-
- // A setVal with the same function signature as rightTrim, leftTrim, truncate, etc, below.
- // Useful for class generation via templates.
- public static void assign(BytesColumnVector outV, int i, byte[] bytes, int start, int length) {
- // set output vector
- outV.setVal(i, bytes, start, length);
- }
-
- /*
- * Right trim a slice of a byte array and return the new byte length.
- */
- public static int rightTrim(byte[] bytes, int start, int length) {
- // skip trailing blank characters
- int j = start + length - 1;
- while(j >= start && bytes[j] == 0x20) {
- j--;
- }
-
- return (j - start) + 1;
- }
-
- /*
- * Right trim a slice of a byte array and place the result into element i of a vector.
- */
- public static void rightTrim(BytesColumnVector outV, int i, byte[] bytes, int start, int length) {
- // skip trailing blank characters
- int j = start + length - 1;
- while(j >= start && bytes[j] == 0x20) {
- j--;
- }
-
- // set output vector
- outV.setVal(i, bytes, start, (j - start) + 1);
- }
-
- /*
- * Truncate a slice of a byte array to a maximum number of characters and
- * return the new byte length.
- */
- public static int truncate(byte[] bytes, int start, int length, int maxLength) {
- int end = start + length;
-
- // count characters forward
- int j = start;
- int charCount = 0;
- while(j < end) {
- // UTF-8 continuation bytes have 2 high bits equal to 0x80.
- if ((bytes[j] & 0xc0) != 0x80) {
- if (charCount == maxLength) {
- break;
- }
- ++charCount;
- }
- j++;
- }
- return (j - start);
- }
-
- /*
- * Truncate a slice of a byte array to a maximum number of characters and
- * place the result into element i of a vector.
- */
- public static void truncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) {
- int end = start + length;
-
- // count characters forward
- int j = start;
- int charCount = 0;
- while(j < end) {
- // UTF-8 continuation bytes have 2 high bits equal to 0x80.
- if ((bytes[j] & 0xc0) != 0x80) {
- if (charCount == maxLength) {
- break;
- }
- ++charCount;
- }
- j++;
- }
-
- // set output vector
- outV.setVal(i, bytes, start, (j - start));
- }
-
- /*
- * Truncate a byte array to a maximum number of characters and
- * return a byte array with only truncated bytes.
- */
- public static byte[] truncateScalar(byte[] bytes, int maxLength) {
- int end = bytes.length;
-
- // count characters forward
- int j = 0;
- int charCount = 0;
- while(j < end) {
- // UTF-8 continuation bytes have 2 high bits equal to 0x80.
- if ((bytes[j] & 0xc0) != 0x80) {
- if (charCount == maxLength) {
- break;
- }
- ++charCount;
- }
- j++;
- }
- if (j == end) {
- return bytes;
- } else {
- return Arrays.copyOf(bytes, j);
- }
- }
-
- /*
- * Right trim and truncate a slice of a byte array to a maximum number of characters and
- * return the new byte length.
- */
- public static int rightTrimAndTruncate(byte[] bytes, int start, int length, int maxLength) {
- int end = start + length;
-
- // count characters forward and watch for final run of pads
- int j = start;
- int charCount = 0;
- int padRunStart = -1;
- while(j < end) {
- // UTF-8 continuation bytes have 2 high bits equal to 0x80.
- if ((bytes[j] & 0xc0) != 0x80) {
- if (charCount == maxLength) {
- break;
- }
- if (bytes[j] == 0x20) {
- if (padRunStart == -1) {
- padRunStart = j;
- }
- } else {
- padRunStart = -1;
- }
- ++charCount;
- } else {
- padRunStart = -1;
- }
- j++;
- }
- if (padRunStart != -1) {
- return (padRunStart - start);
- } else {
- return (j - start);
- }
- }
-
- /*
- * Right trim and truncate a slice of a byte array to a maximum number of characters and
- * place the result into element i of a vector.
- */
- public static void rightTrimAndTruncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) {
- int end = start + length;
-
- // count characters forward and watch for final run of pads
- int j = start;
- int charCount = 0;
- int padRunStart = -1;
- while(j < end) {
- // UTF-8 continuation bytes have 2 high bits equal to 0x80.
- if ((bytes[j] & 0xc0) != 0x80) {
- if (charCount == maxLength) {
- break;
- }
- if (bytes[j] == 0x20) {
- if (padRunStart == -1) {
- padRunStart = j;
- }
- } else {
- padRunStart = -1;
- }
- ++charCount;
- } else {
- padRunStart = -1;
- }
- j++;
- }
- // set output vector
- if (padRunStart != -1) {
- outV.setVal(i, bytes, start, (padRunStart - start));
- } else {
- outV.setVal(i, bytes, start, (j - start) );
- }
- }
-
- /*
- * Right trim and truncate a byte array to a maximum number of characters and
- * return a byte array with only the trimmed and truncated bytes.
- */
- public static byte[] rightTrimAndTruncateScalar(byte[] bytes, int maxLength) {
- int end = bytes.length;
-
- // count characters forward and watch for final run of pads
- int j = 0;
- int charCount = 0;
- int padRunStart = -1;
- while(j < end) {
- // UTF-8 continuation bytes have 2 high bits equal to 0x80.
- if ((bytes[j] & 0xc0) != 0x80) {
- if (charCount == maxLength) {
- break;
- }
- if (bytes[j] == 0x20) {
- if (padRunStart == -1) {
- padRunStart = j;
- }
- } else {
- padRunStart = -1;
- }
- ++charCount;
- } else {
- padRunStart = -1;
- }
- j++;
- }
- if (padRunStart != -1) {
- return Arrays.copyOf(bytes, padRunStart);
- } else if (j == end) {
- return bytes;
- } else {
- return Arrays.copyOf(bytes, j);
- }
- }
-}
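The deleted class's core contract is lexicographic comparison in unsigned byte order, which matches UTF-8 sort order. A stand-alone illustration of that ordering (not Hive code, just the same loop over two byte slices):

  import java.nio.charset.StandardCharsets;

  public class UnsignedCompareSketch {
    static int compare(byte[] a, int s1, int l1, byte[] b, int s2, int l2) {
      for (int i = 0; i < l1 && i < l2; i++) {
        int b1 = a[s1 + i] & 0xff;  // & 0xff treats each byte as unsigned 0..255
        int b2 = b[s2 + i] & 0xff;
        if (b1 != b2) {
          return b1 - b2;
        }
      }
      return l1 - l2;               // equal prefix: the shorter slice sorts first
    }

    public static void main(String[] args) {
      byte[] x = "caf\u00e9".getBytes(StandardCharsets.UTF_8);  // 'é' encodes as 0xC3 0xA9
      byte[] y = "cafz".getBytes(StandardCharsets.UTF_8);
      // 0xC3 is 195 unsigned, 'z' is 122, so "café" sorts after "cafz".
      System.out.println(compare(x, 0, x.length, y, 0, y.length) > 0);  // true
    }
  }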
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/hooks/PostExecOrcFileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/hooks/PostExecOrcFileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/hooks/PostExecOrcFileDump.java
index d5d1370..e184fcb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/hooks/PostExecOrcFileDump.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/hooks/PostExecOrcFileDump.java
@@ -30,8 +30,8 @@ import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.exec.FetchTask;
-import org.apache.hadoop.hive.ql.io.FileFormatException;
-import org.apache.hadoop.hive.ql.io.orc.FileDump;
+import org.apache.orc.FileFormatException;
+import org.apache.orc.tools.FileDump;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.session.SessionState;
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java b/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java
deleted file mode 100644
index 12417aa..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/FileFormatException.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io;
-
-import java.io.IOException;
-
-/**
- * Thrown when an invalid file format is encountered.
- */
-public class FileFormatException extends IOException {
-
- public FileFormatException(String errMsg) {
- super(errMsg);
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java b/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java
new file mode 100644
index 0000000..289a86e
--- /dev/null
+++ b/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+public class TestOrcWideTable {
+
+ @Test
+ public void testBufferSizeFor1Col() throws IOException {
+ assertEquals(128 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 1, 128*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor50Col() throws IOException {
+ assertEquals(256 * 1024, WriterImpl.getEstimatedBufferSize(256 * 1024 * 1024,
+ 50, 256*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor1000Col() throws IOException {
+ assertEquals(32 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 1000, 128*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor2000Col() throws IOException {
+ assertEquals(16 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 2000, 256*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor4000Col() throws IOException {
+ assertEquals(8 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 4000, 256*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor25000Col() throws IOException {
+ assertEquals(4 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 25000, 256*1024));
+ }
+}
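All six assertions are consistent with one simple heuristic, reconstructed here from the test data alone (not necessarily the exact WriterImpl logic): budget the stripe across roughly 20 streams' worth of buffering per column, clamp to [4 KiB, maxBuffer], and round up to a power of two:

  public class BufferSizeSketch {
    static int estimate(long stripeSize, int columns, int maxBuffer) {
      long est = stripeSize / (20L * columns);              // rough per-column budget
      est = Math.max(4 * 1024, Math.min(est, maxBuffer));   // clamp to [4 KiB, maxBuffer]
      return Integer.highestOneBit((int) est - 1) << 1;     // round up to a power of two
    }

    public static void main(String[] args) {
      // Matches the 1000-column assertion above: prints 32768.
      System.out.println(estimate(512L * 1024 * 1024, 1000, 128 * 1024));
    }
  }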
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/impl/TestRLEv2.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestRLEv2.java b/orc/src/test/org/apache/orc/impl/TestRLEv2.java
new file mode 100644
index 0000000..e139619
--- /dev/null
+++ b/orc/src/test/org/apache/orc/impl/TestRLEv2.java
@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.tools.FileDump;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+public class TestRLEv2 {
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+ Path testFilePath;
+ Configuration conf;
+ FileSystem fs;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem () throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestRLEv2." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ void appendInt(VectorizedRowBatch batch, int i) {
+ ((LongColumnVector) batch.cols[0]).vector[batch.size++] = i;
+ }
+
+ @Test
+ public void testFixedDeltaZero() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, 123);
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 123,
+ // zigzag encoded varint) and 1 byte delta (delta = 0). In total, 5 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testFixedDeltaOne() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, i % 512);
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
+ // and 1 byte delta (delta = 1). In total, 4 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 40"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testFixedDeltaOneDescending() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, 512 - (i % 512));
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 512, zigzag encoded varint)
+ // and 1 byte delta (delta = 1). In total, 5 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testFixedDeltaLarge() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, i % 512 + ((i % 512) * 100));
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
+ // and 2 bytes delta (delta = 100, zigzag encoded varint). In total, 5 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testFixedDeltaLargeDescending() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, (512 - i % 512) + ((i % 512) * 100));
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes base (base = 512, zigzag encoded varint), 2 bytes header
+ // and 2 bytes delta (delta = 100, zigzag encoded varint). In total, 6 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 60"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testShortRepeat() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5; ++i) {
+ appendInt(batch, 10);
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 1 byte header + 1 byte value
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 2"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testDeltaUnknownSign() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ appendInt(batch, 0);
+ for (int i = 0; i < 511; ++i) {
+ appendInt(batch, i);
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // Monotonicity is undetermined for the sequence 0,0,1,2,3,...,510, so DIRECT encoding
+ // is used: 2 bytes of header and 640 bytes of data (512 values at a fixed width of
+ // 10 bits each, 5120/8 = 640). Total: 642 bytes.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 642"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testPatchedBase() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+
+ Random rand = new Random(123);
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ appendInt(batch, 10000000);
+ for (int i = 0; i < 511; ++i) {
+ appendInt(batch, rand.nextInt(i+1));
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // use PATCHED_BASE encoding
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 583"));
+ System.setOut(origOut);
+ }
+}
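A quick arithmetic check of the byte accounting in the comments above: each FIXED_DELTA run covers at most 512 values, so the 5120-value batches need 10 runs, and the asserted stream lengths are just runs times bytes-per-run:

  public class RleSizeSketch {
    public static void main(String[] args) {
      int runs = 5120 / 512;                   // 10 runs per test
      System.out.println(runs * (2 + 2 + 1));  // delta-zero case: 50 bytes
      System.out.println(runs * (2 + 1 + 1));  // delta-one case: 40 bytes
      System.out.println(runs * (2 + 2 + 2));  // large descending case: 60 bytes
    }
  }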
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/impl/TestReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestReaderImpl.java b/orc/src/test/org/apache/orc/impl/TestReaderImpl.java
new file mode 100644
index 0000000..23d0dab
--- /dev/null
+++ b/orc/src/test/org/apache/orc/impl/TestReaderImpl.java
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2016 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.fs.Seekable;
+import org.apache.orc.FileFormatException;
+import org.apache.hadoop.io.Text;
+import org.apache.orc.OrcFile;
+import org.junit.Test;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.rules.ExpectedException;
+
+public class TestReaderImpl {
+
+ @Rule
+ public ExpectedException thrown = ExpectedException.none();
+
+ private final Path path = new Path("test-file.orc");
+ private FSDataInputStream in;
+ private int psLen;
+ private ByteBuffer buffer;
+
+ @Before
+ public void setup() {
+ in = null;
+ }
+
+ @Test
+ public void testEnsureOrcFooterSmallTextFile() throws IOException {
+ prepareTestCase("1".getBytes());
+ thrown.expect(FileFormatException.class);
+ ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+ }
+
+ @Test
+ public void testEnsureOrcFooterLargeTextFile() throws IOException {
+ prepareTestCase("This is Some Text File".getBytes());
+ thrown.expect(FileFormatException.class);
+ ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+ }
+
+ @Test
+ public void testEnsureOrcFooter011ORCFile() throws IOException {
+ prepareTestCase(composeContent(OrcFile.MAGIC, "FOOTER"));
+ ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+ }
+
+ @Test
+ public void testEnsureOrcFooterCorrectORCFooter() throws IOException {
+ prepareTestCase(composeContent("", OrcFile.MAGIC));
+ ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+ }
+
+ private void prepareTestCase(byte[] bytes) {
+ buffer = ByteBuffer.wrap(bytes);
+ psLen = buffer.get(bytes.length - 1) & 0xff;
+ in = new FSDataInputStream(new SeekableByteArrayInputStream(bytes));
+ }
+
+ private byte[] composeContent(String headerStr, String footerStr) throws CharacterCodingException {
+ ByteBuffer header = Text.encode(headerStr);
+ ByteBuffer footer = Text.encode(footerStr);
+ int headerLen = header.remaining();
+ int footerLen = footer.remaining() + 1;
+
+ ByteBuffer buf = ByteBuffer.allocate(headerLen + footerLen);
+
+ buf.put(header);
+ buf.put(footer);
+ buf.put((byte) footerLen);
+ return buf.array();
+ }
+
+ private static final class SeekableByteArrayInputStream extends ByteArrayInputStream
+ implements Seekable, PositionedReadable {
+
+ public SeekableByteArrayInputStream(byte[] buf) {
+ super(buf);
+ }
+
+ @Override
+ public void seek(long pos) throws IOException {
+ this.reset();
+ this.skip(pos);
+ }
+
+ @Override
+ public long getPos() throws IOException {
+ return pos;
+ }
+
+ @Override
+ public boolean seekToNewSource(long targetPos) throws IOException {
+ return false;
+ }
+
+ @Override
+ public int read(long position, byte[] buffer, int offset, int length)
+ throws IOException {
+ long oldPos = getPos();
+ int nread = -1;
+ try {
+ seek(position);
+ nread = read(buffer, offset, length);
+ } finally {
+ seek(oldPos);
+ }
+ return nread;
+ }
+
+ @Override
+ public void readFully(long position, byte[] buffer, int offset, int length)
+ throws IOException {
+ int nread = 0;
+ while (nread < length) {
+ int nbytes = read(position + nread, buffer, offset + nread, length - nread);
+ if (nbytes < 0) {
+ throw new EOFException("End of file reached before reading fully.");
+ }
+ nread += nbytes;
+ }
+ }
+
+ @Override
+ public void readFully(long position, byte[] buffer)
+ throws IOException {
+ readFully(position, buffer, 0, buffer.length);
+ }
+ }
+}
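The reusable piece in this test is SeekableByteArrayInputStream: implementing Seekable and PositionedReadable is what lets a plain byte[] stand in for an HDFS stream. A minimal usage fragment (it would have to live inside TestReaderImpl, since the inner class is private):

  // Any byte[] becomes a positioned-readable FSDataInputStream, no filesystem needed.
  byte[] data = "ORC".getBytes(java.nio.charset.StandardCharsets.UTF_8);
  FSDataInputStream stream = new FSDataInputStream(new SeekableByteArrayInputStream(data));
  byte[] out = new byte[3];
  stream.readFully(0, out);  // PositionedReadable: absolute-offset read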
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/ql/src/test/resources/orc-file-dump.json b/ql/src/test/resources/orc-file-dump.json
deleted file mode 100644
index bf654a1..0000000
--- a/ql/src/test/resources/orc-file-dump.json
+++ /dev/null
@@ -1,1355 +0,0 @@
-{
- "fileName": "TestFileDump.testDump.orc",
- "fileVersion": "0.12",
- "writerVersion": "HIVE_13083",
- "numberOfRows": 21000,
- "compression": "ZLIB",
- "compressionBufferSize": 4096,
- "schemaString": "struct<i:int,l:bigint,s:string>",
- "schema": [
- {
- "columnId": 0,
- "columnType": "STRUCT",
- "childColumnNames": [
- "i",
- "l",
- "s"
- ],
- "childColumnIds": [
- 1,
- 2,
- 3
- ]
- },
- {
- "columnId": 1,
- "columnType": "INT"
- },
- {
- "columnId": 2,
- "columnType": "LONG"
- },
- {
- "columnId": 3,
- "columnType": "STRING"
- }
- ],
- "stripeStatistics": [
- {
- "stripeNumber": 1,
- "columnStatistics": [
- {
- "columnId": 0,
- "count": 5000,
- "hasNull": false
- },
- {
- "columnId": 1,
- "count": 5000,
- "hasNull": false,
- "min": -2147115959,
- "max": 2145210552,
- "sum": 50111854553,
- "type": "LONG"
- },
- {
- "columnId": 2,
- "count": 5000,
- "hasNull": false,
- "min": -9223180583305557329,
- "max": 9221614132680747961,
- "type": "LONG"
- },
- {
- "columnId": 3,
- "count": 4950,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 19283,
- "type": "STRING"
- }
- ]
- },
- {
- "stripeNumber": 2,
- "columnStatistics": [
- {
- "columnId": 0,
- "count": 5000,
- "hasNull": false
- },
- {
- "columnId": 1,
- "count": 5000,
- "hasNull": false,
- "min": -2147390285,
- "max": 2147224606,
- "sum": -22290798217,
- "type": "LONG"
- },
- {
- "columnId": 2,
- "count": 5000,
- "hasNull": false,
- "min": -9219295160509160427,
- "max": 9217571024994660020,
- "type": "LONG"
- },
- {
- "columnId": 3,
- "count": 4950,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 19397,
- "type": "STRING"
- }
- ]
- },
- {
- "stripeNumber": 3,
- "columnStatistics": [
- {
- "columnId": 0,
- "count": 5000,
- "hasNull": false
- },
- {
- "columnId": 1,
- "count": 5000,
- "hasNull": false,
- "min": -2146954065,
- "max": 2146722468,
- "sum": 20639652136,
- "type": "LONG"
- },
- {
- "columnId": 2,
- "count": 5000,
- "hasNull": false,
- "min": -9214076359988107846,
- "max": 9222919052987871506,
- "type": "LONG"
- },
- {
- "columnId": 3,
- "count": 4950,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 19031,
- "type": "STRING"
- }
- ]
- },
- {
- "stripeNumber": 4,
- "columnStatistics": [
- {
- "columnId": 0,
- "count": 5000,
- "hasNull": false
- },
- {
- "columnId": 1,
- "count": 5000,
- "hasNull": false,
- "min": -2146969085,
- "max": 2146025044,
- "sum": -5156814387,
- "type": "LONG"
- },
- {
- "columnId": 2,
- "count": 5000,
- "hasNull": false,
- "min": -9222731174895935707,
- "max": 9220625004936875965,
- "type": "LONG"
- },
- {
- "columnId": 3,
- "count": 4950,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 19459,
- "type": "STRING"
- }
- ]
- },
- {
- "stripeNumber": 5,
- "columnStatistics": [
- {
- "columnId": 0,
- "count": 1000,
- "hasNull": false
- },
- {
- "columnId": 1,
- "count": 1000,
- "hasNull": false,
- "min": -2144303438,
- "max": 2127599049,
- "sum": 62841564778,
- "type": "LONG"
- },
- {
- "columnId": 2,
- "count": 1000,
- "hasNull": false,
- "min": -9195133638801798919,
- "max": 9218626063131504414,
- "type": "LONG"
- },
- {
- "columnId": 3,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3963,
- "type": "STRING"
- }
- ]
- }
- ],
- "fileStatistics": [
- {
- "columnId": 0,
- "count": 21000,
- "hasNull": false
- },
- {
- "columnId": 1,
- "count": 21000,
- "hasNull": false,
- "min": -2147390285,
- "max": 2147224606,
- "sum": 106145458863,
- "type": "LONG"
- },
- {
- "columnId": 2,
- "count": 21000,
- "hasNull": false,
- "min": -9223180583305557329,
- "max": 9222919052987871506,
- "type": "LONG"
- },
- {
- "columnId": 3,
- "count": 20790,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 81133,
- "type": "STRING"
- }
- ],
- "stripes": [
- {
- "stripeNumber": 1,
- "stripeInformation": {
- "offset": 3,
- "indexLength": 970,
- "dataLength": 63770,
- "footerLength": 90,
- "rowCount": 5000
- },
- "streams": [
- {
- "columnId": 0,
- "section": "ROW_INDEX",
- "startOffset": 3,
- "length": 17
- },
- {
- "columnId": 1,
- "section": "ROW_INDEX",
- "startOffset": 20,
- "length": 167
- },
- {
- "columnId": 2,
- "section": "ROW_INDEX",
- "startOffset": 187,
- "length": 171
- },
- {
- "columnId": 3,
- "section": "ROW_INDEX",
- "startOffset": 358,
- "length": 103
- },
- {
- "columnId": 3,
- "section": "BLOOM_FILTER",
- "startOffset": 461,
- "length": 512
- },
- {
- "columnId": 1,
- "section": "DATA",
- "startOffset": 973,
- "length": 20035
- },
- {
- "columnId": 2,
- "section": "DATA",
- "startOffset": 21008,
- "length": 40050
- },
- {
- "columnId": 3,
- "section": "PRESENT",
- "startOffset": 61058,
- "length": 17
- },
- {
- "columnId": 3,
- "section": "DATA",
- "startOffset": 61075,
- "length": 3510
- },
- {
- "columnId": 3,
- "section": "LENGTH",
- "startOffset": 64585,
- "length": 25
- },
- {
- "columnId": 3,
- "section": "DICTIONARY_DATA",
- "startOffset": 64610,
- "length": 133
- }
- ],
- "encodings": [
- {
- "columnId": 0,
- "kind": "DIRECT"
- },
- {
- "columnId": 1,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 2,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 3,
- "kind": "DICTIONARY_V2",
- "dictionarySize": 35
- }
- ],
- "indexes": [{
- "columnId": 3,
- "rowGroupIndexes": [
- {
- "entryId": 0,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3873,
- "type": "STRING",
- "positions": [
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0
- ]
- },
- {
- "entryId": 1,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3861,
- "type": "STRING",
- "positions": [
- 0,
- 38,
- 12,
- 0,
- 0,
- 736,
- 23
- ]
- },
- {
- "entryId": 2,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3946,
- "type": "STRING",
- "positions": [
- 0,
- 78,
- 12,
- 0,
- 0,
- 1473,
- 43
- ]
- },
- {
- "entryId": 3,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3774,
- "type": "STRING",
- "positions": [
- 0,
- 118,
- 12,
- 0,
- 0,
- 2067,
- 261
- ]
- },
- {
- "entryId": 4,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3829,
- "type": "STRING",
- "positions": [
- 0,
- 158,
- 12,
- 0,
- 0,
- 2992,
- 35
- ]
- }
- ],
- "bloomFilterIndexes": [
- {
- "entryId": 0,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 1,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 2,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 3,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 4,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }
- ],
- "stripeLevelBloomFilter": {
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }
- }]
- },
- {
- "stripeNumber": 2,
- "stripeInformation": {
- "offset": 64833,
- "indexLength": 961,
- "dataLength": 63763,
- "footerLength": 88,
- "rowCount": 5000
- },
- "streams": [
- {
- "columnId": 0,
- "section": "ROW_INDEX",
- "startOffset": 64833,
- "length": 17
- },
- {
- "columnId": 1,
- "section": "ROW_INDEX",
- "startOffset": 64850,
- "length": 166
- },
- {
- "columnId": 2,
- "section": "ROW_INDEX",
- "startOffset": 65016,
- "length": 166
- },
- {
- "columnId": 3,
- "section": "ROW_INDEX",
- "startOffset": 65182,
- "length": 100
- },
- {
- "columnId": 3,
- "section": "BLOOM_FILTER",
- "startOffset": 65282,
- "length": 512
- },
- {
- "columnId": 1,
- "section": "DATA",
- "startOffset": 65794,
- "length": 20035
- },
- {
- "columnId": 2,
- "section": "DATA",
- "startOffset": 85829,
- "length": 40050
- },
- {
- "columnId": 3,
- "section": "PRESENT",
- "startOffset": 125879,
- "length": 17
- },
- {
- "columnId": 3,
- "section": "DATA",
- "startOffset": 125896,
- "length": 3503
- },
- {
- "columnId": 3,
- "section": "LENGTH",
- "startOffset": 129399,
- "length": 25
- },
- {
- "columnId": 3,
- "section": "DICTIONARY_DATA",
- "startOffset": 129424,
- "length": 133
- }
- ],
- "encodings": [
- {
- "columnId": 0,
- "kind": "DIRECT"
- },
- {
- "columnId": 1,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 2,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 3,
- "kind": "DICTIONARY_V2",
- "dictionarySize": 35
- }
- ],
- "indexes": [{
- "columnId": 3,
- "rowGroupIndexes": [
- {
- "entryId": 0,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3946,
- "type": "STRING",
- "positions": [
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0
- ]
- },
- {
- "entryId": 1,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3836,
- "type": "STRING",
- "positions": [
- 0,
- 38,
- 12,
- 0,
- 0,
- 746,
- 11
- ]
- },
- {
- "entryId": 2,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3791,
- "type": "STRING",
- "positions": [
- 0,
- 78,
- 12,
- 0,
- 0,
- 1430,
- 95
- ]
- },
- {
- "entryId": 3,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3904,
- "type": "STRING",
- "positions": [
- 0,
- 118,
- 12,
- 0,
- 0,
- 2239,
- 23
- ]
- },
- {
- "entryId": 4,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3920,
- "type": "STRING",
- "positions": [
- 0,
- 158,
- 12,
- 0,
- 0,
- 2994,
- 17
- ]
- }
- ],
- "bloomFilterIndexes": [
- {
- "entryId": 0,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 1,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 2,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 3,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 4,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }
- ],
- "stripeLevelBloomFilter": {
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }
- }]
- },
- {
- "stripeNumber": 3,
- "stripeInformation": {
- "offset": 129645,
- "indexLength": 962,
- "dataLength": 63770,
- "footerLength": 91,
- "rowCount": 5000
- },
- "streams": [
- {
- "columnId": 0,
- "section": "ROW_INDEX",
- "startOffset": 129645,
- "length": 17
- },
- {
- "columnId": 1,
- "section": "ROW_INDEX",
- "startOffset": 129662,
- "length": 164
- },
- {
- "columnId": 2,
- "section": "ROW_INDEX",
- "startOffset": 129826,
- "length": 167
- },
- {
- "columnId": 3,
- "section": "ROW_INDEX",
- "startOffset": 129993,
- "length": 102
- },
- {
- "columnId": 3,
- "section": "BLOOM_FILTER",
- "startOffset": 130095,
- "length": 512
- },
- {
- "columnId": 1,
- "section": "DATA",
- "startOffset": 130607,
- "length": 20035
- },
- {
- "columnId": 2,
- "section": "DATA",
- "startOffset": 150642,
- "length": 40050
- },
- {
- "columnId": 3,
- "section": "PRESENT",
- "startOffset": 190692,
- "length": 17
- },
- {
- "columnId": 3,
- "section": "DATA",
- "startOffset": 190709,
- "length": 3510
- },
- {
- "columnId": 3,
- "section": "LENGTH",
- "startOffset": 194219,
- "length": 25
- },
- {
- "columnId": 3,
- "section": "DICTIONARY_DATA",
- "startOffset": 194244,
- "length": 133
- }
- ],
- "encodings": [
- {
- "columnId": 0,
- "kind": "DIRECT"
- },
- {
- "columnId": 1,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 2,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 3,
- "kind": "DICTIONARY_V2",
- "dictionarySize": 35
- }
- ],
- "indexes": [{
- "columnId": 3,
- "rowGroupIndexes": [
- {
- "entryId": 0,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3829,
- "type": "STRING",
- "positions": [
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0
- ]
- },
- {
- "entryId": 1,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3853,
- "type": "STRING",
- "positions": [
- 0,
- 38,
- 12,
- 0,
- 0,
- 698,
- 74
- ]
- },
- {
- "entryId": 2,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3796,
- "type": "STRING",
- "positions": [
- 0,
- 78,
- 12,
- 0,
- 0,
- 1483,
- 39
- ]
- },
- {
- "entryId": 3,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3736,
- "type": "STRING",
- "positions": [
- 0,
- 118,
- 12,
- 0,
- 0,
- 2148,
- 155
- ]
- },
- {
- "entryId": 4,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3817,
- "type": "STRING",
- "positions": [
- 0,
- 158,
- 12,
- 0,
- 0,
- 3018,
- 8
- ]
- }
- ],
- "bloomFilterIndexes": [
- {
- "entryId": 0,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 1,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 2,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 3,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 4,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }
- ],
- "stripeLevelBloomFilter": {
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }
- }]
- },
- {
- "stripeNumber": 4,
- "stripeInformation": {
- "offset": 194468,
- "indexLength": 973,
- "dataLength": 63756,
- "footerLength": 91,
- "rowCount": 5000
- },
- "streams": [
- {
- "columnId": 0,
- "section": "ROW_INDEX",
- "startOffset": 194468,
- "length": 17
- },
- {
- "columnId": 1,
- "section": "ROW_INDEX",
- "startOffset": 194485,
- "length": 166
- },
- {
- "columnId": 2,
- "section": "ROW_INDEX",
- "startOffset": 194651,
- "length": 171
- },
- {
- "columnId": 3,
- "section": "ROW_INDEX",
- "startOffset": 194822,
- "length": 107
- },
- {
- "columnId": 3,
- "section": "BLOOM_FILTER",
- "startOffset": 194929,
- "length": 512
- },
- {
- "columnId": 1,
- "section": "DATA",
- "startOffset": 195441,
- "length": 20035
- },
- {
- "columnId": 2,
- "section": "DATA",
- "startOffset": 215476,
- "length": 40050
- },
- {
- "columnId": 3,
- "section": "PRESENT",
- "startOffset": 255526,
- "length": 17
- },
- {
- "columnId": 3,
- "section": "DATA",
- "startOffset": 255543,
- "length": 3496
- },
- {
- "columnId": 3,
- "section": "LENGTH",
- "startOffset": 259039,
- "length": 25
- },
- {
- "columnId": 3,
- "section": "DICTIONARY_DATA",
- "startOffset": 259064,
- "length": 133
- }
- ],
- "encodings": [
- {
- "columnId": 0,
- "kind": "DIRECT"
- },
- {
- "columnId": 1,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 2,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 3,
- "kind": "DICTIONARY_V2",
- "dictionarySize": 35
- }
- ],
- "indexes": [{
- "columnId": 3,
- "rowGroupIndexes": [
- {
- "entryId": 0,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3959,
- "type": "STRING",
- "positions": [
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0
- ]
- },
- {
- "entryId": 1,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3816,
- "type": "STRING",
- "positions": [
- 0,
- 38,
- 12,
- 0,
- 0,
- 495,
- 338
- ]
- },
- {
- "entryId": 2,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3883,
- "type": "STRING",
- "positions": [
- 0,
- 78,
- 12,
- 0,
- 0,
- 1449,
- 71
- ]
- },
- {
- "entryId": 3,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3938,
- "type": "STRING",
- "positions": [
- 0,
- 118,
- 12,
- 0,
- 0,
- 2207,
- 59
- ]
- },
- {
- "entryId": 4,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3863,
- "type": "STRING",
- "positions": [
- 0,
- 158,
- 12,
- 0,
- 0,
- 2838,
- 223
- ]
- }
- ],
- "bloomFilterIndexes": [
- {
- "entryId": 0,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 1,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 2,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 3,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- },
- {
- "entryId": 4,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }
- ],
- "stripeLevelBloomFilter": {
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }
- }]
- },
- {
- "stripeNumber": 5,
- "stripeInformation": {
- "offset": 259288,
- "indexLength": 433,
- "dataLength": 12943,
- "footerLength": 83,
- "rowCount": 1000
- },
- "streams": [
- {
- "columnId": 0,
- "section": "ROW_INDEX",
- "startOffset": 259288,
- "length": 12
- },
- {
- "columnId": 1,
- "section": "ROW_INDEX",
- "startOffset": 259300,
- "length": 38
- },
- {
- "columnId": 2,
- "section": "ROW_INDEX",
- "startOffset": 259338,
- "length": 41
- },
- {
- "columnId": 3,
- "section": "ROW_INDEX",
- "startOffset": 259379,
- "length": 41
- },
- {
- "columnId": 3,
- "section": "BLOOM_FILTER",
- "startOffset": 259420,
- "length": 301
- },
- {
- "columnId": 1,
- "section": "DATA",
- "startOffset": 259721,
- "length": 4007
- },
- {
- "columnId": 2,
- "section": "DATA",
- "startOffset": 263728,
- "length": 8010
- },
- {
- "columnId": 3,
- "section": "PRESENT",
- "startOffset": 271738,
- "length": 16
- },
- {
- "columnId": 3,
- "section": "DATA",
- "startOffset": 271754,
- "length": 752
- },
- {
- "columnId": 3,
- "section": "LENGTH",
- "startOffset": 272506,
- "length": 25
- },
- {
- "columnId": 3,
- "section": "DICTIONARY_DATA",
- "startOffset": 272531,
- "length": 133
- }
- ],
- "encodings": [
- {
- "columnId": 0,
- "kind": "DIRECT"
- },
- {
- "columnId": 1,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 2,
- "kind": "DIRECT_V2"
- },
- {
- "columnId": 3,
- "kind": "DICTIONARY_V2",
- "dictionarySize": 35
- }
- ],
- "indexes": [{
- "columnId": 3,
- "rowGroupIndexes": [{
- "entryId": 0,
- "count": 990,
- "hasNull": true,
- "min": "Darkness,",
- "max": "worst",
- "totalLength": 3963,
- "type": "STRING",
- "positions": [
- 0,
- 0,
- 0,
- 0,
- 0,
- 0,
- 0
- ]
- }],
- "bloomFilterIndexes": [{
- "entryId": 0,
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }],
- "stripeLevelBloomFilter": {
- "numHashFunctions": 4,
- "bitCount": 6272,
- "popCount": 138,
- "loadFactor": 0.022002551704645157,
- "expectedFpp": 2.3436470542037569E-7
- }
- }]
- }
- ],
- "fileLength": 273300,
- "paddingLength": 0,
- "paddingRatio": 0,
- "status": "OK"
-}
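
The bloom filter statistics repeated throughout the deleted JSON dump above are internally consistent: loadFactor is the fraction of filter bits that are set (popCount / bitCount), and raising it to the number of hash functions reproduces expectedFpp. A minimal Java sketch of that arithmetic, using the numbers from the dump (an illustration of the relationship, not the Hive BloomFilter implementation):

    public class BloomStatsSketch {
      public static void main(String[] args) {
        int numHashFunctions = 4;
        long bitCount = 6272;
        long popCount = 138;
        // Fraction of bits set; matches the dump's loadFactor to single precision.
        double loadFactor = (double) popCount / bitCount;            // ~0.0220
        // Probability that all k probed bits are already set.
        double expectedFpp = Math.pow(loadFactor, numHashFunctions); // ~2.34e-7
        System.out.println(loadFactor + " -> " + expectedFpp);
      }
    }

Raising the dump's own loadFactor value to the fourth power reproduces its expectedFpp of 2.3436470542037569E-7.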
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/resources/orc-file-dump.out
----------------------------------------------------------------------
diff --git a/ql/src/test/resources/orc-file-dump.out b/ql/src/test/resources/orc-file-dump.out
deleted file mode 100644
index 70f7fbd..0000000
--- a/ql/src/test/resources/orc-file-dump.out
+++ /dev/null
@@ -1,195 +0,0 @@
-Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
-Rows: 21000
-Compression: ZLIB
-Compression size: 4096
-Type: struct<i:int,l:bigint,s:string>
-
-Stripe Statistics:
- Stripe 1:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146021688 max: 2147223299 sum: 515792826
- Column 2: count: 5000 hasNull: false min: -9218592812243954469 max: 9221614132680747961
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19280
- Stripe 2:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146733128 max: 2147001622 sum: 7673427
- Column 2: count: 5000 hasNull: false min: -9220818777591257749 max: 9222259462014003839
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19504
- Stripe 3:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146993718 max: 2147378179 sum: 132660742551
- Column 2: count: 5000 hasNull: false min: -9218342074710552826 max: 9222303228623055266
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19641
- Stripe 4:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false min: -2146658006 max: 2145520931 sum: 8533549236
- Column 2: count: 5000 hasNull: false min: -9222758097219661129 max: 9221043130193737406
- Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19470
- Stripe 5:
- Column 0: count: 1000 hasNull: false
- Column 1: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363
- Column 2: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476
- Column 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866
-
-File Statistics:
- Column 0: count: 21000 hasNull: false
- Column 1: count: 21000 hasNull: false min: -2146993718 max: 2147378179 sum: 193017464403
- Column 2: count: 21000 hasNull: false min: -9222758097219661129 max: 9222303228623055266
- Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
-
-Stripes:
- Stripe: offset: 3 data: 63786 rows: 5000 tail: 79 index: 439
- Stream: column 0 section ROW_INDEX start: 3 length 17
- Stream: column 1 section ROW_INDEX start: 20 length 166
- Stream: column 2 section ROW_INDEX start: 186 length 169
- Stream: column 3 section ROW_INDEX start: 355 length 87
- Stream: column 1 section DATA start: 442 length 20035
- Stream: column 2 section DATA start: 20477 length 40050
- Stream: column 3 section DATA start: 60527 length 3543
- Stream: column 3 section LENGTH start: 64070 length 25
- Stream: column 3 section DICTIONARY_DATA start: 64095 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2145365268 max: 2135491313 sum: 7521792925 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -2139452528 max: 2147223299 sum: -12923774313 positions: 0,2050,488
- Entry 2: count: 1000 hasNull: false min: -2142420586 max: 2143898386 sum: -25521983511 positions: 4099,2054,464
- Entry 3: count: 1000 hasNull: false min: -2137233441 max: 2144267163 sum: 40993386199 positions: 8198,2058,440
- Entry 4: count: 1000 hasNull: false min: -2146021688 max: 2146838901 sum: -9553628474 positions: 12297,2062,416
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9200577545527640566 max: 9175500305011173751 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9203618157670445774 max: 9208123824411178101 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9218592812243954469 max: 9221351515892923972 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9206585617947511272 max: 9167703224425685487 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9206645795733282496 max: 9221614132680747961 positions: 28693,14,416
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3862 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3884 positions: 0,659,149
- Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3893 positions: 0,1531,3
- Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3798 positions: 0,2281,32
- Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3843 positions: 0,3033,45
- Stripe: offset: 64307 data: 63775 rows: 5000 tail: 79 index: 432
- Stream: column 0 section ROW_INDEX start: 64307 length 17
- Stream: column 1 section ROW_INDEX start: 64324 length 164
- Stream: column 2 section ROW_INDEX start: 64488 length 168
- Stream: column 3 section ROW_INDEX start: 64656 length 83
- Stream: column 1 section DATA start: 64739 length 20035
- Stream: column 2 section DATA start: 84774 length 40050
- Stream: column 3 section DATA start: 124824 length 3532
- Stream: column 3 section LENGTH start: 128356 length 25
- Stream: column 3 section DICTIONARY_DATA start: 128381 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2143799121 max: 2145249879 sum: -6966266181 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -2146733128 max: 2147001622 sum: -35930106333 positions: 0,2050,488
- Entry 2: count: 1000 hasNull: false min: -2144302712 max: 2146299933 sum: 6944230435 positions: 4099,2054,464
- Entry 3: count: 1000 hasNull: false min: -2145172948 max: 2144335014 sum: -29624404959 positions: 8198,2058,440
- Entry 4: count: 1000 hasNull: false min: -2146428427 max: 2144067253 sum: 65584220465 positions: 12297,2062,416
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9218450653857701562 max: 9189819526332228512 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9220818777591257749 max: 9178821722829648113 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9220031433030423388 max: 9210838931786956852 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9208195729739635607 max: 9222259462014003839 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9174271499932339698 max: 9212277876771676916 positions: 28693,14,416
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3923 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3869 positions: 0,761,12
- Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,1472,70
- Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3931 positions: 0,2250,43
- Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3964 positions: 0,2978,88
- Stripe: offset: 128593 data: 63787 rows: 5000 tail: 79 index: 438
- Stream: column 0 section ROW_INDEX start: 128593 length 17
- Stream: column 1 section ROW_INDEX start: 128610 length 163
- Stream: column 2 section ROW_INDEX start: 128773 length 168
- Stream: column 3 section ROW_INDEX start: 128941 length 90
- Stream: column 1 section DATA start: 129031 length 20035
- Stream: column 2 section DATA start: 149066 length 40050
- Stream: column 3 section DATA start: 189116 length 3544
- Stream: column 3 section LENGTH start: 192660 length 25
- Stream: column 3 section DICTIONARY_DATA start: 192685 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2146993718 max: 2144179881 sum: -7829543271 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -2144095505 max: 2144883384 sum: 51623839692 positions: 0,2050,488
- Entry 2: count: 1000 hasNull: false min: -2144113995 max: 2143773575 sum: 56574412741 positions: 4099,2054,464
- Entry 3: count: 1000 hasNull: false min: -2146954065 max: 2146794873 sum: 4336083432 positions: 8198,2058,440
- Entry 4: count: 1000 hasNull: false min: -2135511523 max: 2147378179 sum: 27955949957 positions: 12297,2062,416
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9211978436552246208 max: 9179058898902097152 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9195645160817780503 max: 9189147759444307708 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9202888157616520823 max: 9193561362676960747 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9216318198067839390 max: 9221286760675829363 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9218342074710552826 max: 9222303228623055266 positions: 28693,14,416
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 4008 positions: 0,634,174
- Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3999 positions: 0,1469,69
- Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,2133,194
- Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 4000 positions: 0,3005,43
- Stripe: offset: 192897 data: 63817 rows: 5000 tail: 79 index: 440
- Stream: column 0 section ROW_INDEX start: 192897 length 17
- Stream: column 1 section ROW_INDEX start: 192914 length 165
- Stream: column 2 section ROW_INDEX start: 193079 length 167
- Stream: column 3 section ROW_INDEX start: 193246 length 91
- Stream: column 1 section DATA start: 193337 length 20035
- Stream: column 2 section DATA start: 213372 length 40050
- Stream: column 3 section DATA start: 253422 length 3574
- Stream: column 3 section LENGTH start: 256996 length 25
- Stream: column 3 section DICTIONARY_DATA start: 257021 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2141355639 max: 2145520931 sum: 2726719912 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -2138324170 max: 2140167376 sum: -23606674002 positions: 0,2050,488
- Entry 2: count: 1000 hasNull: false min: -2146658006 max: 2144329742 sum: -41530109703 positions: 4099,2054,464
- Entry 3: count: 1000 hasNull: false min: -2144207593 max: 2139456355 sum: 13559842458 positions: 8198,2058,440
- Entry 4: count: 1000 hasNull: false min: -2145744719 max: 2145417153 sum: 57383770571 positions: 12297,2062,416
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9222731174895935707 max: 9214167447015056056 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9222758097219661129 max: 9221043130193737406 positions: 4099,2,488
- Entry 2: count: 1000 hasNull: false min: -9174483776261243438 max: 9208134757538374043 positions: 12297,6,464
- Entry 3: count: 1000 hasNull: false min: -9174329712613510612 max: 9197412874152820822 positions: 20495,10,440
- Entry 4: count: 1000 hasNull: false min: -9221162005892422758 max: 9220625004936875965 positions: 28693,14,416
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3901 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3900 positions: 0,431,431
- Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3909 positions: 0,1485,52
- Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3947 positions: 0,2196,104
- Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3813 positions: 0,2934,131
- Stripe: offset: 257233 data: 12943 rows: 1000 tail: 71 index: 131
- Stream: column 0 section ROW_INDEX start: 257233 length 12
- Stream: column 1 section ROW_INDEX start: 257245 length 38
- Stream: column 2 section ROW_INDEX start: 257283 length 41
- Stream: column 3 section ROW_INDEX start: 257324 length 40
- Stream: column 1 section DATA start: 257364 length 4007
- Stream: column 2 section DATA start: 261371 length 8010
- Stream: column 3 section DATA start: 269381 length 768
- Stream: column 3 section LENGTH start: 270149 length 25
- Stream: column 3 section DICTIONARY_DATA start: 270174 length 133
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DIRECT_V2
- Encoding column 3: DICTIONARY_V2[35]
- Row group indices for column 1:
- Entry 0: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363 positions: 0,0,0
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476 positions: 0,0,0
- Row group indices for column 3:
- Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866 positions: 0,0,0
-
-File length: 270923 bytes
-Padding length: 0 bytes
-Padding ratio: 0%
-________________________________________________________________________________________________________________________
-
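This deleted plain-text dump and the JSON resource deleted above describe the same kind of file metadata; both formats come out of the ORC file-dump tool. A hedged sketch of producing both views of TestFileDump.testDump.orc, assuming FileDump's documented flags (-j for JSON output, -p for pretty printing) and its pre-move package; the path is hypothetical:

    public class DumpBothFormats {
      public static void main(String[] args) throws Exception {
        String orc = "/path/to/TestFileDump.testDump.orc"; // hypothetical path
        // Plain-text dump, the format of the .out file deleted above.
        org.apache.hadoop.hive.ql.io.orc.FileDump.main(new String[]{orc});
        // Pretty-printed JSON dump, the format of the JSON resource deleted above.
        org.apache.hadoop.hive.ql.io.orc.FileDump.main(new String[]{"-j", "-p", orc});
      }
    }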
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/resources/orc-file-has-null.out
----------------------------------------------------------------------
diff --git a/ql/src/test/resources/orc-file-has-null.out b/ql/src/test/resources/orc-file-has-null.out
deleted file mode 100644
index e98a73f..0000000
--- a/ql/src/test/resources/orc-file-has-null.out
+++ /dev/null
@@ -1,112 +0,0 @@
-Structure for TestOrcFile.testHasNull.orc
-File Version: 0.12 with HIVE_13083
-Rows: 20000
-Compression: ZLIB
-Compression size: 4096
-Type: struct<bytes1:binary,string1:string>
-
-Stripe Statistics:
- Stripe 1:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false sum: 15000
- Column 2: count: 2000 hasNull: true min: RG1 max: RG3 sum: 6000
- Stripe 2:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false sum: 15000
- Column 2: count: 0 hasNull: true
- Stripe 3:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false sum: 15000
- Column 2: count: 5000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 40000
- Stripe 4:
- Column 0: count: 5000 hasNull: false
- Column 1: count: 5000 hasNull: false sum: 15000
- Column 2: count: 0 hasNull: true
-
-File Statistics:
- Column 0: count: 20000 hasNull: false
- Column 1: count: 20000 hasNull: false sum: 60000
- Column 2: count: 7000 hasNull: true min: RG1 max: STRIPE-3 sum: 46000
-
-Stripes:
- Stripe: offset: 3 data: 220 rows: 5000 tail: 65 index: 154
- Stream: column 0 section ROW_INDEX start: 3 length 17
- Stream: column 1 section ROW_INDEX start: 20 length 60
- Stream: column 2 section ROW_INDEX start: 80 length 77
- Stream: column 1 section DATA start: 157 length 159
- Stream: column 1 section LENGTH start: 316 length 15
- Stream: column 2 section PRESENT start: 331 length 13
- Stream: column 2 section DATA start: 344 length 18
- Stream: column 2 section LENGTH start: 362 length 6
- Stream: column 2 section DICTIONARY_DATA start: 368 length 9
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DICTIONARY_V2[2]
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: RG1 max: RG1 sum: 3000 positions: 0,0,0,0,0,0,0
- Entry 1: count: 0 hasNull: true positions: 0,0,125,0,0,4,488
- Entry 2: count: 1000 hasNull: false min: RG3 max: RG3 sum: 3000 positions: 0,2,125,0,0,4,488
- Entry 3: count: 0 hasNull: true positions: 0,4,125,0,0,12,488
- Entry 4: count: 0 hasNull: true positions: 0,6,125,0,0,12,488
- Stripe: offset: 442 data: 185 rows: 5000 tail: 64 index: 116
- Stream: column 0 section ROW_INDEX start: 442 length 17
- Stream: column 1 section ROW_INDEX start: 459 length 60
- Stream: column 2 section ROW_INDEX start: 519 length 39
- Stream: column 1 section DATA start: 558 length 159
- Stream: column 1 section LENGTH start: 717 length 15
- Stream: column 2 section PRESENT start: 732 length 11
- Stream: column 2 section DATA start: 743 length 0
- Stream: column 2 section LENGTH start: 743 length 0
- Stream: column 2 section DICTIONARY_DATA start: 743 length 0
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DICTIONARY_V2[0]
- Row group indices for column 2:
- Entry 0: count: 0 hasNull: true positions: 0,0,0,0,0,0,0
- Entry 1: count: 0 hasNull: true positions: 0,0,125,0,0,0,0
- Entry 2: count: 0 hasNull: true positions: 0,2,120,0,0,0,0
- Entry 3: count: 0 hasNull: true positions: 0,4,115,0,0,0,0
- Entry 4: count: 0 hasNull: true positions: 0,6,110,0,0,0,0
- Stripe: offset: 807 data: 206 rows: 5000 tail: 60 index: 137
- Stream: column 0 section ROW_INDEX start: 807 length 17
- Stream: column 1 section ROW_INDEX start: 824 length 60
- Stream: column 2 section ROW_INDEX start: 884 length 60
- Stream: column 1 section DATA start: 944 length 159
- Stream: column 1 section LENGTH start: 1103 length 15
- Stream: column 2 section DATA start: 1118 length 15
- Stream: column 2 section LENGTH start: 1133 length 6
- Stream: column 2 section DICTIONARY_DATA start: 1139 length 11
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DICTIONARY_V2[1]
- Row group indices for column 2:
- Entry 0: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,4,488
- Entry 2: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,12,464
- Entry 3: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,20,440
- Entry 4: count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions: 0,28,416
- Stripe: offset: 1210 data: 185 rows: 5000 tail: 64 index: 116
- Stream: column 0 section ROW_INDEX start: 1210 length 17
- Stream: column 1 section ROW_INDEX start: 1227 length 60
- Stream: column 2 section ROW_INDEX start: 1287 length 39
- Stream: column 1 section DATA start: 1326 length 159
- Stream: column 1 section LENGTH start: 1485 length 15
- Stream: column 2 section PRESENT start: 1500 length 11
- Stream: column 2 section DATA start: 1511 length 0
- Stream: column 2 section LENGTH start: 1511 length 0
- Stream: column 2 section DICTIONARY_DATA start: 1511 length 0
- Encoding column 0: DIRECT
- Encoding column 1: DIRECT_V2
- Encoding column 2: DICTIONARY_V2[0]
- Row group indices for column 2:
- Entry 0: count: 0 hasNull: true positions: 0,0,0,0,0,0,0
- Entry 1: count: 0 hasNull: true positions: 0,0,125,0,0,0,0
- Entry 2: count: 0 hasNull: true positions: 0,2,120,0,0,0,0
- Entry 3: count: 0 hasNull: true positions: 0,4,115,0,0,0,0
- Entry 4: count: 0 hasNull: true positions: 0,6,110,0,0,0,0
-
-File length: 1823 bytes
-Padding length: 0 bytes
-Padding ratio: 0%
-________________________________________________________________________________________________________________________
-
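In the dump above, stripes where column 2 is entirely null (count: 0, hasNull: true) still carry a PRESENT stream but have zero-length DATA, LENGTH, and DICTIONARY_DATA streams: the present bitmap alone reconstructs the column. A minimal sketch of the decode loop this implies, where the BitSet and Iterator are hypothetical stand-ins rather than the ORC reader API:

    import java.util.BitSet;
    import java.util.Iterator;

    public class PresentStreamSketch {
      // For each row, consult the PRESENT bit first; only consume a value
      // from the DATA stream when the bit marks the row non-null.
      static String[] decode(BitSet present, Iterator<String> data, int rowCount) {
        String[] values = new String[rowCount];
        for (int row = 0; row < rowCount; row++) {
          values[row] = present.get(row) ? data.next() : null;
        }
        return values;
      }
    }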
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/results/clientpositive/orc_create.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/orc_create.q.out b/ql/src/test/results/clientpositive/orc_create.q.out
index 20c3fce..34ab00d 100644
--- a/ql/src/test/results/clientpositive/orc_create.q.out
+++ b/ql/src/test/results/clientpositive/orc_create.q.out
@@ -380,9 +380,9 @@ POSTHOOK: query: SELECT * from orc_create_complex
POSTHOOK: type: QUERY
POSTHOOK: Input: default@orc_create_complex
#### A masked pattern was here ####
-line1 {"key11":"value11","key12":"value12","key13":"value13"} ["a","b","c"] {"A":"one","B":"two"}
-line2 {"key21":"value21","key22":"value22","key23":"value23"} ["d","e","f"] {"A":"three","B":"four"}
-line3 {"key31":"value31","key32":"value32","key33":"value33"} ["g","h","i"] {"A":"five","B":"six"}
+line1 {"key13":"value13","key12":"value12","key11":"value11"} ["a","b","c"] {"A":"one","B":"two"}
+line2 {"key21":"value21","key23":"value23","key22":"value22"} ["d","e","f"] {"A":"three","B":"four"}
+line3 {"key33":"value33","key31":"value31","key32":"value32"} ["g","h","i"] {"A":"five","B":"six"}
PREHOOK: query: SELECT str from orc_create_complex
PREHOOK: type: QUERY
PREHOOK: Input: default@orc_create_complex
@@ -402,9 +402,9 @@ POSTHOOK: query: SELECT mp from orc_create_complex
POSTHOOK: type: QUERY
POSTHOOK: Input: default@orc_create_complex
#### A masked pattern was here ####
-{"key11":"value11","key12":"value12","key13":"value13"}
-{"key21":"value21","key22":"value22","key23":"value23"}
-{"key31":"value31","key32":"value32","key33":"value33"}
+{"key13":"value13","key12":"value12","key11":"value11"}
+{"key21":"value21","key23":"value23","key22":"value22"}
+{"key33":"value33","key31":"value31","key32":"value32"}
PREHOOK: query: SELECT lst from orc_create_complex
PREHOOK: type: QUERY
PREHOOK: Input: default@orc_create_complex
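The expected-output changes in this file (and the similar ones below) only reorder map keys; the entries themselves are unchanged. That is consistent with the shimmed reader materializing maps in a hash-based container, whose iteration order need not match insertion order. A small sketch of the effect (the exact permutation printed for the HashMap is hash-dependent):

    import java.util.HashMap;
    import java.util.LinkedHashMap;
    import java.util.Map;

    public class MapOrderSketch {
      public static void main(String[] args) {
        Map<String, String> insertion = new LinkedHashMap<>();
        Map<String, String> hashed = new HashMap<>();
        for (String k : new String[]{"key11", "key12", "key13"}) {
          insertion.put(k, k.replace("key", "value"));
          hashed.put(k, k.replace("key", "value"));
        }
        System.out.println(insertion); // {key11=value11, key12=value12, key13=value13}
        System.out.println(hashed);    // same entries, hash-dependent order
        System.out.println(insertion.equals(hashed)); // true: order does not affect map equality
      }
    }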
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/results/clientpositive/orc_int_type_promotion.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/orc_int_type_promotion.q.out b/ql/src/test/results/clientpositive/orc_int_type_promotion.q.out
index 4b7b0b0..3b2e962 100644
--- a/ql/src/test/results/clientpositive/orc_int_type_promotion.q.out
+++ b/ql/src/test/results/clientpositive/orc_int_type_promotion.q.out
@@ -126,8 +126,8 @@ POSTHOOK: query: select * from alltypes_orc
POSTHOOK: type: QUERY
POSTHOOK: Input: default@alltypes_orc
#### A masked pattern was here ####
-true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k1":"v1","k2":"v2"} [100,200] {"c1":null,"c2":" \"foo\"}"}
-false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k3":"v3","k4":"v4"} [200,300] {"c1":null,"c2":" \"bar\"}"}
+true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k2":"v2","k1":"v1"} [100,200] {"c1":null,"c2":" \"foo\"}"}
+false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k4":"v4","k3":"v3"} [200,300] {"c1":null,"c2":" \"bar\"}"}
PREHOOK: query: alter table alltypes_orc change si si int
PREHOOK: type: ALTERTABLE_RENAMECOL
PREHOOK: Input: default@alltypes_orc
@@ -144,8 +144,8 @@ POSTHOOK: query: select * from alltypes_orc
POSTHOOK: type: QUERY
POSTHOOK: Input: default@alltypes_orc
#### A masked pattern was here ####
-true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k1":"v1","k2":"v2"} [100,200] {"c1":null,"c2":" \"foo\"}"}
-false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k3":"v3","k4":"v4"} [200,300] {"c1":null,"c2":" \"bar\"}"}
+true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k2":"v2","k1":"v1"} [100,200] {"c1":null,"c2":" \"foo\"}"}
+false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k4":"v4","k3":"v3"} [200,300] {"c1":null,"c2":" \"bar\"}"}
PREHOOK: query: alter table alltypes_orc change si si bigint
PREHOOK: type: ALTERTABLE_RENAMECOL
PREHOOK: Input: default@alltypes_orc
@@ -170,8 +170,8 @@ POSTHOOK: query: select * from alltypes_orc
POSTHOOK: type: QUERY
POSTHOOK: Input: default@alltypes_orc
#### A masked pattern was here ####
-true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k1":"v1","k2":"v2"} [100,200] {"c1":null,"c2":" \"foo\"}"}
-false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k3":"v3","k4":"v4"} [200,300] {"c1":null,"c2":" \"bar\"}"}
+true 10 100 1000 10000 4.0 20.0 4.222 1969-12-31 15:59:58.174 1970-01-01 string hello hello {"k2":"v2","k1":"v1"} [100,200] {"c1":null,"c2":" \"foo\"}"}
+false 20 200 2000 20000 8.0 40.0 2.222 1970-12-31 15:59:58.174 1971-01-01 abcd world world {"k4":"v4","k3":"v3"} [200,300] {"c1":null,"c2":" \"bar\"}"}
PREHOOK: query: explain select ti, si, i, bi from alltypes_orc
PREHOOK: type: QUERY
POSTHOOK: query: explain select ti, si, i, bi from alltypes_orc
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/results/clientpositive/schema_evol_orc_vec_mapwork_part_all_primitive.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/schema_evol_orc_vec_mapwork_part_all_primitive.q.out b/ql/src/test/results/clientpositive/schema_evol_orc_vec_mapwork_part_all_primitive.q.out
index bd309e6..e29b357 100644
--- a/ql/src/test/results/clientpositive/schema_evol_orc_vec_mapwork_part_all_primitive.q.out
+++ b/ql/src/test/results/clientpositive/schema_evol_orc_vec_mapwork_part_all_primitive.q.out
@@ -1719,10 +1719,10 @@ POSTHOOK: Input: default@part_change_various_various_string@part=1
POSTHOOK: Input: default@part_change_various_various_string@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.519542222 2007-02-09 binary original
-2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.368756876 0004-09-22 binary original
-3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.970117179 5966-07-09 binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.990818073 1815-05-06 binary original
+1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.519542222 2007-02-09 62 69 6e 61 72 79 original
+2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.368756876 0004-09-22 62 69 6e 61 72 79 original
+3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.970117179 5966-07-09 62 69 6e 61 72 79 original
+4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.990818073 1815-05-06 62 69 6e 61 72 79 original
5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new
6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new
PREHOOK: query: drop table part_change_various_various_string
@@ -1916,10 +1916,10 @@ POSTHOOK: Input: default@part_change_various_various_char@part=1
POSTHOOK: Input: default@part_change_various_various_char@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 binary original
-2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 binary original
-3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 binary original
+1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 62 69 6e 61 72 79 original
+2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 62 69 6e 61 72 79 original
+3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 62 69 6e 61 72 79 original
+4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 62 69 6e 61 72 79 original
5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new
6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new
PREHOOK: query: drop table part_change_various_various_char
@@ -2113,10 +2113,10 @@ POSTHOOK: Input: default@part_change_various_various_char_trunc@part=1
POSTHOOK: Input: default@part_change_various_various_char_trunc@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- binary original
-2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- binary original
-3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- binary original
+1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- 62 69 6e original
+2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- 62 69 6e original
+3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- 62 69 6e original
+4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- 62 69 6e original
5 2 true 400 44388 -100 95396704 62.07915 718.78 1 verdict verdict timestam date binary new
6 1 -false -67 833 63993 1255178 905070.9 -4314.79 -1240033 trial trial 2016-03- 2016-03- binary new
PREHOOK: query: drop table part_change_various_various_char_trunc
@@ -2310,10 +2310,10 @@ POSTHOOK: Input: default@part_change_various_various_varchar@part=1
POSTHOOK: Input: default@part_change_various_various_varchar@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 binary original
-2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 binary original
-3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 binary original
+1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 62 69 6e 61 72 79 original
+2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 62 69 6e 61 72 79 original
+3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 62 69 6e 61 72 79 original
+4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 62 69 6e 61 72 79 original
5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new
6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new
PREHOOK: query: drop table part_change_various_various_varchar
@@ -2507,10 +2507,10 @@ POSTHOOK: Input: default@part_change_various_various_varchar_trunc@part=1
POSTHOOK: Input: default@part_change_various_various_varchar_trunc@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- binary original
-2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- binary original
-3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- binary original
+1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- 62 69 6e original
+2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- 62 69 6e original
+3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- 62 69 6e original
+4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- 62 69 6e original
5 2 true 400 44388 -100 95396704 62.07915 718.78 1 verdict verdict timestam date binary new
6 1 -false -67 833 63993 1255178 905070.9 -4314.79 -1240033 trial trial 2016-03- 2016-03- binary new
PREHOOK: query: drop table part_change_various_various_varchar_trunc
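The binary column's string rendering changed from the literal text "binary" to "62 69 6e 61 72 79", which is exactly the ASCII bytes of "binary" printed as space-separated lowercase hex (truncated to "62 69 6e" where the target char/varchar type caps the result at eight characters). A minimal sketch of that rendering:

    import java.nio.charset.StandardCharsets;

    public class HexRenderSketch {
      public static void main(String[] args) {
        StringBuilder sb = new StringBuilder();
        for (byte b : "binary".getBytes(StandardCharsets.UTF_8)) {
          if (sb.length() > 0) {
            sb.append(' ');
          }
          sb.append(String.format("%02x", b)); // each byte as two lowercase hex digits
        }
        System.out.println(sb); // 62 69 6e 61 72 79
      }
    }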
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/results/clientpositive/tez/schema_evol_orc_vec_mapwork_part_all_primitive.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/schema_evol_orc_vec_mapwork_part_all_primitive.q.out b/ql/src/test/results/clientpositive/tez/schema_evol_orc_vec_mapwork_part_all_primitive.q.out
index f9f4d0b..3721f5b 100644
--- a/ql/src/test/results/clientpositive/tez/schema_evol_orc_vec_mapwork_part_all_primitive.q.out
+++ b/ql/src/test/results/clientpositive/tez/schema_evol_orc_vec_mapwork_part_all_primitive.q.out
@@ -1539,10 +1539,10 @@ POSTHOOK: Input: default@part_change_various_various_string@part=1
POSTHOOK: Input: default@part_change_various_various_string@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.519542222 2007-02-09 binary original
-2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.368756876 0004-09-22 binary original
-3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.970117179 5966-07-09 binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.990818073 1815-05-06 binary original
+1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.519542222 2007-02-09 62 69 6e 61 72 79 original
+2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.368756876 0004-09-22 62 69 6e 61 72 79 original
+3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.970117179 5966-07-09 62 69 6e 61 72 79 original
+4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.990818073 1815-05-06 62 69 6e 61 72 79 original
5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new
6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new
PREHOOK: query: drop table part_change_various_various_string
@@ -1716,10 +1716,10 @@ POSTHOOK: Input: default@part_change_various_various_char@part=1
POSTHOOK: Input: default@part_change_various_various_char@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 binary original
-2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 binary original
-3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 binary original
+1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 62 69 6e 61 72 79 original
+2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 62 69 6e 61 72 79 original
+3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 62 69 6e 61 72 79 original
+4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 62 69 6e 61 72 79 original
5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new
6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new
PREHOOK: query: drop table part_change_various_various_char
@@ -1893,10 +1893,10 @@ POSTHOOK: Input: default@part_change_various_various_char_trunc@part=1
POSTHOOK: Input: default@part_change_various_various_char_trunc@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- binary original
-2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- binary original
-3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- binary original
+1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- 62 69 6e original
+2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- 62 69 6e original
+3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- 62 69 6e original
+4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- 62 69 6e original
5 2 true 400 44388 -100 95396704 62.07915 718.78 1 verdict verdict timestam date binary new
6 1 -false -67 833 63993 1255178 905070.9 -4314.79 -1240033 trial trial 2016-03- 2016-03- binary new
PREHOOK: query: drop table part_change_various_various_char_trunc
@@ -2070,10 +2070,10 @@ POSTHOOK: Input: default@part_change_various_various_varchar@part=1
POSTHOOK: Input: default@part_change_various_various_varchar@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 binary original
-2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 binary original
-3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 binary original
+1 1 TRUE NULL NULL 3244222 -99999999999 -29.0764 4.70614135E8 470614135 dynamic reptile dynamic reptile 0004-09-22 18:26:29.51954 2007-02-09 62 69 6e 61 72 79 original
+2 1 TRUE 100 NULL 14 -23866739993 -3651.672 46114.284799488 46114.284799488 baffling baffling 2007-02-09 05:17:29.36875 0004-09-22 62 69 6e 61 72 79 original
+3 1 FALSE 72 NULL -93222 30 -66475.56 -66475.561431 0.561431 1 1 6229-06-28 02:54:28.97011 5966-07-09 62 69 6e 61 72 79 original
+4 1 TRUE -90 NULL 3289094 46114 9250341.0 9250340.75 9250340.75 junkyard junkyard 2002-05-10 05:29:48.99081 1815-05-06 62 69 6e 61 72 79 original
5 2 true 400 44388 -100 953967041. 62.079153 718.78 1 verdict verdict timestamp date binary new
6 1 -false -67 833 63993 1255178165.77663 905070.974 -4314.7918 -1240033819 trial trial 2016-03-07 03:02:22.0 2016-03-07 binary new
PREHOOK: query: drop table part_change_various_various_varchar
@@ -2247,10 +2247,10 @@ POSTHOOK: Input: default@part_change_various_various_varchar_trunc@part=1
POSTHOOK: Input: default@part_change_various_various_varchar_trunc@part=2
#### A masked pattern was here ####
insert_num part c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 b
-1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- binary original
-2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- binary original
-3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- binary original
-4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- binary original
+1 1 TRUE NULL NULL 3244222 -9999999 -29.0764 4.706141 47061413 dynamic dynamic 0004-09- 2007-02- 62 69 6e original
+2 1 TRUE 100 NULL 14 -2386673 -3651.67 46114.28 46114.28 baffli baffli 2007-02- 0004-09- 62 69 6e original
+3 1 FALSE 72 NULL -93222 30 -66475.5 -66475.5 0.561431 1 1 6229-06- 5966-07- 62 69 6e original
+4 1 TRUE -90 NULL 3289094 46114 9250341. 9250340. 9250340. junkyard junkyard 2002-05- 1815-05- 62 69 6e original
5 2 true 400 44388 -100 95396704 62.07915 718.78 1 verdict verdict timestam date binary new
6 1 -false -67 833 63993 1255178 905070.9 -4314.79 -1240033 trial trial 2016-03- 2016-03- binary new
PREHOOK: query: drop table part_change_various_various_varchar_trunc
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/test/results/clientpositive/vector_complex_all.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_complex_all.q.out b/ql/src/test/results/clientpositive/vector_complex_all.q.out
index 1af37c3..2ae7c1b 100644
--- a/ql/src/test/results/clientpositive/vector_complex_all.q.out
+++ b/ql/src/test/results/clientpositive/vector_complex_all.q.out
@@ -108,9 +108,9 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@orc_create_complex
#### A masked pattern was here ####
orc_create_complex.str orc_create_complex.mp orc_create_complex.lst orc_create_complex.strct
-line1 {"key11":"value11","key12":"value12","key13":"value13"} ["a","b","c"] {"a":"one","b":"two"}
-line2 {"key21":"value21","key22":"value22","key23":"value23"} ["d","e","f"] {"a":"three","b":"four"}
-line3 {"key31":"value31","key32":"value32","key33":"value33"} ["g","h","i"] {"a":"five","b":"six"}
+line1 {"key13":"value13","key12":"value12","key11":"value11"} ["a","b","c"] {"a":"one","b":"two"}
+line2 {"key21":"value21","key23":"value23","key22":"value22"} ["d","e","f"] {"a":"three","b":"four"}
+line3 {"key33":"value33","key31":"value31","key32":"value32"} ["g","h","i"] {"a":"five","b":"six"}
PREHOOK: query: -- However, since this query is not referencing the complex fields, it should vectorize.
EXPLAIN
SELECT COUNT(*) FROM orc_create_complex
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java b/serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java
index 305fdbe..7d136b4 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/io/TimestampWritable.java
@@ -21,13 +21,13 @@ import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.OutputStream;
-import java.math.BigDecimal;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils.VInt;
@@ -61,7 +61,6 @@ public class TimestampWritable implements WritableComparable<TimestampWritable>
private static final long SEVEN_BYTE_LONG_SIGN_FLIP = 0xff80L << 48;
- private static final BigDecimal BILLION_BIG_DECIMAL = BigDecimal.valueOf(1000000000);
/** The maximum number of bytes required for a TimestampWritable */
public static final int MAX_BYTES = 13;
@@ -181,7 +180,7 @@ public class TimestampWritable implements WritableComparable<TimestampWritable>
*/
public long getSeconds() {
if (!timestampEmpty) {
- return millisToSeconds(timestamp.getTime());
+ return TimestampUtils.millisToSeconds(timestamp.getTime());
} else if (!bytesEmpty) {
return TimestampWritable.getSeconds(currentBytes, offset);
} else {
@@ -313,7 +312,7 @@ public class TimestampWritable implements WritableComparable<TimestampWritable>
public double getDouble() {
double seconds, nanos;
if (bytesEmpty) {
- seconds = millisToSeconds(timestamp.getTime());
+ seconds = TimestampUtils.millisToSeconds(timestamp.getTime());
nanos = timestamp.getNanos();
} else {
seconds = getSeconds();
@@ -326,17 +325,6 @@ public class TimestampWritable implements WritableComparable<TimestampWritable>
return timestamp.getTime() / 1000;
}
- /**
- *
- * @return double representation of the timestamp, accurate to nanoseconds
- */
- public static double getDouble(Timestamp timestamp) {
- double seconds, nanos;
- seconds = millisToSeconds(timestamp.getTime());
- nanos = timestamp.getNanos();
- return seconds + nanos / 1000000000;
- }
-
public void readFields(DataInput in) throws IOException {
in.readFully(internalBytes, 0, 4);
if (TimestampWritable.hasDecimalOrSecondVInt(internalBytes[0])) {
@@ -493,7 +481,7 @@ public class TimestampWritable implements WritableComparable<TimestampWritable>
long millis = t.getTime();
int nanos = t.getNanos();
- long seconds = millisToSeconds(millis);
+ long seconds = TimestampUtils.millisToSeconds(millis);
boolean hasSecondVInt = seconds < 0 || seconds > Integer.MAX_VALUE;
boolean hasDecimal = setNanosBytes(nanos, b, offset+4, hasSecondVInt);
@@ -541,20 +529,6 @@ public class TimestampWritable implements WritableComparable<TimestampWritable>
return decimal != 0;
}
- public static Timestamp decimalToTimestamp(HiveDecimal d) {
- BigDecimal nanoInstant = d.bigDecimalValue().multiply(BILLION_BIG_DECIMAL);
- int nanos = nanoInstant.remainder(BILLION_BIG_DECIMAL).intValue();
- if (nanos < 0) {
- nanos += 1000000000;
- }
- long seconds =
- nanoInstant.subtract(new BigDecimal(nanos)).divide(BILLION_BIG_DECIMAL).longValue();
- Timestamp t = new Timestamp(seconds * 1000);
- t.setNanos(nanos);
-
- return t;
- }
-
public HiveDecimal getHiveDecimal() {
if (timestampEmpty) {
populateTimestamp();
@@ -565,11 +539,12 @@ public class TimestampWritable implements WritableComparable<TimestampWritable>
public static HiveDecimal getHiveDecimal(Timestamp timestamp) {
// The BigDecimal class recommends not converting directly from double to BigDecimal,
// so we convert through a string...
- Double timestampDouble = TimestampWritable.getDouble(timestamp);
+ Double timestampDouble = TimestampUtils.getDouble(timestamp);
HiveDecimal result = HiveDecimal.create(timestampDouble.toString());
return result;
}
+
/**
* Converts the time in seconds or milliseconds to a timestamp.
* @param time time in seconds or in milliseconds
@@ -580,71 +555,6 @@ public class TimestampWritable implements WritableComparable<TimestampWritable>
return new Timestamp(intToTimestampInSeconds ? time * 1000 : time);
}
- /**
- * Converts the time in seconds or milliseconds to a timestamp.
- * @param time time in seconds or in milliseconds
- * @return the timestamp
- */
- public static void setTimestampFromLong(Timestamp timestamp, long time,
- boolean intToTimestampInSeconds) {
- // If the time is in seconds, converts it to milliseconds first.
- timestamp.setTime(intToTimestampInSeconds ? time * 1000 : time);
- }
-
- public static Timestamp doubleToTimestamp(double f) {
- long seconds = (long) f;
-
- // We must ensure the exactness of the double's fractional portion.
- // 0.6 as the fraction part will be converted to 0.59999... and
- // significantly reduce the savings from binary serialization
- BigDecimal bd = new BigDecimal(String.valueOf(f));
- bd = bd.subtract(new BigDecimal(seconds)).multiply(new BigDecimal(1000000000));
- int nanos = bd.intValue();
-
- // Convert to millis
- long millis = seconds * 1000;
- if (nanos < 0) {
- millis -= 1000;
- nanos += 1000000000;
- }
- Timestamp t = new Timestamp(millis);
-
- // Set remaining fractional portion to nanos
- t.setNanos(nanos);
- return t;
- }
-
- public static void setTimestampFromDouble(Timestamp timestamp, double f) {
- // Otherwise, BigDecimal throws an exception. (Support vector operations that sometimes
- // do work on double Not-a-Number NaN values).
- if (Double.isNaN(f)) {
- timestamp.setTime(0);
- return;
- }
- // Algorithm used by TimestampWritable.doubleToTimestamp method.
- // Allocates a BigDecimal object!
-
- long seconds = (long) f;
-
- // We must ensure the exactness of the double's fractional portion.
- // 0.6 as the fraction part will be converted to 0.59999... and
- // significantly reduce the savings from binary serialization
- BigDecimal bd = new BigDecimal(String.valueOf(f));
- bd = bd.subtract(new BigDecimal(seconds)).multiply(new BigDecimal(1000000000));
- int nanos = bd.intValue();
-
- // Convert to millis
- long millis = seconds * 1000;
- if (nanos < 0) {
- millis -= 1000;
- nanos += 1000000000;
- }
- timestamp.setTime(millis);
-
- // Set remaining fractional portion to nanos
- timestamp.setNanos(nanos);
- }
-
public static void setTimestamp(Timestamp t, byte[] bytes, int offset) {
boolean hasDecimalOrSecondVInt = hasDecimalOrSecondVInt(bytes[offset]);
long seconds = (long) TimestampWritable.getSeconds(bytes, offset);
@@ -737,16 +647,4 @@ public class TimestampWritable implements WritableComparable<TimestampWritable>
| ((0xFFL & bytes[offset+5]) << 16)
| ((0xFFL & bytes[offset+6]) << 8)) >> 8;
}
-
- /**
- * Rounds the number of milliseconds relative to the epoch down to the nearest whole number of
- * seconds. 500 would round to 0, -500 would round to -1.
- */
- public static long millisToSeconds(long millis) {
- if (millis >= 0) {
- return millis / 1000;
- } else {
- return (millis - 999) / 1000;
- }
- }
}
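
The helpers deleted above now live in org.apache.hadoop.hive.ql.util.TimestampUtils, as the updated callers earlier in this patch show; their semantics are unchanged. For example, millisToSeconds still floors toward negative infinity, so 500 ms maps to second 0 and -500 ms to second -1. A self-contained sketch of the same rounding rule taken from the removed body:

    // Same rule as the removed TimestampWritable.millisToSeconds, now on
    // TimestampUtils: floor division of epoch milliseconds by 1000.
    public class MillisToSecondsSketch {
      static long millisToSeconds(long millis) {
        return millis >= 0 ? millis / 1000 : (millis - 999) / 1000;
      }

      public static void main(String[] args) {
        System.out.println(millisToSeconds(500));   // 0
        System.out.println(millisToSeconds(-500));  // -1
        System.out.println(millisToSeconds(-1000)); // -1
      }
    }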