You are viewing a plain-text version of this content. The link to the canonical version was not preserved in this rendering.
Posted to commits@orc.apache.org by pr...@apache.org on 2020/02/13 16:52:16 UTC
[orc] branch master updated: ORC-598: Unable to read ORC file with struct and array.length > 1024 (#479)

This is an automated email from the ASF dual-hosted git repository.

prasanthj pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new 4f48184  ORC-598: Unable to read ORC file with struct and array.length > 1024 (#479)
4f48184 is described below

commit 4f48184f07802498ebfecc9820de1ca167d1fadb
Author: kasakrisz <33...@users.noreply.github.com>
AuthorDate: Thu Feb 13 17:52:08 2020 +0100

    ORC-598: Unable to read ORC file with struct and array.length > 1024 (#479)
    
    * ORC-598: Unable to read ORC file with struct and array.length > 1024
    
    * ORC-598: Unable to read ORC file with struct and array.length > 1024 - UT
---
 .../apache/orc/impl/ConvertTreeReaderFactory.java  |  10 +--
 .../orc/impl/TestConvertTreeReaderFactory.java     |  94 +++++++++++++++++++++
 java/core/src/test/resources/bigarray.orc          | Bin 0 -> 2337 bytes
 3 files changed, 99 insertions(+), 5 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
index 81a9b62..ead8f65 100644
--- a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
@@ -538,7 +538,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
                            final int batchSize) throws IOException {
       if (decimalColVector == null) {
         // Allocate column vector for file; cast column vector for reader.
-        decimalColVector = new DecimalColumnVector(precision, scale);
+        decimalColVector = new DecimalColumnVector(batchSize, precision, scale);
         longColVector = (LongColumnVector) previousVector;
       }
       // Read present/isNull stream
@@ -685,7 +685,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
                            final int batchSize) throws IOException {
       if (decimalColVector == null) {
         // Allocate column vector for file; cast column vector for reader.
-        decimalColVector = new DecimalColumnVector(precision, scale);
+        decimalColVector = new DecimalColumnVector(batchSize, precision, scale);
         doubleColVector = (DoubleColumnVector) previousVector;
       }
       // Read present/isNull stream
@@ -988,7 +988,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
                            final int batchSize) throws IOException {
       if (fileDecimalColVector == null) {
         // Allocate column vector for file; cast column vector for reader.
-        fileDecimalColVector = new DecimalColumnVector(filePrecision, fileScale);
+        fileDecimalColVector = new DecimalColumnVector(batchSize, filePrecision, fileScale);
         decimalColVector = previousVector;
       }
       // Read present/isNull stream
@@ -1129,7 +1129,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
                            final int batchSize) throws IOException {
       if (decimalColVector == null) {
         // Allocate column vector for file; cast column vector for reader.
-        decimalColVector = new DecimalColumnVector(precision, scale);
+        decimalColVector = new DecimalColumnVector(batchSize, precision, scale);
         bytesColVector = (BytesColumnVector) previousVector;
       }
       // Read present/isNull stream
@@ -1534,7 +1534,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory {
                            final int batchSize) throws IOException {
       if (decimalColVector == null) {
         // Allocate column vector for file; cast column vector for reader.
-        decimalColVector = new DecimalColumnVector(precision, scale);
+        decimalColVector = new DecimalColumnVector(batchSize, precision, scale);
         timestampColVector = (TimestampColumnVector) previousVector;
       }
       timestampColVector.changeCalendar(fileUsedProlepticGregorian, false);
diff --git a/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java
new file mode 100644
index 0000000..baf5e10
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/impl/TestConvertTreeReaderFactory.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import static org.junit.Assert.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TestVectorOrcFile;
+import org.apache.orc.TypeDescription;
+import org.junit.Test;
+
+public class TestConvertTreeReaderFactory {
+
+  @Test
+  public void testArraySizeBiggerThan1024AndConvertToDecimal() throws Exception {
+    Decimal64ColumnVector columnVector = testArraySizeBiggerThan1024("decimal(6,1)", Decimal64ColumnVector.class);
+    assertEquals(columnVector.vector.length, 1025);
+  }
+
+  public <TExpectedColumn extends ColumnVector> TExpectedColumn testArraySizeBiggerThan1024(
+          String typeString, Class<TExpectedColumn> expectedColumnType) throws Exception {
+    Reader.Options options = new Reader.Options();
+    TypeDescription schema = TypeDescription.fromString("struct<col1:array<"+ typeString +">>");
+    options.schema(schema);
+    String expected = options.toString();
+
+    Configuration conf = new Configuration();
+    Path path = new Path(TestVectorOrcFile.getFileFromClasspath("bigarray.orc"));
+
+    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+    RecordReader rows = reader.rows(options);
+    VectorizedRowBatch batch = schema.createRowBatchV2();
+    while (rows.nextBatch(batch)) {
+      assertTrue(batch.size > 0);
+    }
+
+    assertEquals(expected, options.toString());
+    assertEquals(batch.cols.length, 1);
+    assertTrue(batch.cols[0] instanceof ListColumnVector);
+    assertEquals(((ListColumnVector) batch.cols[0]).child.getClass(), expectedColumnType);
+    return (TExpectedColumn) ((ListColumnVector) batch.cols[0]).child;
+  }
+
+  @Test
+  public void testArraySizeBiggerThan1024AndConvertToVarchar() throws Exception {
+    BytesColumnVector columnVector = testArraySizeBiggerThan1024("varchar(10)", BytesColumnVector.class);
+    assertEquals(columnVector.vector.length, 1025);
+  }
+
+  @Test
+  public void testArraySizeBiggerThan1024AndConvertToDouble() throws Exception {
+    DoubleColumnVector columnVector = testArraySizeBiggerThan1024("double", DoubleColumnVector.class);
+    assertEquals(columnVector.vector.length, 1025);
+  }
+
+  @Test
+  public void testArraySizeBiggerThan1024AndConvertToInteger() throws Exception {
+    LongColumnVector columnVector = testArraySizeBiggerThan1024("int", LongColumnVector.class);
+    assertEquals(columnVector.vector.length, 1025);
+  }
+
+  @Test
+  public void testArraySizeBiggerThan1024AndConvertToTimestamp() throws Exception {
+    TimestampColumnVector columnVector = testArraySizeBiggerThan1024("timestamp", TimestampColumnVector.class);
+    assertEquals(columnVector.time.length, 1025);
+  }
+}
\ No newline at end of file
diff --git a/java/core/src/test/resources/bigarray.orc b/java/core/src/test/resources/bigarray.orc
new file mode 100644
index 0000000..002565b
Binary files /dev/null and b/java/core/src/test/resources/bigarray.orc differ