You are viewing a plain text version of this content. The canonical link for it was a hyperlink in the original message and is not preserved in this plain-text extraction.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2020/04/24 17:02:38 UTC

[incubator-iceberg] branch master updated: ORC: Fix IndexOutOfBoundsException in GenericOrcReader caused by repeated records (#953)

This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 79d4f05  ORC: Fix IndexOutOfBoundsException in GenericOrcReader caused by repeated records (#953)
79d4f05 is described below

commit 79d4f0579dd988e8dfea9f40e53c69545afa0b58
Author: Shardul Mahadik <sm...@linkedin.com>
AuthorDate: Fri Apr 24 10:02:29 2020 -0700

    ORC: Fix IndexOutOfBoundsException in GenericOrcReader caused by repeated records (#953)
---
 .../apache/iceberg/data/orc/GenericOrcReader.java  |  2 +-
 .../apache/iceberg/data/orc/TestGenericData.java   | 58 ++++++++++++++--------
 .../iceberg/spark/data/TestSparkOrcReader.java     | 21 ++++++++
 3 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/data/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java b/data/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java
index 03f03be..2db5822 100644
--- a/data/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java
+++ b/data/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java
@@ -101,7 +101,7 @@ public class GenericOrcReader implements OrcValueReader<Record> {
       if (!vector.noNulls && vector.isNull[rowIndex]) {
         return null;
       } else {
-        return convertNonNullValue(vector, row);
+        return convertNonNullValue(vector, rowIndex);
       }
     }
 
diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java
index 670b455..058379b 100644
--- a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java
+++ b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java
@@ -24,6 +24,7 @@ import java.io.File;
 import java.io.IOException;
 import java.time.LocalDateTime;
 import java.time.OffsetDateTime;
+import java.util.Collections;
 import java.util.List;
 import java.util.TimeZone;
 import org.apache.iceberg.Files;
@@ -48,29 +49,18 @@ public class TestGenericData extends DataTest {
   protected void writeAndValidate(Schema schema) throws IOException {
     List<Record> expected = RandomGenericData.generate(schema, 100, 0L);
 
-    File testFile = temp.newFile();
-    Assert.assertTrue("Delete should succeed", testFile.delete());
-
-    try (FileAppender<Record> writer = ORC.write(Files.localOutput(testFile))
-        .schema(schema)
-        .createWriterFunc(GenericOrcWriter::buildWriter)
-        .build()) {
-      for (Record rec : expected) {
-        writer.add(rec);
-      }
-    }
+    writeAndValidateRecords(schema, expected);
+  }
 
-    List<Record> rows;
-    try (CloseableIterable<Record> reader = ORC.read(Files.localInput(testFile))
-        .project(schema)
-        .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema))
-        .build()) {
-      rows = Lists.newArrayList(reader);
-    }
+  @Test
+  public void writeAndValidateRepeatingRecords() throws IOException {
+    Schema structSchema = new Schema(
+        required(100, "id", Types.LongType.get()),
+        required(101, "data", Types.StringType.get())
+    );
+    List<Record> expectedRepeating = Collections.nCopies(100, RandomGenericData.generate(structSchema, 1, 0L).get(0));
 
-    for (int i = 0; i < expected.size(); i += 1) {
-      DataTestHelpers.assertEquals(schema.asStruct(), expected.get(i), rows.get(i));
-    }
+    writeAndValidateRecords(structSchema, expectedRepeating);
   }
 
   @Test
@@ -127,4 +117,30 @@ public class TestGenericData extends DataTest {
     Assert.assertEquals(OffsetDateTime.parse("1935-05-17T01:10:34Z"), rows.get(3).getField("tsTzCol"));
     Assert.assertEquals(LocalDateTime.parse("1935-05-01T00:01:00"), rows.get(3).getField("tsCol"));
   }
+
+  private void writeAndValidateRecords(Schema schema, List<Record> expected) throws IOException {
+    File testFile = temp.newFile();
+    Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    try (FileAppender<Record> writer = ORC.write(Files.localOutput(testFile))
+        .schema(schema)
+        .createWriterFunc(GenericOrcWriter::buildWriter)
+        .build()) {
+      for (Record rec : expected) {
+        writer.add(rec);
+      }
+    }
+
+    List<Record> rows;
+    try (CloseableIterable<Record> reader = ORC.read(Files.localInput(testFile))
+        .project(schema)
+        .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema))
+        .build()) {
+      rows = Lists.newArrayList(reader);
+    }
+
+    for (int i = 0; i < expected.size(); i += 1) {
+      DataTestHelpers.assertEquals(schema.asStruct(), expected.get(i), rows.get(i));
+    }
+  }
 }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java
index fefdce9..071c903 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java
@@ -21,16 +21,21 @@ package org.apache.iceberg.spark.data;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.Collections;
 import java.util.Iterator;
+import java.util.List;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.orc.ORC;
+import org.apache.iceberg.types.Types;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.junit.Assert;
+import org.junit.Test;
 
 import static org.apache.iceberg.spark.data.TestHelpers.assertEquals;
+import static org.apache.iceberg.types.Types.NestedField.required;
 
 public class TestSparkOrcReader extends AvroDataTest {
   @Override
@@ -38,6 +43,22 @@ public class TestSparkOrcReader extends AvroDataTest {
     final Iterable<InternalRow> expected = RandomData
         .generateSpark(schema, 100, 0L);
 
+    writeAndValidateRecords(schema, expected);
+  }
+
+  @Test
+  public void writeAndValidateRepeatingRecords() throws IOException {
+    Schema structSchema = new Schema(
+        required(100, "id", Types.LongType.get()),
+        required(101, "data", Types.StringType.get())
+    );
+    List<InternalRow> expectedRepeating = Collections.nCopies(100,
+        RandomData.generateSpark(structSchema, 1, 0L).iterator().next());
+
+    writeAndValidateRecords(structSchema, expectedRepeating);
+  }
+
+  private void writeAndValidateRecords(Schema schema, Iterable<InternalRow> expected) throws IOException {
     final File testFile = temp.newFile();
     Assert.assertTrue("Delete should succeed", testFile.delete());