You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@orc.apache.org by GitBox <gi...@apache.org> on 2020/08/31 21:47:01 UTC

[GitHub] [orc] shardulm94 commented on a change in pull request #542: ORC-623 Fix evaluation of SArgs over all null columns for ints and doubles.

shardulm94 commented on a change in pull request #542:
URL: https://github.com/apache/orc/pull/542#discussion_r480405386



##########
File path: java/core/src/test/org/apache/orc/TestVectorOrcFile.java
##########
@@ -4061,6 +4061,115 @@ public void testPredicatePushdownWithNan() throws Exception {
     assertEquals(0, batch.size);
   }
 
+  @Test
+  public void testPredicatePushdownIssue1() throws Exception {

Review comment:
       Nit: Make method name more descriptive?

##########
File path: java/core/src/test/org/apache/orc/TestVectorOrcFile.java
##########
@@ -4061,6 +4061,115 @@ public void testPredicatePushdownWithNan() throws Exception {
     assertEquals(0, batch.size);
   }
 
+  @Test
+  public void testPredicatePushdownIssue1() throws Exception {
+    TypeDescription schema = createInnerSchema();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .stripeSize(400000L)
+            .compress(CompressionKind.NONE)
+            .bufferSize(500)
+            .rowIndexStride(1000)
+            .version(fileFormat));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    for(int i=0; i < 3500; ++i) {
+      batch.cols[0].noNulls = false;
+      batch.cols[0].isNull[i] = true; // all nulls for int1
+      ((BytesColumnVector) batch.cols[1]).setVal(i,
+          Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8));
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, reader.getNumberOfRows());
+
+    // all values for int1 are null, so not(isNull(int1)) should always be false and not rows should be read
+    SearchArgument sarg = SearchArgumentFactory.newBuilder()
+        .startNot()
+        .isNull("int1", PredicateLeaf.Type.LONG)
+        .end()
+        .build();
+
+    RecordReader rows = reader.rows(reader.options()
+        .range(0L, Long.MAX_VALUE)
+        .searchArgument(sarg, new String[]{}));
+    batch = reader.getSchema().createRowBatch(3500);
+
+    rows.nextBatch(batch);
+    assertEquals(0, batch.size);
+  }
+
+  @Test
+  public void testPredicatePushdownIssue2() throws Exception {

Review comment:
       Nit: Make method name more descriptive?

##########
File path: java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
##########
@@ -403,16 +406,18 @@ public void testGetMin() throws Exception {
 
   private static OrcProto.ColumnStatistics createIntStats(Long min,
                                                           Long max) {
+    OrcProto.ColumnStatistics.Builder result =
+        OrcProto.ColumnStatistics.newBuilder();
     OrcProto.IntegerStatistics.Builder intStats =
         OrcProto.IntegerStatistics.newBuilder();
     if (min != null) {
       intStats.setMinimum(min);
+      result.setNumberOfValues(1);

Review comment:
       I am not sure why this is needed? Also if min != max wouldn't we have > 1 values?
   PSA: I am not familiar with the Orc codebase, especially the protobuf parts. So may be totally missing the point.

##########
File path: java/core/src/test/org/apache/orc/TestVectorOrcFile.java
##########
@@ -4061,6 +4061,115 @@ public void testPredicatePushdownWithNan() throws Exception {
     assertEquals(0, batch.size);
   }
 
+  @Test
+  public void testPredicatePushdownIssue1() throws Exception {
+    TypeDescription schema = createInnerSchema();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .stripeSize(400000L)
+            .compress(CompressionKind.NONE)
+            .bufferSize(500)
+            .rowIndexStride(1000)
+            .version(fileFormat));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    for(int i=0; i < 3500; ++i) {
+      batch.cols[0].noNulls = false;
+      batch.cols[0].isNull[i] = true; // all nulls for int1
+      ((BytesColumnVector) batch.cols[1]).setVal(i,
+          Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8));
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, reader.getNumberOfRows());
+
+    // all values for int1 are null, so not(isNull(int1)) should always be false and not rows should be read
+    SearchArgument sarg = SearchArgumentFactory.newBuilder()
+        .startNot()
+        .isNull("int1", PredicateLeaf.Type.LONG)
+        .end()
+        .build();
+
+    RecordReader rows = reader.rows(reader.options()
+        .range(0L, Long.MAX_VALUE)
+        .searchArgument(sarg, new String[]{}));
+    batch = reader.getSchema().createRowBatch(3500);
+
+    rows.nextBatch(batch);
+    assertEquals(0, batch.size);
+  }
+
+  @Test
+  public void testPredicatePushdownIssue2() throws Exception {
+    TypeDescription schema = createInnerSchema();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .stripeSize(400000L)
+            .compress(CompressionKind.NONE)
+            .bufferSize(500)
+            .rowIndexStride(1000)
+            .version(fileFormat));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    for(int i=0; i < 3500; ++i) {
+      if (i % 2 == 0) {
+        batch.cols[0].noNulls = false;
+        batch.cols[0].isNull[i] = true; // every other value is null or 1
+        batch.cols[1].noNulls = false;
+        batch.cols[1].isNull[i] = true; // every other value is null or "val"
+      } else {
+        batch.cols[0].isNull[i] = false;
+        ((LongColumnVector) batch.cols[0]).vector[i] = 1;
+        batch.cols[1].isNull[i] = false;
+        ((BytesColumnVector) batch.cols[1]).setVal(i, "val".getBytes(StandardCharsets.UTF_8));
+      }
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, reader.getNumberOfRows());
+
+    // values for int1 are either null or 1, so not(in(int1, 1)) should always be true because null
+    // occurs every 2nd row and hence all row groups should be returned

Review comment:
       This should now change from `hence all row groups should be returned` to `hence no row groups should be returned` as my initial understanding when I wrote the comment was wrong

##########
File path: java/core/src/test/org/apache/orc/TestVectorOrcFile.java
##########
@@ -4061,6 +4061,115 @@ public void testPredicatePushdownWithNan() throws Exception {
     assertEquals(0, batch.size);
   }
 
+  @Test
+  public void testPredicatePushdownIssue1() throws Exception {
+    TypeDescription schema = createInnerSchema();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .stripeSize(400000L)
+            .compress(CompressionKind.NONE)
+            .bufferSize(500)
+            .rowIndexStride(1000)
+            .version(fileFormat));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    for(int i=0; i < 3500; ++i) {
+      batch.cols[0].noNulls = false;
+      batch.cols[0].isNull[i] = true; // all nulls for int1
+      ((BytesColumnVector) batch.cols[1]).setVal(i,
+          Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8));
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, reader.getNumberOfRows());
+
+    // all values for int1 are null, so not(isNull(int1)) should always be false and not rows should be read

Review comment:
       Typo `not rows should be read` -> `no rows should be read`

##########
File path: java/core/src/test/org/apache/orc/TestVectorOrcFile.java
##########
@@ -4061,6 +4061,115 @@ public void testPredicatePushdownWithNan() throws Exception {
     assertEquals(0, batch.size);
   }
 
+  @Test
+  public void testPredicatePushdownIssue1() throws Exception {
+    TypeDescription schema = createInnerSchema();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .stripeSize(400000L)
+            .compress(CompressionKind.NONE)
+            .bufferSize(500)
+            .rowIndexStride(1000)
+            .version(fileFormat));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    for(int i=0; i < 3500; ++i) {
+      batch.cols[0].noNulls = false;
+      batch.cols[0].isNull[i] = true; // all nulls for int1
+      ((BytesColumnVector) batch.cols[1]).setVal(i,
+          Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8));
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, reader.getNumberOfRows());
+
+    // all values for int1 are null, so not(isNull(int1)) should always be false and not rows should be read
+    SearchArgument sarg = SearchArgumentFactory.newBuilder()
+        .startNot()
+        .isNull("int1", PredicateLeaf.Type.LONG)
+        .end()
+        .build();
+
+    RecordReader rows = reader.rows(reader.options()
+        .range(0L, Long.MAX_VALUE)
+        .searchArgument(sarg, new String[]{}));
+    batch = reader.getSchema().createRowBatch(3500);
+
+    rows.nextBatch(batch);
+    assertEquals(0, batch.size);
+  }
+
+  @Test
+  public void testPredicatePushdownIssue2() throws Exception {
+    TypeDescription schema = createInnerSchema();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .stripeSize(400000L)
+            .compress(CompressionKind.NONE)
+            .bufferSize(500)
+            .rowIndexStride(1000)
+            .version(fileFormat));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    for(int i=0; i < 3500; ++i) {
+      if (i % 2 == 0) {
+        batch.cols[0].noNulls = false;
+        batch.cols[0].isNull[i] = true; // every other value is null or 1
+        batch.cols[1].noNulls = false;
+        batch.cols[1].isNull[i] = true; // every other value is null or "val"
+      } else {
+        batch.cols[0].isNull[i] = false;
+        ((LongColumnVector) batch.cols[0]).vector[i] = 1;
+        batch.cols[1].isNull[i] = false;
+        ((BytesColumnVector) batch.cols[1]).setVal(i, "val".getBytes(StandardCharsets.UTF_8));
+      }
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, reader.getNumberOfRows());
+
+    // values for int1 are either null or 1, so not(in(int1, 1)) should always be true because null
+    // occurs every 2nd row and hence all row groups should be returned
+    SearchArgument sarg = SearchArgumentFactory.newBuilder()
+        .startNot()
+        .in("int1", PredicateLeaf.Type.LONG, 1L)
+        .end()
+        .build();
+
+    RecordReader rows = reader.rows(reader.options()
+        .range(0L, Long.MAX_VALUE)
+        .searchArgument(sarg, new String[]{}));
+    batch = reader.getSchema().createRowBatch(2500);
+
+    rows.nextBatch(batch);
+    assertEquals(0, batch.size);
+
+    // values for string1 are either null or "val", so not(in(string1, "val")) should always be true because null
+    // occurs every 2nd row and hence all row groups should be returned

Review comment:
       This should now change from `hence all row groups should be returned` to `hence no row groups should be returned` as my initial understanding when I wrote the comment was wrong




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org