You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2022/10/04 20:55:50 UTC

[orc] branch main updated: ORC-1283: ENABLE_INDEXES does not take effect

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 733457d46 ORC-1283: ENABLE_INDEXES does not take effect
733457d46 is described below

commit 733457d46a60225fae12b0609e699e9edc9ef2f0
Author: deshanxiao <de...@microsoft.com>
AuthorDate: Tue Oct 4 13:55:43 2022 -0700

    ORC-1283: ENABLE_INDEXES does not take effect
    
    ### What changes were proposed in this pull request?
    This PR aims to fix the problem that ENABLE_INDEXES does not take effect.
    
    ### Why are the changes needed?
    Currently, even if the ORC config `ENABLE_INDEXES` is set to `false`, ORC still writes indexes, because whether the writer builds an index depends only on the `ROW_INDEX_STRIDE` setting.
    
    ### How was this patch tested?
    Added UT
    
    Closes #1267 from deshanxiao/deshan/fix-index-effect.
    
    Authored-by: deshanxiao <de...@microsoft.com>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 java/core/src/java/org/apache/orc/OrcFile.java     |  6 +++++
 .../src/java/org/apache/orc/impl/WriterImpl.java   |  3 ++-
 .../test/org/apache/orc/impl/TestWriterImpl.java   | 27 ++++++++++++++++++++++
 site/_docs/hive-config.md                          |  2 +-
 site/_docs/spark-config.md                         |  2 +-
 5 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java
index fa9487b3d..406fda8cb 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -432,6 +432,7 @@ public class OrcFile {
     private long stripeSizeValue;
     private long stripeRowCountValue;
     private long blockSizeValue;
+    private boolean buildIndex;
     private int rowIndexStrideValue;
     private int bufferSizeValue;
     private boolean enforceBufferSize = false;
@@ -466,6 +467,7 @@ public class OrcFile {
       stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(tableProperties, conf);
       stripeRowCountValue = OrcConf.STRIPE_ROW_COUNT.getLong(tableProperties, conf);
       blockSizeValue = OrcConf.BLOCK_SIZE.getLong(tableProperties, conf);
+      buildIndex = OrcConf.ENABLE_INDEXES.getBoolean(tableProperties, conf);
       rowIndexStrideValue =
           (int) OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf);
       bufferSizeValue = (int) OrcConf.BUFFER_SIZE.getLong(tableProperties,
@@ -905,6 +907,10 @@ public class OrcFile {
       return rowIndexStrideValue;
     }
 
+    public boolean isBuildIndex() {
+      return buildIndex;
+    }
+
     public CompressionStrategy getCompressionStrategy() {
       return compressionStrategy;
     }
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index e7d71a142..734b94061 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -184,7 +184,8 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
     this.compressionStrategy = opts.getCompressionStrategy();
 
     this.rowIndexStride = opts.getRowIndexStride();
-    buildIndex = rowIndexStride > 0;
+
+    this.buildIndex = opts.isBuildIndex() && (rowIndexStride > 0);
     if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
       throw new IllegalArgumentException("Row stride must be at least " +
           MIN_ROW_INDEX_STRIDE);
diff --git a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
index 8db041d2c..84f4df808 100644
--- a/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestWriterImpl.java
@@ -29,6 +29,7 @@ import org.apache.orc.OrcFile;
 import org.apache.orc.Reader;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.Writer;
+import org.apache.orc.*;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -92,6 +93,32 @@ public class TestWriterImpl {
     w.close();
   }
 
+  @Test
+  public void testNoIndexIfEnableIndexIsFalse() throws Exception {
+    conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");
+    conf.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "1000");
+    conf.setBoolean(OrcConf.ENABLE_INDEXES.getAttribute(), false);
+    VectorizedRowBatch b = schema.createRowBatch();
+    LongColumnVector f1 = (LongColumnVector) b.cols[0];
+    LongColumnVector f2 = (LongColumnVector) b.cols[1];
+    Writer w = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema));
+    long rowCount = 1000;
+    for (int i = 0; i < rowCount; i++) {
+      f1.vector[b.size] = 1 ;
+      f2.vector[b.size] = 2 ;
+      b.size += 1;
+      if (b.size == 10) {
+        w.addRowBatch(b);
+        b.reset();
+      }
+    }
+    w.close();
+
+    for (StripeInformation information: w.getStripes()) {
+      assertEquals(0, information.getIndexLength());
+    }
+  }
+
   @Test
   public void testStripes() throws Exception {
     conf.set(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true");
diff --git a/site/_docs/hive-config.md b/site/_docs/hive-config.md
index 8bbc66418..99e4863ea 100644
--- a/site/_docs/hive-config.md
+++ b/site/_docs/hive-config.md
@@ -16,7 +16,7 @@ orc.compress             | ZLIB        | high level compression = {NONE, ZLIB, S
 orc.compress.size        | 262,144     | compression chunk size
 orc.stripe.size          | 67,108,864  | memory buffer in bytes for writing
 orc.row.index.stride     | 10,000      | number of rows between index entries
-orc.create.index         | true        | create indexes?
+orc.create.index         | true        | whether the ORC writer creates indexes as part of the file or not
 orc.bloom.filter.columns | ""          | comma separated list of column names
 orc.bloom.filter.fpp     | 0.05        | bloom filter false positive rate
 
diff --git a/site/_docs/spark-config.md b/site/_docs/spark-config.md
index ad3a90d58..dca4124c2 100644
--- a/site/_docs/spark-config.md
+++ b/site/_docs/spark-config.md
@@ -16,7 +16,7 @@ orc.compress             | ZLIB        | high level compression = {NONE, ZLIB, S
 orc.compress.size        | 262,144     | compression chunk size
 orc.stripe.size          | 67,108,864  | memory buffer in bytes for writing
 orc.row.index.stride     | 10,000      | number of rows between index entries
-orc.create.index         | true        | create indexes?
+orc.create.index         | true        | whether the ORC writer creates indexes as part of the file or not
 orc.bloom.filter.columns | ""          | comma separated list of column names
 orc.bloom.filter.fpp     | 0.05        | bloom filter false positive rate
 orc.key.provider         | "hadoop"    | key provider