You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by lu...@apache.org on 2022/05/29 07:21:26 UTC

[drill] branch master updated: DRILL-8149: large xlsx configs (#2483)

This is an automated email from the ASF dual-hosted git repository.

luoc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 83802e3c4e DRILL-8149: large xlsx configs (#2483)
83802e3c4e is described below

commit 83802e3c4eaac7b7a05ae8e6ff11e619665dee3f
Author: PJ Fanning <pj...@users.noreply.github.com>
AuthorDate: Sun May 29 08:21:21 2022 +0100

    DRILL-8149: large xlsx configs (#2483)
---
 contrib/format-excel/README.md                     |  3 ++
 .../drill/exec/store/excel/ExcelBatchReader.java   | 21 +++++++++
 .../drill/exec/store/excel/ExcelFormatConfig.java  | 51 ++++++++++++++++++++--
 3 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/contrib/format-excel/README.md b/contrib/format-excel/README.md
index 17bb73bb27..207203c7fd 100644
--- a/contrib/format-excel/README.md
+++ b/contrib/format-excel/README.md
@@ -26,6 +26,9 @@ The plugin has many other configuration options listed below:
 * `lastColumn`: To define a region within a spreadsheet, this is the right-most column index. This is indexed from one. If set to `0` Drill will read all available columns. This
  is not inclusive, so if you ask for columns 2-5 you will get columns 2,3 and 4. 
 * `allTextMode`: When set to `true`, Drill will not attempt to infer column data types and will read everything as `VARCHAR`. Defaults to `false`;
+* `maxArraySize`: Overrides the default [POI config](https://poi.apache.org/components/configuration.html) for `setByteArrayMaxOverride(int maxOverride)`. May be needed with large Excel files.
+* `thresholdBytesForTempFiles`: Overrides the default [POI config](https://poi.apache.org/components/configuration.html) for `ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(int thresholdBytes)`. May be needed with large Excel files.
+* `useTempFilePackageParts`: Overrides the default [POI config](https://poi.apache.org/components/configuration.html) for `ZipPackage.setUseTempFilePackageParts(boolean tempFilePackageParts)`. May be needed with large Excel files.
 
 ## Usage
 You can specify the configuration at runtime via the `table()` function or in the storage plugin configuration. For instance, if you just want to query an Excel file, you could
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index 9116f3d861..11a4c27d78 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -38,12 +38,15 @@ import org.apache.drill.exec.vector.accessor.ScalarWriter;
 import org.apache.drill.exec.vector.accessor.TupleWriter;
 import org.apache.hadoop.mapred.FileSplit;
 import org.apache.poi.ooxml.POIXMLProperties.CoreProperties;
+import org.apache.poi.openxml4j.opc.ZipPackage;
+import org.apache.poi.openxml4j.util.ZipInputStreamZipEntrySource;
 import org.apache.poi.ss.usermodel.Cell;
 import org.apache.poi.ss.usermodel.CellType;
 import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.ss.usermodel.Sheet;
 import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.util.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -174,6 +177,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
     final int lastColumn;
     final boolean allTextMode;
     final String sheetName;
+    final int maxArraySize;
+    final int thresholdBytesForTempFiles;
+    final boolean useTempFilePackageParts;
 
     ExcelReaderConfig(ExcelFormatPlugin plugin) {
       this.plugin = plugin;
@@ -183,6 +189,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
       lastColumn = plugin.getConfig().getLastColumn();
       allTextMode = plugin.getConfig().getAllTextMode();
       sheetName = plugin.getConfig().getSheetName();
+      maxArraySize = plugin.getConfig().getMaxArraySize();
+      thresholdBytesForTempFiles = plugin.getConfig().getThresholdBytesForTempFiles();
+      useTempFilePackageParts = plugin.getConfig().isUseTempFilePackageParts();
     }
   }
 
@@ -230,6 +239,18 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
     try {
       fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
 
+      if (readerConfig.maxArraySize >= 0) {
+        IOUtils.setByteArrayMaxOverride(readerConfig.maxArraySize);
+      }
+
+      if (readerConfig.thresholdBytesForTempFiles >= 0) {
+        ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(readerConfig.thresholdBytesForTempFiles);
+      }
+
+      if (readerConfig.useTempFilePackageParts) {
+        ZipPackage.setUseTempFilePackageParts(readerConfig.useTempFilePackageParts);
+      }
+
       // Open streaming reader
       Workbook workbook = StreamingReader.builder()
         .rowCacheSize(ROW_CACHE_SIZE)
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
index 533b4b5f11..b662060108 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
@@ -27,6 +27,7 @@ import org.apache.drill.common.PlanStringBuilder;
 import org.apache.drill.common.logical.FormatPluginConfig;
 import org.apache.drill.exec.store.excel.ExcelBatchReader.ExcelReaderConfig;
 import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.poi.ss.SpreadsheetVersion;
 
 import java.util.Collections;
 import java.util.List;
@@ -37,7 +38,7 @@ import java.util.Objects;
 public class ExcelFormatConfig implements FormatPluginConfig {
 
   // This is the theoretical maximum number of rows in an Excel spreadsheet
-  private final int MAX_ROWS = 1_048_576;
+  private final int MAX_ROWS = SpreadsheetVersion.EXCEL2007.getMaxRows();
 
   private final List<String> extensions;
   private final int headerRow;
@@ -46,6 +47,9 @@ public class ExcelFormatConfig implements FormatPluginConfig {
   private final int lastColumn;
   private final boolean allTextMode;
   private final String sheetName;
+  private final int maxArraySize;
+  private final int thresholdBytesForTempFiles;
+  private final boolean useTempFilePackageParts;
 
   // Omitted properties take reasonable defaults
   @JsonCreator
@@ -56,7 +60,11 @@ public class ExcelFormatConfig implements FormatPluginConfig {
       @JsonProperty("firstColumn") Integer firstColumn,
       @JsonProperty("lastColumn") Integer lastColumn,
       @JsonProperty("allTextMode") Boolean allTextMode,
-      @JsonProperty("sheetName") String sheetName) {
+      @JsonProperty("sheetName") String sheetName,
+      @JsonProperty("maxArraySize") Integer maxArraySize,
+      @JsonProperty("thresholdBytesForTempFiles") Integer thresholdBytesForTempFiles,
+      @JsonProperty("useTempFilePackageParts") Boolean useTempFilePackageParts
+  ) {
     this.extensions = extensions == null
         ? Collections.singletonList("xlsx")
         : ImmutableList.copyOf(extensions);
@@ -67,6 +75,9 @@ public class ExcelFormatConfig implements FormatPluginConfig {
     this.lastColumn = lastColumn == null ? 0 : lastColumn;
     this.allTextMode = allTextMode == null ? false : allTextMode;
     this.sheetName = sheetName == null ? "" : sheetName;
+    this.maxArraySize = maxArraySize == null ? -1 : maxArraySize;
+    this.thresholdBytesForTempFiles = thresholdBytesForTempFiles == null ? -1 : thresholdBytesForTempFiles;
+    this.useTempFilePackageParts = useTempFilePackageParts == null ? false : useTempFilePackageParts;
   }
 
   @JsonInclude(JsonInclude.Include.NON_DEFAULT)
@@ -98,6 +109,34 @@ public class ExcelFormatConfig implements FormatPluginConfig {
     return sheetName;
   }
 
+  /**
+   * See the <code>setByteArrayMaxOverride</code> section in the Apache POI
+   * <a href="https://poi.apache.org/components/configuration.html">configuration</a> doc.
+   * @return max size of POI array (-1 means default limits applied)
+   */
+  public int getMaxArraySize() {
+    return maxArraySize;
+  }
+
+  /**
+   * See the <code>setThresholdBytesForTempFiles</code> section in the Apache POI
+   * <a href="https://poi.apache.org/components/configuration.html">configuration</a> doc.
+   * @return size at which xlsx parts are switched to temp file backing
+   * (-1 means no temp files for this feature - POI does use temp files in some other cases)
+   */
+  public int getThresholdBytesForTempFiles() {
+    return thresholdBytesForTempFiles;
+  }
+
+  /**
+   * See the <code>setUseTempFilePackageParts</code> section in the Apache POI
+   * <a href="https://poi.apache.org/components/configuration.html">configuration</a> doc.
+   * @return whether to use temp files for package parts (default is false)
+   */
+  public boolean isUseTempFilePackageParts() {
+    return useTempFilePackageParts;
+  }
+
   public ExcelReaderConfig getReaderConfig(ExcelFormatPlugin plugin) {
     ExcelReaderConfig readerConfig = new ExcelReaderConfig(plugin);
     return readerConfig;
@@ -124,7 +163,10 @@ public class ExcelFormatConfig implements FormatPluginConfig {
       && Objects.equals(firstColumn, other.firstColumn)
       && Objects.equals(lastColumn, other.lastColumn)
       && Objects.equals(allTextMode, other.allTextMode)
-      && Objects.equals(sheetName, other.sheetName);
+      && Objects.equals(sheetName, other.sheetName)
+      && Objects.equals(maxArraySize, other.maxArraySize)
+      && Objects.equals(thresholdBytesForTempFiles, other.thresholdBytesForTempFiles)
+      && Objects.equals(useTempFilePackageParts, other.useTempFilePackageParts);
   }
 
   @Override
@@ -137,6 +179,9 @@ public class ExcelFormatConfig implements FormatPluginConfig {
         .field("firstColumn", firstColumn)
         .field("lastColumn", lastColumn)
         .field("allTextMode", allTextMode)
+        .field("maxArraySize", maxArraySize)
+        .field("thresholdBytesForTempFiles", thresholdBytesForTempFiles)
+        .field("useTempFilePackageParts", useTempFilePackageParts)
         .toString();
   }
 }