You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by lu...@apache.org on 2022/05/29 07:21:26 UTC
[drill] branch master updated: DRILL-8149: large xlsx configs (#2483)
This is an automated email from the ASF dual-hosted git repository.
luoc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new 83802e3c4e DRILL-8149: large xlsx configs (#2483)
83802e3c4e is described below
commit 83802e3c4eaac7b7a05ae8e6ff11e619665dee3f
Author: PJ Fanning <pj...@users.noreply.github.com>
AuthorDate: Sun May 29 08:21:21 2022 +0100
DRILL-8149: large xlsx configs (#2483)
---
contrib/format-excel/README.md | 3 ++
.../drill/exec/store/excel/ExcelBatchReader.java | 21 +++++++++
.../drill/exec/store/excel/ExcelFormatConfig.java | 51 ++++++++++++++++++++--
3 files changed, 72 insertions(+), 3 deletions(-)
diff --git a/contrib/format-excel/README.md b/contrib/format-excel/README.md
index 17bb73bb27..207203c7fd 100644
--- a/contrib/format-excel/README.md
+++ b/contrib/format-excel/README.md
@@ -26,6 +26,9 @@ The plugin has many other configuration options listed below:
* `lastColumn`: To define a region within a spreadsheet, this is the right-most column index. This is indexed from one. If set to `0` Drill will read all available columns. This
is not inclusive, so if you ask for columns 2-5 you will get columns 2,3 and 4.
* `allTextMode`: When set to `true`, Drill will not attempt to infer column data types and will read everything as `VARCHAR`. Defaults to `false`;
+* `maxArraySize`: Overrides the default [POI config](https://poi.apache.org/components/configuration.html) for `setByteArrayMaxOverride(int maxOverride)`. May be needed with large Excel files.
+* `thresholdBytesForTempFiles`: Overrides the default [POI config](https://poi.apache.org/components/configuration.html) for `ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(int thresholdBytes)`. May be needed with large Excel files.
+* `useTempFilePackageParts`: Overrides the default [POI config](https://poi.apache.org/components/configuration.html) for `ZipPackage.setUseTempFilePackageParts(boolean tempFilePackageParts)`. May be needed with large Excel files.
## Usage
You can specify the configuration at runtime via the `table()` function or in the storage plugin configuration. For instance, if you just want to query an Excel file, you could
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index 9116f3d861..11a4c27d78 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -38,12 +38,15 @@ import org.apache.drill.exec.vector.accessor.ScalarWriter;
import org.apache.drill.exec.vector.accessor.TupleWriter;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.poi.ooxml.POIXMLProperties.CoreProperties;
+import org.apache.poi.openxml4j.opc.ZipPackage;
+import org.apache.poi.openxml4j.util.ZipInputStreamZipEntrySource;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.util.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -174,6 +177,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
final int lastColumn;
final boolean allTextMode;
final String sheetName;
+ final int maxArraySize;
+ final int thresholdBytesForTempFiles;
+ final boolean useTempFilePackageParts;
ExcelReaderConfig(ExcelFormatPlugin plugin) {
this.plugin = plugin;
@@ -183,6 +189,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
lastColumn = plugin.getConfig().getLastColumn();
allTextMode = plugin.getConfig().getAllTextMode();
sheetName = plugin.getConfig().getSheetName();
+ maxArraySize = plugin.getConfig().getMaxArraySize();
+ thresholdBytesForTempFiles = plugin.getConfig().getThresholdBytesForTempFiles();
+ useTempFilePackageParts = plugin.getConfig().isUseTempFilePackageParts();
}
}
@@ -230,6 +239,18 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
try {
fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
+ if (readerConfig.maxArraySize >= 0) {
+ IOUtils.setByteArrayMaxOverride(readerConfig.maxArraySize);
+ }
+
+ if (readerConfig.thresholdBytesForTempFiles >= 0) {
+ ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(readerConfig.thresholdBytesForTempFiles);
+ }
+
+ if (readerConfig.useTempFilePackageParts) {
+ ZipPackage.setUseTempFilePackageParts(readerConfig.useTempFilePackageParts);
+ }
+
// Open streaming reader
Workbook workbook = StreamingReader.builder()
.rowCacheSize(ROW_CACHE_SIZE)
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
index 533b4b5f11..b662060108 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
@@ -27,6 +27,7 @@ import org.apache.drill.common.PlanStringBuilder;
import org.apache.drill.common.logical.FormatPluginConfig;
import org.apache.drill.exec.store.excel.ExcelBatchReader.ExcelReaderConfig;
import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.poi.ss.SpreadsheetVersion;
import java.util.Collections;
import java.util.List;
@@ -37,7 +38,7 @@ import java.util.Objects;
public class ExcelFormatConfig implements FormatPluginConfig {
// This is the theoretical maximum number of rows in an Excel spreadsheet
- private final int MAX_ROWS = 1_048_576;
+ private final int MAX_ROWS = SpreadsheetVersion.EXCEL2007.getMaxRows();
private final List<String> extensions;
private final int headerRow;
@@ -46,6 +47,9 @@ public class ExcelFormatConfig implements FormatPluginConfig {
private final int lastColumn;
private final boolean allTextMode;
private final String sheetName;
+ private final int maxArraySize;
+ private final int thresholdBytesForTempFiles;
+ private final boolean useTempFilePackageParts;
// Omitted properties take reasonable defaults
@JsonCreator
@@ -56,7 +60,11 @@ public class ExcelFormatConfig implements FormatPluginConfig {
@JsonProperty("firstColumn") Integer firstColumn,
@JsonProperty("lastColumn") Integer lastColumn,
@JsonProperty("allTextMode") Boolean allTextMode,
- @JsonProperty("sheetName") String sheetName) {
+ @JsonProperty("sheetName") String sheetName,
+ @JsonProperty("maxArraySize") Integer maxArraySize,
+ @JsonProperty("thresholdBytesForTempFiles") Integer thresholdBytesForTempFiles,
+ @JsonProperty("useTempFilePackageParts") Boolean useTempFilePackageParts
+ ) {
this.extensions = extensions == null
? Collections.singletonList("xlsx")
: ImmutableList.copyOf(extensions);
@@ -67,6 +75,9 @@ public class ExcelFormatConfig implements FormatPluginConfig {
this.lastColumn = lastColumn == null ? 0 : lastColumn;
this.allTextMode = allTextMode == null ? false : allTextMode;
this.sheetName = sheetName == null ? "" : sheetName;
+ this.maxArraySize = maxArraySize == null ? -1 : maxArraySize;
+ this.thresholdBytesForTempFiles = thresholdBytesForTempFiles == null ? -1 : thresholdBytesForTempFiles;
+ this.useTempFilePackageParts = useTempFilePackageParts == null ? false : useTempFilePackageParts;
}
@JsonInclude(JsonInclude.Include.NON_DEFAULT)
@@ -98,6 +109,34 @@ public class ExcelFormatConfig implements FormatPluginConfig {
return sheetName;
}
+ /**
+ * See the <code>setByteArrayMaxOverride</code> section in the Apache POI
+ * <a href="https://poi.apache.org/components/configuration.html">configuration</a> doc.
+ * @return max size of POI array (-1 means default limits applied)
+ */
+ public int getMaxArraySize() {
+ return maxArraySize;
+ }
+
+ /**
+ * See the <code>setThresholdBytesForTempFiles</code> section in the Apache POI
+ * <a href="https://poi.apache.org/components/configuration.html">configuration</a> doc.
+ * @return size at which xlsx parts are switched to temp file backing
+ * (-1 means no temp files for this feature - POI does use temp files in some other cases)
+ */
+ public int getThresholdBytesForTempFiles() {
+ return thresholdBytesForTempFiles;
+ }
+
+ /**
+ * See the <code>setUseTempFilePackageParts</code> section in the Apache POI
+ * <a href="https://poi.apache.org/components/configuration.html">configuration</a> doc.
+ * @return whether to use temp files for package parts (default is false)
+ */
+ public boolean isUseTempFilePackageParts() {
+ return useTempFilePackageParts;
+ }
+
public ExcelReaderConfig getReaderConfig(ExcelFormatPlugin plugin) {
ExcelReaderConfig readerConfig = new ExcelReaderConfig(plugin);
return readerConfig;
@@ -124,7 +163,10 @@ public class ExcelFormatConfig implements FormatPluginConfig {
&& Objects.equals(firstColumn, other.firstColumn)
&& Objects.equals(lastColumn, other.lastColumn)
&& Objects.equals(allTextMode, other.allTextMode)
- && Objects.equals(sheetName, other.sheetName);
+ && Objects.equals(sheetName, other.sheetName)
+ && Objects.equals(maxArraySize, other.maxArraySize)
+ && Objects.equals(thresholdBytesForTempFiles, other.thresholdBytesForTempFiles)
+ && Objects.equals(useTempFilePackageParts, other.useTempFilePackageParts);
}
@Override
@@ -137,6 +179,9 @@ public class ExcelFormatConfig implements FormatPluginConfig {
.field("firstColumn", firstColumn)
.field("lastColumn", lastColumn)
.field("allTextMode", allTextMode)
+ .field("maxArraySize", maxArraySize)
+ .field("thresholdBytesForTempFiles", thresholdBytesForTempFiles)
+ .field("useTempFilePackageParts", useTempFilePackageParts)
.toString();
}
}