You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2022/06/17 20:38:55 UTC
[drill] branch master updated: DRILL-8199: Convert Excel EVF1 to EVF2 (#2569)
This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new 11164e9169 DRILL-8199: Convert Excel EVF1 to EVF2 (#2569)
11164e9169 is described below
commit 11164e9169db94ce0b4def508042007bc4207312
Author: James Turton <91...@users.noreply.github.com>
AuthorDate: Fri Jun 17 22:38:49 2022 +0200
DRILL-8199: Convert Excel EVF1 to EVF2 (#2569)
* Upgrade Excel format plugin to EVF 2.
* Fix implict column exclusion from wildcard projection in EVF2.
---
.../drill/exec/store/excel/ExcelBatchReader.java | 60 +++++--------
.../drill/exec/store/excel/ExcelFormatConfig.java | 48 +++++++++++
.../drill/exec/store/excel/ExcelFormatPlugin.java | 98 +++++-----------------
.../impl/scan/v3/schema/DynamicSchemaFilter.java | 3 +
4 files changed, 93 insertions(+), 116 deletions(-)
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index 11a4c27d78..cbdc144711 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -24,9 +24,9 @@ import org.apache.drill.common.exceptions.CustomErrorContext;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.types.TypeProtos.DataMode;
import org.apache.drill.common.types.TypeProtos.MinorType;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
-import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileDescrip;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader;
import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
import org.apache.drill.exec.physical.resultSet.RowSetLoader;
import org.apache.drill.exec.record.MaterializedField;
@@ -34,9 +34,9 @@ import org.apache.drill.exec.record.metadata.ColumnMetadata;
import org.apache.drill.exec.record.metadata.MetadataUtils;
import org.apache.drill.exec.record.metadata.SchemaBuilder;
import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
import org.apache.drill.exec.vector.accessor.ScalarWriter;
import org.apache.drill.exec.vector.accessor.TupleWriter;
-import org.apache.hadoop.mapred.FileSplit;
import org.apache.poi.ooxml.POIXMLProperties.CoreProperties;
import org.apache.poi.openxml4j.opc.ZipPackage;
import org.apache.poi.openxml4j.util.ZipInputStreamZipEntrySource;
@@ -65,7 +65,7 @@ import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
+public class ExcelBatchReader implements ManagedReader {
private static final Logger logger = LoggerFactory.getLogger(ExcelBatchReader.class);
@@ -145,8 +145,10 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
private static final int BUFFER_SIZE = 4096;
private final ExcelReaderConfig readerConfig;
- private final int maxRecords;
private final TreeSet<String> columnNameChecker;
+ private final RowSetLoader rowWriter;
+ private final CustomErrorContext errorContext;
+
private Sheet sheet;
private Row currentRow;
private StreamingWorkbook streamingWorkbook;
@@ -157,17 +159,13 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
private List<ScalarWriter> metadataColumnWriters;
private ScalarWriter sheetNameWriter;
private Iterator<Row> rowIterator;
- private RowSetLoader rowWriter;
+ private FileDescrip file;
private int totalColumnCount;
private boolean firstLine;
- private FileSplit split;
private int recordCount;
private Map<String, String> stringMetadata;
private Map<String, Date> dateMetadata;
private Map<String, List<String>> listMetadata;
- private CustomErrorContext errorContext;
-
-
static class ExcelReaderConfig {
final ExcelFormatPlugin plugin;
@@ -195,49 +193,43 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
}
}
- public ExcelBatchReader(ExcelReaderConfig readerConfig, int maxRecords) {
+ public ExcelBatchReader(ExcelReaderConfig readerConfig, EasySubScan scan, FileSchemaNegotiator negotiator) {
+ this.errorContext = negotiator.parentErrorContext();
this.readerConfig = readerConfig;
- this.maxRecords = maxRecords;
- this.columnNameChecker = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
- firstLine = true;
- }
+ this.file = negotiator.file();
- @Override
- public boolean open(FileSchemaNegotiator negotiator) {
- split = negotiator.split();
- errorContext = negotiator.parentErrorContext();
ResultSetLoader loader = negotiator.build();
- rowWriter = loader.writer();
- openFile(negotiator);
+ this.rowWriter = loader.writer();
+ this.columnNameChecker = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
+ this.firstLine = true;
- if (negotiator.hasProvidedSchema()) {
+ openFile();
+ if (negotiator.providedSchema() != null) {
TupleMetadata providedSchema = negotiator.providedSchema();
- logger.debug("Found inline schema");
+ logger.debug("Found a provided schema");
// Add Implicit columns to schema
SchemaBuilder builderForProvidedSchema = new SchemaBuilder();
builderForProvidedSchema.addAll(providedSchema);
TupleMetadata finalSchema = builderForProvidedSchema.build();
buildColumnWritersFromProvidedSchema(finalSchema);
-
// Add schema to file negotiator
logger.debug("Metadata added to provided schema.");
addMetadataToSchema(builderForProvidedSchema);
+
// Build column writer array
negotiator.tableSchema(finalSchema, true);
} else {
defineSchema(negotiator);
}
- return true;
}
/**
* This method opens the Excel file, initializes the Streaming Excel Reader, and initializes the sheet variable.
- * @param negotiator The Drill file negotiator object that represents the file system
*/
- private void openFile(FileScanFramework.FileSchemaNegotiator negotiator) {
+ private void openFile() {
try {
- fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
+ fsStream = file.fileSystem().openPossiblyCompressedStream(file.split().getPath());
if (readerConfig.maxArraySize >= 0) {
IOUtils.setByteArrayMaxOverride(readerConfig.maxArraySize);
@@ -263,7 +255,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
} catch (Exception e) {
throw UserException
.dataReadError(e)
- .message("Failed to open open input file: %s", split.getPath().toString())
+ .message("Failed to open open input file: %s", file.split().getPath().toString())
.addContext(e.getMessage())
.addContext(errorContext)
.build(logger);
@@ -518,10 +510,6 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
finalColumn = readerConfig.lastColumn - 1;
}
- if (rowWriter.limitReached(maxRecords)) {
- return false;
- }
-
rowWriter.start();
for (int colWriterIndex = 0; colPosition < finalColumn; colWriterIndex++) {
Cell cell = currentRow.getCell(colPosition);
@@ -679,9 +667,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
int index = rowWriter.tupleSchema().index(name);
if (index == -1) {
ColumnMetadata colSchema = MetadataUtils.newScalar(name, type, DataMode.OPTIONAL);
- if (isMetadata) {
- colSchema.setBooleanProperty(ColumnMetadata.EXCLUDE_FROM_WILDCARD, true);
- }
+ colSchema.setBooleanProperty(ColumnMetadata.EXCLUDE_FROM_WILDCARD, isMetadata);
index = rowWriter.addColumn(colSchema);
} else {
return;
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
index b662060108..a78f026619 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
@@ -24,10 +24,13 @@ import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.logical.FormatPluginConfig;
import org.apache.drill.exec.store.excel.ExcelBatchReader.ExcelReaderConfig;
import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
import org.apache.poi.ss.SpreadsheetVersion;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.util.Collections;
import java.util.List;
@@ -37,6 +40,8 @@ import java.util.Objects;
@JsonInclude(JsonInclude.Include.NON_DEFAULT)
public class ExcelFormatConfig implements FormatPluginConfig {
+ private static final Logger logger = LoggerFactory.getLogger(ExcelFormatPlugin.class);
+
// This is the theoretical maximum number of rows in an Excel spreadsheet
private final int MAX_ROWS = SpreadsheetVersion.EXCEL2007.getMaxRows();
@@ -78,6 +83,7 @@ public class ExcelFormatConfig implements FormatPluginConfig {
this.maxArraySize = maxArraySize == null ? -1 : maxArraySize;
this.thresholdBytesForTempFiles = thresholdBytesForTempFiles == null ? -1 : thresholdBytesForTempFiles;
this.useTempFilePackageParts = useTempFilePackageParts == null ? false : useTempFilePackageParts;
+ validateConfig();
}
@JsonInclude(JsonInclude.Include.NON_DEFAULT)
@@ -184,4 +190,46 @@ public class ExcelFormatConfig implements FormatPluginConfig {
.field("useTempFilePackageParts", useTempFilePackageParts)
.toString();
}
+
+ /**
+ * This function validates that the user entered valid user configuration options. Specifically it verifies that:
+ * <ul>
+ * <li>the lastColumn is greater than the first column</li>
+ * <li>The lastColumn is not zero</li>
+ * <li>firstColumn is greater than zero</li>
+ * <li>lastColumn is greater than zero</li>
+ * <li>The headerRow index is less than the lastRow index</li>
+ * </ul>
+ *
+ */
+ private void validateConfig() {
+ // Validate the config variables
+ if ((lastColumn < firstColumn) && lastColumn != 0) {
+ throw UserException
+ .validationError()
+ .message("Invalid column configuration. The first column index is greater than the last column index.")
+ .build(logger);
+ }
+
+ if (firstColumn < 0) {
+ throw UserException
+ .validationError()
+ .message("Invalid value for first column. Index must be greater than zero.")
+ .build(logger);
+ }
+
+ if (lastColumn < 0) {
+ throw UserException
+ .validationError()
+ .message("Invalid value for last column. Index must be greater than zero.")
+ .build(logger);
+ }
+
+ if (headerRow > lastRow) {
+ throw UserException
+ .validationError()
+ .message("Invalid value for headerRow. Header row must be less than last row.")
+ .build(logger);
+ }
+ }
}
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatPlugin.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatPlugin.java
index e37027bb61..3436baee04 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatPlugin.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatPlugin.java
@@ -18,42 +18,37 @@
package org.apache.drill.exec.store.excel;
-import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.logical.StoragePluginConfig;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.Types;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileReaderFactory;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileScanLifecycleBuilder;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator;
-import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader.EarlyEofException;
import org.apache.drill.exec.server.DrillbitContext;
-import org.apache.drill.exec.server.options.OptionSet;
import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
import org.apache.drill.exec.store.dfs.easy.EasySubScan;
import org.apache.hadoop.conf.Configuration;
-import org.apache.drill.exec.store.excel.ExcelBatchReader.ExcelReaderConfig;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
public class ExcelFormatPlugin extends EasyFormatPlugin<ExcelFormatConfig> {
- protected static final String DEFAULT_NAME = "excel";
- private static final Logger logger = LoggerFactory.getLogger(ExcelFormatPlugin.class);
+ public static final String DEFAULT_NAME = "excel";
+ public static final String OPERATOR_TYPE = "EXCEL_SUB_SCAN";
private static class ExcelReaderFactory extends FileReaderFactory {
private final ExcelBatchReader.ExcelReaderConfig readerConfig;
- private final int maxRecords;
+ private final EasySubScan scan;
- public ExcelReaderFactory(ExcelReaderConfig config, int maxRecords) {
- readerConfig = config;
- this.maxRecords = maxRecords;
+ public ExcelReaderFactory(ExcelBatchReader.ExcelReaderConfig config, EasySubScan scan) {
+ this.readerConfig = config;
+ this.scan = scan;
}
@Override
- public ManagedReader<? extends FileSchemaNegotiator> newReader() {
- return new ExcelBatchReader(readerConfig, maxRecords);
+ public ManagedReader newReader(FileSchemaNegotiator negotiator) throws EarlyEofException {
+ return new ExcelBatchReader(readerConfig, scan, negotiator);
}
}
@@ -69,74 +64,19 @@ public class ExcelFormatPlugin extends EasyFormatPlugin<ExcelFormatConfig> {
.writable(false)
.blockSplittable(false)
.compressible(true)
- .supportsProjectPushdown(true)
.extensions(pluginConfig.getExtensions())
.fsConf(fsConf)
- .defaultName(DEFAULT_NAME)
- .scanVersion(ScanFrameworkVersion.EVF_V1)
+ .readerOperatorType(OPERATOR_TYPE)
+ .defaultName(ExcelFormatPlugin.DEFAULT_NAME)
+ .scanVersion(ScanFrameworkVersion.EVF_V2)
+ .supportsProjectPushdown(true)
.supportsLimitPushdown(true)
.build();
}
@Override
- public ManagedReader<? extends FileSchemaNegotiator> newBatchReader(
- EasySubScan scan, OptionSet options) {
- return new ExcelBatchReader(formatConfig.getReaderConfig(this), scan.getMaxRecords());
- }
-
- @Override
- protected FileScanBuilder frameworkBuilder(EasySubScan scan, OptionSet options) {
- FileScanBuilder builder = new FileScanBuilder();
- ExcelReaderConfig readerConfig = new ExcelReaderConfig(this);
-
- verifyConfigOptions(readerConfig);
- builder.setReaderFactory(new ExcelReaderFactory(readerConfig, scan.getMaxRecords()));
-
- initScanBuilder(builder, scan);
+ protected void configureScan(FileScanLifecycleBuilder builder, EasySubScan scan) {
builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR));
- return builder;
- }
-
- /**
- * This function verifies that the user entered valid user configuration options. Specifically it verifies that:
- * <ul>
- * <li>the lastColumn is greater than the first column</li>
- * <li>The lastColumn is not zero</li>
- * <li>firstColumn is greater than zero</li>
- * <li>lastColumn is greater than zero</li>
- * <li>The headerRow index is less than the lastRow index</li>
- * </ul>
- *
- * @param readerConfig The readerConfig object for which the function will verify the config options
- */
- private void verifyConfigOptions(ExcelReaderConfig readerConfig) {
- // Validate the config variables
- if ((readerConfig.lastColumn < readerConfig.firstColumn) && readerConfig.lastColumn != 0) {
- throw UserException
- .validationError()
- .message("Invalid column configuration. The first column index is greater than the last column index.")
- .build(logger);
- }
-
- if (readerConfig.firstColumn < 0) {
- throw UserException
- .validationError()
- .message("Invalid value for first column. Index must be greater than zero.")
- .build(logger);
- }
-
- if (readerConfig.lastColumn < 0) {
- throw UserException
- .validationError()
- .message("Invalid value for last column. Index must be greater than zero.")
- .build(logger);
- }
-
- if (readerConfig.headerRow > readerConfig.lastRow) {
- throw UserException
- .validationError()
- .message("Invalid value for headerRow. Header row must be less than last row.")
- .build(logger);
- }
+ builder.readerFactory(new ExcelReaderFactory(formatConfig.getReaderConfig(this), scan));
}
}
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/schema/DynamicSchemaFilter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/schema/DynamicSchemaFilter.java
index 687176534f..cff86c61db 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/schema/DynamicSchemaFilter.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/schema/DynamicSchemaFilter.java
@@ -232,6 +232,9 @@ public abstract class DynamicSchemaFilter implements ProjectionFilter {
@Override
public ProjResult projection(ColumnMetadata col) {
ColumnHandle handle = schema.find(col.name());
+ if (SchemaUtils.isExcludedFromWildcard(col) && handle == null) {
+ return NOT_PROJECTED;
+ }
if (handle == null) {
return newColumnProjection();
}