You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2022/06/17 20:38:55 UTC

[drill] branch master updated: DRILL-8199: Convert Excel EVF1 to EVF2 (#2569)

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 11164e9169 DRILL-8199: Convert Excel EVF1 to EVF2 (#2569)
11164e9169 is described below

commit 11164e9169db94ce0b4def508042007bc4207312
Author: James Turton <91...@users.noreply.github.com>
AuthorDate: Fri Jun 17 22:38:49 2022 +0200

    DRILL-8199: Convert Excel EVF1 to EVF2 (#2569)
    
    * Upgrade Excel format plugin to EVF 2.
    
    * Fix implict column exclusion from wildcard projection in EVF2.
---
 .../drill/exec/store/excel/ExcelBatchReader.java   | 60 +++++--------
 .../drill/exec/store/excel/ExcelFormatConfig.java  | 48 +++++++++++
 .../drill/exec/store/excel/ExcelFormatPlugin.java  | 98 +++++-----------------
 .../impl/scan/v3/schema/DynamicSchemaFilter.java   |  3 +
 4 files changed, 93 insertions(+), 116 deletions(-)

diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index 11a4c27d78..cbdc144711 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -24,9 +24,9 @@ import org.apache.drill.common.exceptions.CustomErrorContext;
 import org.apache.drill.common.exceptions.UserException;
 import org.apache.drill.common.types.TypeProtos.DataMode;
 import org.apache.drill.common.types.TypeProtos.MinorType;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
-import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileDescrip;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader;
 import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
 import org.apache.drill.exec.physical.resultSet.RowSetLoader;
 import org.apache.drill.exec.record.MaterializedField;
@@ -34,9 +34,9 @@ import org.apache.drill.exec.record.metadata.ColumnMetadata;
 import org.apache.drill.exec.record.metadata.MetadataUtils;
 import org.apache.drill.exec.record.metadata.SchemaBuilder;
 import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
 import org.apache.drill.exec.vector.accessor.ScalarWriter;
 import org.apache.drill.exec.vector.accessor.TupleWriter;
-import org.apache.hadoop.mapred.FileSplit;
 import org.apache.poi.ooxml.POIXMLProperties.CoreProperties;
 import org.apache.poi.openxml4j.opc.ZipPackage;
 import org.apache.poi.openxml4j.util.ZipInputStreamZipEntrySource;
@@ -65,7 +65,7 @@ import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
+public class ExcelBatchReader implements ManagedReader {
 
   private static final Logger logger = LoggerFactory.getLogger(ExcelBatchReader.class);
 
@@ -145,8 +145,10 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
   private static final int BUFFER_SIZE = 4096;
 
   private final ExcelReaderConfig readerConfig;
-  private final int maxRecords;
   private final TreeSet<String> columnNameChecker;
+  private final RowSetLoader rowWriter;
+  private final CustomErrorContext errorContext;
+
   private Sheet sheet;
   private Row currentRow;
   private StreamingWorkbook streamingWorkbook;
@@ -157,17 +159,13 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
   private List<ScalarWriter> metadataColumnWriters;
   private ScalarWriter sheetNameWriter;
   private Iterator<Row> rowIterator;
-  private RowSetLoader rowWriter;
+  private FileDescrip file;
   private int totalColumnCount;
   private boolean firstLine;
-  private FileSplit split;
   private int recordCount;
   private Map<String, String> stringMetadata;
   private Map<String, Date> dateMetadata;
   private Map<String, List<String>> listMetadata;
-  private CustomErrorContext errorContext;
-
-
 
   static class ExcelReaderConfig {
     final ExcelFormatPlugin plugin;
@@ -195,49 +193,43 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
     }
   }
 
-  public ExcelBatchReader(ExcelReaderConfig readerConfig, int maxRecords) {
+  public ExcelBatchReader(ExcelReaderConfig readerConfig, EasySubScan scan, FileSchemaNegotiator negotiator) {
+    this.errorContext = negotiator.parentErrorContext();
     this.readerConfig = readerConfig;
-    this.maxRecords = maxRecords;
-    this.columnNameChecker = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
-    firstLine = true;
-  }
+    this.file = negotiator.file();
 
-  @Override
-  public boolean open(FileSchemaNegotiator negotiator) {
-    split = negotiator.split();
-    errorContext = negotiator.parentErrorContext();
     ResultSetLoader loader = negotiator.build();
-    rowWriter = loader.writer();
-    openFile(negotiator);
+    this.rowWriter = loader.writer();
+    this.columnNameChecker = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
+    this.firstLine = true;
 
-    if (negotiator.hasProvidedSchema()) {
+    openFile();
+    if (negotiator.providedSchema() != null) {
       TupleMetadata providedSchema = negotiator.providedSchema();
-      logger.debug("Found inline schema");
+      logger.debug("Found a provided schema");
 
       // Add Implicit columns to schema
       SchemaBuilder builderForProvidedSchema = new SchemaBuilder();
       builderForProvidedSchema.addAll(providedSchema);
       TupleMetadata finalSchema = builderForProvidedSchema.build();
       buildColumnWritersFromProvidedSchema(finalSchema);
-
       // Add schema to file negotiator
       logger.debug("Metadata added to provided schema.");
       addMetadataToSchema(builderForProvidedSchema);
+
       // Build column writer array
       negotiator.tableSchema(finalSchema, true);
     } else {
       defineSchema(negotiator);
     }
-    return true;
   }
 
   /**
    * This method opens the Excel file, initializes the Streaming Excel Reader, and initializes the sheet variable.
-   * @param negotiator The Drill file negotiator object that represents the file system
    */
-  private void openFile(FileScanFramework.FileSchemaNegotiator negotiator) {
+  private void openFile() {
     try {
-      fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
+      fsStream = file.fileSystem().openPossiblyCompressedStream(file.split().getPath());
 
       if (readerConfig.maxArraySize >= 0) {
         IOUtils.setByteArrayMaxOverride(readerConfig.maxArraySize);
@@ -263,7 +255,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
     } catch (Exception e) {
       throw UserException
         .dataReadError(e)
-        .message("Failed to open open input file: %s", split.getPath().toString())
+        .message("Failed to open open input file: %s", file.split().getPath().toString())
         .addContext(e.getMessage())
         .addContext(errorContext)
         .build(logger);
@@ -518,10 +510,6 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
       finalColumn = readerConfig.lastColumn - 1;
     }
 
-    if (rowWriter.limitReached(maxRecords)) {
-      return false;
-    }
-
     rowWriter.start();
     for (int colWriterIndex = 0; colPosition < finalColumn; colWriterIndex++) {
       Cell cell = currentRow.getCell(colPosition);
@@ -679,9 +667,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
     int index = rowWriter.tupleSchema().index(name);
     if (index == -1) {
       ColumnMetadata colSchema = MetadataUtils.newScalar(name, type, DataMode.OPTIONAL);
-      if (isMetadata) {
-        colSchema.setBooleanProperty(ColumnMetadata.EXCLUDE_FROM_WILDCARD, true);
-      }
+      colSchema.setBooleanProperty(ColumnMetadata.EXCLUDE_FROM_WILDCARD, isMetadata);
       index = rowWriter.addColumn(colSchema);
     } else {
       return;
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
index b662060108..a78f026619 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java
@@ -24,10 +24,13 @@ import com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.annotation.JsonTypeName;
 
 import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
 import org.apache.drill.common.logical.FormatPluginConfig;
 import org.apache.drill.exec.store.excel.ExcelBatchReader.ExcelReaderConfig;
 import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
 import org.apache.poi.ss.SpreadsheetVersion;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.Collections;
 import java.util.List;
@@ -37,6 +40,8 @@ import java.util.Objects;
 @JsonInclude(JsonInclude.Include.NON_DEFAULT)
 public class ExcelFormatConfig implements FormatPluginConfig {
 
+  private static final Logger logger = LoggerFactory.getLogger(ExcelFormatPlugin.class);
+
   // This is the theoretical maximum number of rows in an Excel spreadsheet
   private final int MAX_ROWS = SpreadsheetVersion.EXCEL2007.getMaxRows();
 
@@ -78,6 +83,7 @@ public class ExcelFormatConfig implements FormatPluginConfig {
     this.maxArraySize = maxArraySize == null ? -1 : maxArraySize;
     this.thresholdBytesForTempFiles = thresholdBytesForTempFiles == null ? -1 : thresholdBytesForTempFiles;
     this.useTempFilePackageParts = useTempFilePackageParts == null ? false : useTempFilePackageParts;
+    validateConfig();
   }
 
   @JsonInclude(JsonInclude.Include.NON_DEFAULT)
@@ -184,4 +190,46 @@ public class ExcelFormatConfig implements FormatPluginConfig {
         .field("useTempFilePackageParts", useTempFilePackageParts)
         .toString();
   }
+
+  /**
+   * This function validates that the user entered valid user configuration options.  Specifically it verifies that:
+   * <ul>
+   * <li>the lastColumn is greater than the first column</li>
+   * <li>The lastColumn is not zero</li>
+   * <li>firstColumn is greater than zero</li>
+   * <li>lastColumn is greater than zero</li>
+   * <li>The headerRow index is less than the lastRow index</li>
+   * </ul>
+   *
+   */
+  private void validateConfig() {
+    // Validate the config variables
+    if ((lastColumn < firstColumn) && lastColumn != 0) {
+      throw UserException
+        .validationError()
+        .message("Invalid column configuration. The first column index is greater than the last column index.")
+        .build(logger);
+    }
+
+    if (firstColumn < 0) {
+      throw UserException
+        .validationError()
+        .message("Invalid value for first column. Index must be greater than zero.")
+        .build(logger);
+    }
+
+    if (lastColumn < 0) {
+      throw UserException
+        .validationError()
+        .message("Invalid value for last column. Index must be greater than zero.")
+        .build(logger);
+    }
+
+    if (headerRow > lastRow) {
+      throw UserException
+        .validationError()
+        .message("Invalid value for headerRow. Header row must be less than last row.")
+        .build(logger);
+    }
+  }
 }
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatPlugin.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatPlugin.java
index e37027bb61..3436baee04 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatPlugin.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatPlugin.java
@@ -18,42 +18,37 @@
 
 package org.apache.drill.exec.store.excel;
 
-import org.apache.drill.common.exceptions.UserException;
 import org.apache.drill.common.logical.StoragePluginConfig;
 import org.apache.drill.common.types.TypeProtos;
 import org.apache.drill.common.types.Types;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder;
-import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileReaderFactory;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileScanLifecycleBuilder;
+import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator;
 
-import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader.EarlyEofException;
 import org.apache.drill.exec.server.DrillbitContext;
-import org.apache.drill.exec.server.options.OptionSet;
 import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
 import org.apache.drill.exec.store.dfs.easy.EasySubScan;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.drill.exec.store.excel.ExcelBatchReader.ExcelReaderConfig;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 
 public class ExcelFormatPlugin extends EasyFormatPlugin<ExcelFormatConfig> {
 
-  protected static final String DEFAULT_NAME = "excel";
-  private static final Logger logger = LoggerFactory.getLogger(ExcelFormatPlugin.class);
+  public static final String DEFAULT_NAME = "excel";
+  public static final String OPERATOR_TYPE = "EXCEL_SUB_SCAN";
 
   private static class ExcelReaderFactory extends FileReaderFactory {
     private final ExcelBatchReader.ExcelReaderConfig readerConfig;
-    private final int maxRecords;
+    private final EasySubScan scan;
 
-    public ExcelReaderFactory(ExcelReaderConfig config, int maxRecords) {
-      readerConfig = config;
-      this.maxRecords = maxRecords;
+    public ExcelReaderFactory(ExcelBatchReader.ExcelReaderConfig config, EasySubScan scan) {
+      this.readerConfig = config;
+      this.scan = scan;
     }
 
     @Override
-    public ManagedReader<? extends FileSchemaNegotiator> newReader() {
-      return new ExcelBatchReader(readerConfig, maxRecords);
+    public ManagedReader newReader(FileSchemaNegotiator negotiator) throws EarlyEofException {
+      return new ExcelBatchReader(readerConfig, scan, negotiator);
     }
   }
 
@@ -69,74 +64,19 @@ public class ExcelFormatPlugin extends EasyFormatPlugin<ExcelFormatConfig> {
         .writable(false)
         .blockSplittable(false)
         .compressible(true)
-        .supportsProjectPushdown(true)
         .extensions(pluginConfig.getExtensions())
         .fsConf(fsConf)
-        .defaultName(DEFAULT_NAME)
-        .scanVersion(ScanFrameworkVersion.EVF_V1)
+        .readerOperatorType(OPERATOR_TYPE)
+        .defaultName(ExcelFormatPlugin.DEFAULT_NAME)
+        .scanVersion(ScanFrameworkVersion.EVF_V2)
+        .supportsProjectPushdown(true)
         .supportsLimitPushdown(true)
         .build();
   }
 
   @Override
-  public ManagedReader<? extends FileSchemaNegotiator> newBatchReader(
-    EasySubScan scan, OptionSet options) {
-    return new ExcelBatchReader(formatConfig.getReaderConfig(this), scan.getMaxRecords());
-  }
-
-  @Override
-  protected FileScanBuilder frameworkBuilder(EasySubScan scan, OptionSet options) {
-    FileScanBuilder builder = new FileScanBuilder();
-    ExcelReaderConfig readerConfig = new ExcelReaderConfig(this);
-
-    verifyConfigOptions(readerConfig);
-    builder.setReaderFactory(new ExcelReaderFactory(readerConfig, scan.getMaxRecords()));
-
-    initScanBuilder(builder, scan);
+  protected void configureScan(FileScanLifecycleBuilder builder, EasySubScan scan) {
     builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR));
-    return builder;
-  }
-
-  /**
-   * This function verifies that the user entered valid user configuration options.  Specifically it verifies that:
-   * <ul>
-   * <li>the lastColumn is greater than the first column</li>
-   * <li>The lastColumn is not zero</li>
-   * <li>firstColumn is greater than zero</li>
-   * <li>lastColumn is greater than zero</li>
-   * <li>The headerRow index is less than the lastRow index</li>
-   * </ul>
-   *
-   * @param readerConfig The readerConfig object for which the function will verify the config options
-   */
-  private void verifyConfigOptions(ExcelReaderConfig readerConfig) {
-    // Validate the config variables
-    if ((readerConfig.lastColumn < readerConfig.firstColumn) && readerConfig.lastColumn != 0) {
-      throw UserException
-        .validationError()
-        .message("Invalid column configuration. The first column index is greater than the last column index.")
-        .build(logger);
-    }
-
-    if (readerConfig.firstColumn < 0) {
-      throw UserException
-        .validationError()
-        .message("Invalid value for first column. Index must be greater than zero.")
-        .build(logger);
-    }
-
-    if (readerConfig.lastColumn < 0) {
-      throw UserException
-        .validationError()
-        .message("Invalid value for last column. Index must be greater than zero.")
-        .build(logger);
-    }
-
-    if (readerConfig.headerRow > readerConfig.lastRow) {
-      throw UserException
-        .validationError()
-        .message("Invalid value for headerRow. Header row must be less than last row.")
-        .build(logger);
-    }
+    builder.readerFactory(new ExcelReaderFactory(formatConfig.getReaderConfig(this), scan));
   }
 }
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/schema/DynamicSchemaFilter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/schema/DynamicSchemaFilter.java
index 687176534f..cff86c61db 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/schema/DynamicSchemaFilter.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/schema/DynamicSchemaFilter.java
@@ -232,6 +232,9 @@ public abstract class DynamicSchemaFilter implements ProjectionFilter {
     @Override
     public ProjResult projection(ColumnMetadata col) {
       ColumnHandle handle = schema.find(col.name());
+      if (SchemaUtils.isExcludedFromWildcard(col) && handle == null) {
+        return NOT_PROJECTED;
+      }
       if (handle == null) {
         return newColumnProjection();
       }