You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2021/12/09 02:56:41 UTC

[drill] branch master updated: DRILL-8070: format-excel assumes that rowIterator returns every row (#2399)

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 7ddfb08  DRILL-8070: format-excel assumes that rowIterator returns every row (#2399)
7ddfb08 is described below

commit 7ddfb08523ea9fcf0f2f4e6d1307cc9ca9c4c851
Author: PJ Fanning <pj...@users.noreply.github.com>
AuthorDate: Thu Dec 9 03:56:28 2021 +0100

    DRILL-8070: format-excel assumes that rowIterator returns every row (#2399)
    
    * DRILL-8070: format-excel assumes that rowIterator returns every row
    
    * add test
---
 .../drill/exec/store/excel/ExcelBatchReader.java   |  27 +++++++++++++--------
 .../drill/exec/store/excel/TestExcelFormat.java    |  22 +++++++++++++++++
 .../src/test/resources/excel/blank_rows.xlsx       | Bin 0 -> 10131 bytes
 3 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index ce39783..26e925f 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -272,7 +272,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
 
     // Get the number of columns.
     // This method also advances the row reader to the location of the first row of data
-    setFirstDataRow();
+    if (!setFirstDataRow()) {
+      return;
+    }
     totalColumnCount = finalSchema.size();
     firstLine = false;
 
@@ -299,7 +301,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
 
     // Get the number of columns.
     // This method also advances the row reader to the location of the first row of data
-    setFirstRow();
+    if (!setFirstRow()) {
+      return;
+    }
 
     excelFieldNames = new ArrayList<>();
     cellWriterArray = new ArrayList<>();
@@ -400,32 +404,28 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
    * There are a few gotchas here in that we have to know the header row and count the physical number of cells
    * in that row.  This function also has to move the rowIterator object to the first row of data.
    */
-  private void setFirstRow() {
+  private boolean setFirstRow() {
     // Initialize
     currentRow = rowIterator.next();
     int rowNumber = readerConfig.headerRow > 0 ? sheet.getFirstRowNum() : 0;
 
     // If the headerRow is greater than zero, advance the iterator to the first row of data
     // This is unfortunately necessary since the streaming reader eliminated the getRow() method.
-    for (int i = 1; i < rowNumber; i++) {
-      currentRow = rowIterator.next();
-    }
+    return skipToRow(rowNumber);
   }
 
   /**
    * This function is used to set the iterator to the first row of actual data.  When a schema is provided,
    * we can safely skip the header row, and start reading the first row of data.
    */
-  private void setFirstDataRow() {
+  private boolean setFirstDataRow() {
     // Initialize
     currentRow = rowIterator.next();
     int rowNumber = readerConfig.headerRow > 0 ? sheet.getFirstRowNum() : 0;
 
     // If the headerRow is greater than zero, advance the iterator to the first row of data
     // This is unfortunately necessary since the streaming reader eliminated the getRow() method.
-    for (int i = 0; i <= rowNumber; i++) {
-      currentRow = rowIterator.next();
-    }
+    return skipToRow(rowNumber + 1);
   }
 
   @Override
@@ -439,6 +439,13 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
     return true;
   }
 
+  private boolean skipToRow(int minRowNum) {
+    while (currentRow.getRowNum() < minRowNum && rowIterator.hasNext()) {
+      currentRow = rowIterator.next();
+    }
+    return currentRow.getRowNum() >= minRowNum;
+  }
+
   private boolean nextLine(RowSetLoader rowWriter) {
     if (rowIterator == null) {
       rowIterator = sheet.rowIterator();
diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index 3f66450..1a65512 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -266,6 +266,28 @@ public class TestExcelFormat extends ClusterTest {
   }
 
   @Test
+  public void testHandleMissingRows() throws RpcException {
+    String sql = "SELECT id, first_name, order_count FROM table(cp.`excel/blank_rows.xlsx` (type => 'excel', sheetName => 'data', headerRow => 3))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("id", TypeProtos.MinorType.FLOAT8)
+      .addNullable("first_name", TypeProtos.MinorType.VARCHAR)
+      .addNullable("order_count", TypeProtos.MinorType.FLOAT8)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1.0, "Cornelia", 22.0)
+      .addRow(2.0, "Nydia", 22.0)
+      .addRow(3.0, "Waiter", 17.0)
+      .addRow(4.0, "Cicely", 6.0)
+      .addRow(5.0, "Dorie", 17.0)
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
   public void testExplicitWithSpacesInColHeader() throws RpcException {
     String sql = "SELECT col1, col2 FROM table(cp.`excel/test_data.xlsx` (type => 'excel', sheetName => 'spaceInColHeader'))";
 
diff --git a/contrib/format-excel/src/test/resources/excel/blank_rows.xlsx b/contrib/format-excel/src/test/resources/excel/blank_rows.xlsx
new file mode 100644
index 0000000..43e8135
Binary files /dev/null and b/contrib/format-excel/src/test/resources/excel/blank_rows.xlsx differ