You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2021/10/05 03:18:49 UTC

[drill] branch master updated: DRILL-8006: Leading and Trailing Whitespace Causes Query Failures in Excel Files (#2325)

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new bad5e66  DRILL-8006: Leading and Trailing Whitespace Causes Query Failures in Excel Files (#2325)
bad5e66 is described below

commit bad5e669d5241b56d34560e93bd93e670785e388
Author: Charles S. Givre <cg...@apache.org>
AuthorDate: Mon Oct 4 23:18:02 2021 -0400

    DRILL-8006: Leading and Trailing Whitespace Causes Query Failures in Excel Files (#2325)
    
    * Initial Commit
    
    * Delete logback-test.xml
---
 contrib/format-excel/pom.xml                       |   2 +-
 .../drill/exec/store/excel/ExcelBatchReader.java   |   8 +++++++-
 .../drill/exec/store/excel/TestExcelFormat.java    |  22 +++++++++++++++++++--
 .../src/test/resources/excel/test_data.xlsx        | Bin 18077 -> 19559 bytes
 4 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/contrib/format-excel/pom.xml b/contrib/format-excel/pom.xml
index 7bc153f..5e831bb 100644
--- a/contrib/format-excel/pom.xml
+++ b/contrib/format-excel/pom.xml
@@ -67,7 +67,7 @@
     <dependency>
       <groupId>com.github.pjfanning</groupId>
       <artifactId>excel-streaming-reader</artifactId>
-      <version>3.0.4</version>
+      <version>3.1.2</version>
     </dependency>
   </dependencies>
   <build>
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index 73d634c..d43cdfd 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -249,7 +249,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
     rowIterator = sheet.iterator();
 
     // Get the number of columns.
-    // This menthod also advances the row reader to the location of the first row of data
+    // This method also advances the row reader to the location of the first row of data
     setFirstRow();
 
     excelFieldNames = new ArrayList<>();
@@ -300,6 +300,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
               .replace(PARSER_WILDCARD, SAFE_WILDCARD)
               .replaceAll("\\.", SAFE_SEPARATOR)
               .replaceAll("\\n", HEADER_NEW_LINE_REPLACEMENT);
+
+            // Remove leading and trailing whitespace
+            tempColumnName = tempColumnName.trim();
             makeColumn(builder, tempColumnName, TypeProtos.MinorType.VARCHAR);
             excelFieldNames.add(colPosition, tempColumnName);
             break;
@@ -308,6 +311,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
           case _NONE:
           case BLANK:
             tempColumnName = cell.getStringCellValue();
+
+            // Remove leading and trailing whitespace
+            tempColumnName = tempColumnName.trim();
             makeColumn(builder, tempColumnName, TypeProtos.MinorType.FLOAT8);
             excelFieldNames.add(colPosition, tempColumnName);
             break;
diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index a1521e3..7a37291 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -129,7 +129,7 @@ public class TestExcelFormat extends ClusterTest {
 
     RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
       .addRow("test_category", null, null, "test_author", null, null, "test_keywords", "Microsoft Office User", null, "test_subject", "test_title",
-        1571602578000L, null,1588212319000L)
+        1571602578000L, null,1633358966000L)
       .build();
 
     new RowSetComparison(expected).verifyAndClearAll(results);
@@ -158,6 +158,24 @@ public class TestExcelFormat extends ClusterTest {
   }
 
   @Test
+  public void testExplicitWithSpacesInColHeader() throws RpcException {
+    String sql = "SELECT col1, col2 FROM table(cp.`excel/test_data.xlsx` (type => 'excel', sheetName => 'spaceInColHeader'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("col1", MinorType.FLOAT8)
+      .addNullable("col2", MinorType.FLOAT8)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1,2)
+      .addRow(3,4)
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
   public void testNonDefaultSheetQuery() throws RpcException {
     String sql = "SELECT * FROM  table(cp.`excel/test_data.xlsx` (type => 'excel', sheetName => 'secondSheet'))";
 
@@ -525,7 +543,7 @@ public class TestExcelFormat extends ClusterTest {
       .buildSchema();
 
     RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
-      .addRow((Object)strArray("data", "secondSheet", "thirdSheet", "fourthSheet", "emptySheet", "missingDataSheet", "inconsistentData", "comps"))
+      .addRow((Object)strArray("data", "secondSheet", "thirdSheet", "fourthSheet", "emptySheet", "missingDataSheet", "inconsistentData", "comps", "spaceInColHeader"))
       .build();
 
     new RowSetComparison(expected).verifyAndClearAll(results);
diff --git a/contrib/format-excel/src/test/resources/excel/test_data.xlsx b/contrib/format-excel/src/test/resources/excel/test_data.xlsx
index 9841bad..5c7e6b0 100644
Binary files a/contrib/format-excel/src/test/resources/excel/test_data.xlsx and b/contrib/format-excel/src/test/resources/excel/test_data.xlsx differ