You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2021/10/05 03:18:49 UTC
[drill] branch master updated: DRILL-8006: Leading and Trailing Whitespace Causes Query Failures in Excel Files (#2325)
This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new bad5e66 DRILL-8006: Leading and Trailing Whitespace Causes Query Failures in Excel Files (#2325)
bad5e66 is described below
commit bad5e669d5241b56d34560e93bd93e670785e388
Author: Charles S. Givre <cg...@apache.org>
AuthorDate: Mon Oct 4 23:18:02 2021 -0400
DRILL-8006: Leading and Trailing Whitespace Causes Query Failures in Excel Files (#2325)
* Initial Commit
* Delete logback-test.xml
---
contrib/format-excel/pom.xml | 2 +-
.../drill/exec/store/excel/ExcelBatchReader.java | 8 +++++++-
.../drill/exec/store/excel/TestExcelFormat.java | 22 +++++++++++++++++++--
.../src/test/resources/excel/test_data.xlsx | Bin 18077 -> 19559 bytes
4 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/contrib/format-excel/pom.xml b/contrib/format-excel/pom.xml
index 7bc153f..5e831bb 100644
--- a/contrib/format-excel/pom.xml
+++ b/contrib/format-excel/pom.xml
@@ -67,7 +67,7 @@
<dependency>
<groupId>com.github.pjfanning</groupId>
<artifactId>excel-streaming-reader</artifactId>
- <version>3.0.4</version>
+ <version>3.1.2</version>
</dependency>
</dependencies>
<build>
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index 73d634c..d43cdfd 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -249,7 +249,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
rowIterator = sheet.iterator();
// Get the number of columns.
- // This menthod also advances the row reader to the location of the first row of data
+ // This method also advances the row reader to the location of the first row of data
setFirstRow();
excelFieldNames = new ArrayList<>();
@@ -300,6 +300,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
.replace(PARSER_WILDCARD, SAFE_WILDCARD)
.replaceAll("\\.", SAFE_SEPARATOR)
.replaceAll("\\n", HEADER_NEW_LINE_REPLACEMENT);
+
+ // Remove leading and trailing whitespace
+ tempColumnName = tempColumnName.trim();
makeColumn(builder, tempColumnName, TypeProtos.MinorType.VARCHAR);
excelFieldNames.add(colPosition, tempColumnName);
break;
@@ -308,6 +311,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
case _NONE:
case BLANK:
tempColumnName = cell.getStringCellValue();
+
+ // Remove leading and trailing whitespace
+ tempColumnName = tempColumnName.trim();
makeColumn(builder, tempColumnName, TypeProtos.MinorType.FLOAT8);
excelFieldNames.add(colPosition, tempColumnName);
break;
diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index a1521e3..7a37291 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -129,7 +129,7 @@ public class TestExcelFormat extends ClusterTest {
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
.addRow("test_category", null, null, "test_author", null, null, "test_keywords", "Microsoft Office User", null, "test_subject", "test_title",
- 1571602578000L, null,1588212319000L)
+ 1571602578000L, null,1633358966000L)
.build();
new RowSetComparison(expected).verifyAndClearAll(results);
@@ -158,6 +158,24 @@ public class TestExcelFormat extends ClusterTest {
}
@Test
+ public void testExplicitWithSpacesInColHeader() throws RpcException {
+ String sql = "SELECT col1, col2 FROM table(cp.`excel/test_data.xlsx` (type => 'excel', sheetName => 'spaceInColHeader'))";
+
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("col1", MinorType.FLOAT8)
+ .addNullable("col2", MinorType.FLOAT8)
+ .buildSchema();
+
+ RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+ .addRow(1,2)
+ .addRow(3,4)
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
public void testNonDefaultSheetQuery() throws RpcException {
String sql = "SELECT * FROM table(cp.`excel/test_data.xlsx` (type => 'excel', sheetName => 'secondSheet'))";
@@ -525,7 +543,7 @@ public class TestExcelFormat extends ClusterTest {
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
- .addRow((Object)strArray("data", "secondSheet", "thirdSheet", "fourthSheet", "emptySheet", "missingDataSheet", "inconsistentData", "comps"))
+ .addRow((Object)strArray("data", "secondSheet", "thirdSheet", "fourthSheet", "emptySheet", "missingDataSheet", "inconsistentData", "comps", "spaceInColHeader"))
.build();
new RowSetComparison(expected).verifyAndClearAll(results);
diff --git a/contrib/format-excel/src/test/resources/excel/test_data.xlsx b/contrib/format-excel/src/test/resources/excel/test_data.xlsx
index 9841bad..5c7e6b0 100644
Binary files a/contrib/format-excel/src/test/resources/excel/test_data.xlsx and b/contrib/format-excel/src/test/resources/excel/test_data.xlsx differ