You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2021/12/09 02:56:41 UTC
[drill] branch master updated: DRILL-8070: format-excel assumes that rowIterator returns every row (#2399)
This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new 7ddfb08 DRILL-8070: format-excel assumes that rowIterator returns every row (#2399)
7ddfb08 is described below
commit 7ddfb08523ea9fcf0f2f4e6d1307cc9ca9c4c851
Author: PJ Fanning <pj...@users.noreply.github.com>
AuthorDate: Thu Dec 9 03:56:28 2021 +0100
DRILL-8070: format-excel assumes that rowIterator returns every row (#2399)
* DRILL-8070: format-excel assumes that rowIterator returns every row
* add test
---
.../drill/exec/store/excel/ExcelBatchReader.java | 27 +++++++++++++--------
.../drill/exec/store/excel/TestExcelFormat.java | 22 +++++++++++++++++
.../src/test/resources/excel/blank_rows.xlsx | Bin 0 -> 10131 bytes
3 files changed, 39 insertions(+), 10 deletions(-)
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index ce39783..26e925f 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -272,7 +272,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
// Get the number of columns.
// This method also advances the row reader to the location of the first row of data
- setFirstDataRow();
+ if (!setFirstDataRow()) {
+ return;
+ }
totalColumnCount = finalSchema.size();
firstLine = false;
@@ -299,7 +301,9 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
// Get the number of columns.
// This method also advances the row reader to the location of the first row of data
- setFirstRow();
+ if (!setFirstRow()) {
+ return;
+ }
excelFieldNames = new ArrayList<>();
cellWriterArray = new ArrayList<>();
@@ -400,32 +404,28 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
* There are a few gotchas here in that we have to know the header row and count the physical number of cells
* in that row. This function also has to move the rowIterator object to the first row of data.
*/
- private void setFirstRow() {
+ private boolean setFirstRow() {
// Initialize
currentRow = rowIterator.next();
int rowNumber = readerConfig.headerRow > 0 ? sheet.getFirstRowNum() : 0;
// If the headerRow is greater than zero, advance the iterator to the first row of data
// This is unfortunately necessary since the streaming reader eliminated the getRow() method.
- for (int i = 1; i < rowNumber; i++) {
- currentRow = rowIterator.next();
- }
+ return skipToRow(rowNumber);
}
/**
* This function is used to set the iterator to the first row of actual data. When a schema is provided,
* we can safely skip the header row, and start reading the first row of data.
*/
- private void setFirstDataRow() {
+ private boolean setFirstDataRow() {
// Initialize
currentRow = rowIterator.next();
int rowNumber = readerConfig.headerRow > 0 ? sheet.getFirstRowNum() : 0;
// If the headerRow is greater than zero, advance the iterator to the first row of data
// This is unfortunately necessary since the streaming reader eliminated the getRow() method.
- for (int i = 0; i <= rowNumber; i++) {
- currentRow = rowIterator.next();
- }
+ return skipToRow(rowNumber + 1);
}
@Override
@@ -439,6 +439,13 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
return true;
}
+ private boolean skipToRow(int minRowNum) {
+ while (currentRow.getRowNum() < minRowNum && rowIterator.hasNext()) {
+ currentRow = rowIterator.next();
+ }
+ return currentRow.getRowNum() >= minRowNum;
+ }
+
private boolean nextLine(RowSetLoader rowWriter) {
if (rowIterator == null) {
rowIterator = sheet.rowIterator();
diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index 3f66450..1a65512 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -266,6 +266,28 @@ public class TestExcelFormat extends ClusterTest {
}
@Test
+ public void testHandleMissingRows() throws RpcException {
+ String sql = "SELECT id, first_name, order_count FROM table(cp.`excel/blank_rows.xlsx` (type => 'excel', sheetName => 'data', headerRow => 3))";
+
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("id", TypeProtos.MinorType.FLOAT8)
+ .addNullable("first_name", TypeProtos.MinorType.VARCHAR)
+ .addNullable("order_count", TypeProtos.MinorType.FLOAT8)
+ .buildSchema();
+
+ RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+ .addRow(1.0, "Cornelia", 22.0)
+ .addRow(2.0, "Nydia", 22.0)
+ .addRow(3.0, "Waiter", 17.0)
+ .addRow(4.0, "Cicely", 6.0)
+ .addRow(5.0, "Dorie", 17.0)
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
public void testExplicitWithSpacesInColHeader() throws RpcException {
String sql = "SELECT col1, col2 FROM table(cp.`excel/test_data.xlsx` (type => 'excel', sheetName => 'spaceInColHeader'))";
diff --git a/contrib/format-excel/src/test/resources/excel/blank_rows.xlsx b/contrib/format-excel/src/test/resources/excel/blank_rows.xlsx
new file mode 100644
index 0000000..43e8135
Binary files /dev/null and b/contrib/format-excel/src/test/resources/excel/blank_rows.xlsx differ