You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2022/01/16 12:53:30 UTC
[drill] branch master updated: DRILL-8108: Excel Reader Fails with Duplicate Columns (#2428)
This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new 0adcb62 DRILL-8108: Excel Reader Fails with Duplicate Columns (#2428)
0adcb62 is described below
commit 0adcb62535e6a1f21cfcaf3cdda206b9f4f093f4
Author: Charles S. Givre <cg...@apache.org>
AuthorDate: Sun Jan 16 07:53:20 2022 -0500
DRILL-8108: Excel Reader Fails with Duplicate Columns (#2428)
* DRILL-8108 Excel Reader Fails with Duplicate Columns
* Fix for case sensitivity
* Formatting fix
* Used TreeSet
---
.../drill/exec/store/excel/ExcelBatchReader.java | 31 ++++++++++++++
.../drill/exec/store/excel/TestExcelFormat.java | 46 +++++++++++++++++++++
.../src/test/resources/excel/dup_col_names.xlsx | Bin 0 -> 8986 bytes
3 files changed, 77 insertions(+)
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index f167c6d..cecec9a 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -58,6 +58,9 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
@@ -140,6 +143,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
private final ExcelReaderConfig readerConfig;
private final int maxRecords;
+ private final TreeSet<String> columnNameChecker;
private Sheet sheet;
private Row currentRow;
private StreamingWorkbook streamingWorkbook;
@@ -161,6 +165,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
private CustomErrorContext errorContext;
+
static class ExcelReaderConfig {
final ExcelFormatPlugin plugin;
final int headerRow;
@@ -184,6 +189,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
public ExcelBatchReader(ExcelReaderConfig readerConfig, int maxRecords) {
this.readerConfig = readerConfig;
this.maxRecords = maxRecords;
+ this.columnNameChecker = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
firstLine = true;
}
@@ -346,16 +352,20 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
// Remove leading and trailing whitespace
tempColumnName = tempColumnName.trim();
+ tempColumnName = deconflictColumnNames(tempColumnName);
makeColumn(builder, tempColumnName, MinorType.VARCHAR);
excelFieldNames.add(colPosition, tempColumnName);
+ columnNameChecker.add(tempColumnName);
break;
default:
tempColumnName = cell.getStringCellValue();
// Remove leading and trailing whitespace
tempColumnName = tempColumnName.trim();
+ tempColumnName = deconflictColumnNames(tempColumnName);
makeColumn(builder, tempColumnName, MinorType.FLOAT8);
excelFieldNames.add(colPosition, tempColumnName);
+ columnNameChecker.add(tempColumnName);
break;
}
}
@@ -365,6 +375,27 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
}
/**
+ * This function verifies whether a given column name is already present in the projected schema.
+ * If so, it appends _n to the column name. N will be incremented for every duplicate column
+ * @param columnName The original column
+ * @return The deconflicted column name
+ */
+ private String deconflictColumnNames(String columnName) {
+ Pattern pattern = Pattern.compile("_(\\d+)$");
+ Matcher matcher = pattern.matcher(columnName);
+ while (columnNameChecker.contains(columnName)) {
+ if (matcher.find()) {
+ int index = Integer.parseInt(matcher.group(1));
+ index++;
+ columnName = matcher.replaceFirst("_" + index);
+ } else {
+ columnName = columnName + "_1";
+ }
+ }
+ return columnName;
+ }
+
+ /**
* Helper function to get the selected sheet from the configuration
* @return Sheet The selected sheet
*/
diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index e9304e2..dd37033 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -734,4 +734,50 @@ public class TestExcelFormat extends ClusterTest {
new RowSetComparison(expected).verifyAndClearAll(results);
}
+
+ @Test
+ public void testDuplicateColumnNames() throws Exception {
+ String sql = "SELECT * FROM cp.`excel/dup_col_names.xlsx`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("Col1", MinorType.FLOAT8)
+ .addNullable("Col1_1", MinorType.FLOAT8)
+ .addNullable("Col1_2", MinorType.FLOAT8)
+ .addNullable("Col1_2_1", MinorType.FLOAT8)
+ .addNullable("column1", MinorType.VARCHAR)
+ .addNullable("COLUMN1_1", MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+ .addRow(1.0, 5.0, 9.0, 13.0, "a", "e")
+ .addRow(2.0, 6.0, 10.0, 14.0, "b", "f")
+ .addRow(3.0, 7.0, 11.0, 15.0, "c", "g")
+ .addRow(4.0, 9.0, 12.0, 16.0, "d", "h")
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
+ @Test
+ public void testDuplicateColumnNamesWithExplicitColumnNames() throws Exception {
+ String sql = "SELECT Col1, Col1_1, Col1_2, Col1_2_1 FROM cp.`excel/dup_col_names.xlsx`";
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("Col1", MinorType.FLOAT8)
+ .addNullable("Col1_1", MinorType.FLOAT8)
+ .addNullable("Col1_2", MinorType.FLOAT8)
+ .addNullable("Col1_2_1", MinorType.FLOAT8)
+ .buildSchema();
+
+ RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+ .addRow(1.0, 5.0, 9.0, 13.0)
+ .addRow(2.0, 6.0, 10.0, 14.0)
+ .addRow(3.0, 7.0, 11.0, 15.0)
+ .addRow(4.0, 9.0, 12.0, 16.0)
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
}
diff --git a/contrib/format-excel/src/test/resources/excel/dup_col_names.xlsx b/contrib/format-excel/src/test/resources/excel/dup_col_names.xlsx
new file mode 100644
index 0000000..d5a4777
Binary files /dev/null and b/contrib/format-excel/src/test/resources/excel/dup_col_names.xlsx differ