You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2022/01/16 12:53:30 UTC

[drill] branch master updated: DRILL-8108: Excel Reader Fails with Duplicate Columns (#2428)

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 0adcb62  DRILL-8108: Excel Reader Fails with Duplicate Columns (#2428)
0adcb62 is described below

commit 0adcb62535e6a1f21cfcaf3cdda206b9f4f093f4
Author: Charles S. Givre <cg...@apache.org>
AuthorDate: Sun Jan 16 07:53:20 2022 -0500

    DRILL-8108: Excel Reader Fails with Duplicate Columns (#2428)
    
    * DRILL-8108 Excel Reader Fails with Duplicate Columns
    
    * Fix for case sensitivity
    
    * Formatting fix
    
    * Used TreeSet
---
 .../drill/exec/store/excel/ExcelBatchReader.java   |  31 ++++++++++++++
 .../drill/exec/store/excel/TestExcelFormat.java    |  46 +++++++++++++++++++++
 .../src/test/resources/excel/dup_col_names.xlsx    | Bin 0 -> 8986 bytes
 3 files changed, 77 insertions(+)

diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index f167c6d..cecec9a 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -58,6 +58,9 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
 
@@ -140,6 +143,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
 
   private final ExcelReaderConfig readerConfig;
   private final int maxRecords;
+  private final TreeSet<String> columnNameChecker;
   private Sheet sheet;
   private Row currentRow;
   private StreamingWorkbook streamingWorkbook;
@@ -161,6 +165,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
   private CustomErrorContext errorContext;
 
 
+
   static class ExcelReaderConfig {
     final ExcelFormatPlugin plugin;
     final int headerRow;
@@ -184,6 +189,7 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
   public ExcelBatchReader(ExcelReaderConfig readerConfig, int maxRecords) {
     this.readerConfig = readerConfig;
     this.maxRecords = maxRecords;
+    this.columnNameChecker = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
     firstLine = true;
   }
 
@@ -346,16 +352,20 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
 
             // Remove leading and trailing whitespace
             tempColumnName = tempColumnName.trim();
+            tempColumnName = deconflictColumnNames(tempColumnName);
             makeColumn(builder, tempColumnName, MinorType.VARCHAR);
             excelFieldNames.add(colPosition, tempColumnName);
+            columnNameChecker.add(tempColumnName);
             break;
           default:
             tempColumnName = cell.getStringCellValue();
 
             // Remove leading and trailing whitespace
             tempColumnName = tempColumnName.trim();
+            tempColumnName = deconflictColumnNames(tempColumnName);
             makeColumn(builder, tempColumnName, MinorType.FLOAT8);
             excelFieldNames.add(colPosition, tempColumnName);
+            columnNameChecker.add(tempColumnName);
             break;
         }
       }
@@ -365,6 +375,27 @@ public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
   }
 
   /**
+   * This function verifies whether a given column name is already present in the projected schema.
+   * If so, it appends _n to the column name.  N will be incremented for every duplicate column
+   * @param columnName The original column
+   * @return The deconflicted column name
+   */
+  private String deconflictColumnNames(String columnName) {
+    Pattern pattern = Pattern.compile("_(\\d+)$");
+    Matcher matcher = pattern.matcher(columnName);
+    while (columnNameChecker.contains(columnName)) {
+      if (matcher.find()) {
+        int index = Integer.parseInt(matcher.group(1));
+        index++;
+        columnName = matcher.replaceFirst("_" + index);
+      } else {
+        columnName = columnName + "_1";
+      }
+    }
+    return columnName;
+  }
+
+  /**
    * Helper function to get the selected sheet from the configuration
    * @return Sheet The selected sheet
    */
diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index e9304e2..dd37033 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -734,4 +734,50 @@ public class TestExcelFormat extends ClusterTest {
 
     new RowSetComparison(expected).verifyAndClearAll(results);
   }
+
+  @Test
+  public void testDuplicateColumnNames() throws Exception {
+    String sql = "SELECT * FROM cp.`excel/dup_col_names.xlsx`";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("Col1", MinorType.FLOAT8)
+      .addNullable("Col1_1", MinorType.FLOAT8)
+      .addNullable("Col1_2", MinorType.FLOAT8)
+      .addNullable("Col1_2_1", MinorType.FLOAT8)
+      .addNullable("column1", MinorType.VARCHAR)
+      .addNullable("COLUMN1_1", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1.0, 5.0, 9.0, 13.0, "a", "e")
+      .addRow(2.0, 6.0, 10.0, 14.0, "b", "f")
+      .addRow(3.0, 7.0, 11.0, 15.0, "c", "g")
+      .addRow(4.0, 9.0, 12.0, 16.0, "d", "h")
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testDuplicateColumnNamesWithExplicitColumnNames() throws Exception {
+    String sql = "SELECT Col1, Col1_1, Col1_2, Col1_2_1 FROM cp.`excel/dup_col_names.xlsx`";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("Col1", MinorType.FLOAT8)
+      .addNullable("Col1_1", MinorType.FLOAT8)
+      .addNullable("Col1_2", MinorType.FLOAT8)
+      .addNullable("Col1_2_1", MinorType.FLOAT8)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1.0, 5.0, 9.0, 13.0)
+      .addRow(2.0, 6.0, 10.0, 14.0)
+      .addRow(3.0, 7.0, 11.0, 15.0)
+      .addRow(4.0, 9.0, 12.0, 16.0)
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
 }
diff --git a/contrib/format-excel/src/test/resources/excel/dup_col_names.xlsx b/contrib/format-excel/src/test/resources/excel/dup_col_names.xlsx
new file mode 100644
index 0000000..d5a4777
Binary files /dev/null and b/contrib/format-excel/src/test/resources/excel/dup_col_names.xlsx differ