You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2021/01/16 21:30:52 UTC

[iceberg] branch master updated: Hive: Queries should be case insensitive (#2053)

This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new b1de539  Hive: Queries should be case insensitive (#2053)
b1de539 is described below

commit b1de539cab4ff1241a4da492c2ad0edc0447235d
Author: dixingxing <di...@yeah.net>
AuthorDate: Sun Jan 17 05:30:43 2021 +0800

    Hive: Queries should be case insensitive (#2053)
    
    Co-authored-by: dixingxing <di...@autohome.com.cn>
---
 .../org/apache/iceberg/mr/InputFormatConfig.java   |  6 ++++--
 .../apache/iceberg/mr/hive/HiveIcebergSerDe.java   |  3 ++-
 .../IcebergRecordObjectInspector.java              |  4 +++-
 .../iceberg/mr/mapreduce/IcebergInputFormat.java   | 14 +++++++++-----
 .../hive/HiveIcebergStorageHandlerTestUtils.java   |  6 ++++++
 .../TestHiveIcebergStorageHandlerLocalScan.java    | 22 ++++++++++++++++++++++
 6 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java b/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java
index 4d3580a..e479610 100644
--- a/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java
+++ b/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java
@@ -34,7 +34,6 @@ public class InputFormatConfig {
 
   // configuration values for Iceberg input formats
   public static final String REUSE_CONTAINERS = "iceberg.mr.reuse.containers";
-  public static final String CASE_SENSITIVE = "iceberg.mr.case.sensitive";
   public static final String SKIP_RESIDUAL_FILTERING = "skip.residual.filtering";
   public static final String AS_OF_TIMESTAMP = "iceberg.mr.as.of.time";
   public static final String FILTER_EXPRESSION = "iceberg.mr.filter.expression";
@@ -60,6 +59,9 @@ public class InputFormatConfig {
   public static final String COMMIT_THREAD_POOL_SIZE = "iceberg.mr.commit.thread.pool.size";
   public static final int COMMIT_THREAD_POOL_SIZE_DEFAULT = 10;
 
+  public static final String CASE_SENSITIVE = "iceberg.mr.case.sensitive";
+  public static final boolean CASE_SENSITIVE_DEFAULT = true;
+
   public static final String CATALOG_NAME = "iceberg.catalog";
   public static final String HADOOP_CATALOG = "hadoop.catalog";
   public static final String HADOOP_TABLES = "hadoop.tables";
@@ -81,7 +83,7 @@ public class InputFormatConfig {
       this.conf = conf;
       // defaults
       conf.setBoolean(SKIP_RESIDUAL_FILTERING, false);
-      conf.setBoolean(CASE_SENSITIVE, true);
+      conf.setBoolean(CASE_SENSITIVE, CASE_SENSITIVE_DEFAULT);
       conf.setBoolean(REUSE_CONTAINERS, false);
       conf.setBoolean(LOCALITY, false);
     }
diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
index 452b1e5..268c314 100644
--- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
+++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
@@ -80,12 +80,13 @@ public class HiveIcebergSerDe extends AbstractSerDe {
       }
     }
 
+    configuration.setBoolean(InputFormatConfig.CASE_SENSITIVE, false);
     String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(configuration);
     // When same table is joined multiple times, it is possible some selected columns are duplicated,
     // in this case wrong recordStructField position leads wrong value or ArrayIndexOutOfBoundException
     String[] distinctSelectedColumns = Arrays.stream(selectedColumns).distinct().toArray(String[]::new);
     Schema projectedSchema = distinctSelectedColumns.length > 0 ?
-            tableSchema.select(distinctSelectedColumns) : tableSchema;
+            tableSchema.caseInsensitiveSelect(distinctSelectedColumns) : tableSchema;
     // the input split mapper handles does not belong to this table
     // it is necessary to ensure projectedSchema equals to tableSchema,
     // or we cannot find selectOperator's column from inspector
diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java
index bc208c1..1f56b4c 100644
--- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java
+++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java
@@ -48,7 +48,9 @@ public final class IcebergRecordObjectInspector extends StructObjectInspector {
 
     for (Types.NestedField field : structType.fields()) {
       ObjectInspector oi = objectInspectors.get(position);
-      IcebergRecordStructField structField = new IcebergRecordStructField(field, oi, position);
+      Types.NestedField fieldInLowercase = Types.NestedField.of(field.fieldId(), field.isOptional(),
+              field.name().toLowerCase(), field.type(), field.doc());
+      IcebergRecordStructField structField = new IcebergRecordStructField(fieldInLowercase, oi, position);
       structFields.add(structField);
       position++;
     }
diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
index 116e729..e319269 100644
--- a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
+++ b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
@@ -100,7 +100,7 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
     }
 
     TableScan scan = table.newScan()
-            .caseSensitive(conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, true));
+            .caseSensitive(conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT));
     long snapshotId = conf.getLong(InputFormatConfig.SNAPSHOT_ID, -1);
     if (snapshotId != -1) {
       scan = scan.useSnapshot(snapshotId);
@@ -188,9 +188,9 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
       this.encryptionManager = ((IcebergSplit) split).encryptionManager();
       this.tasks = task.files().iterator();
       this.tableSchema = InputFormatConfig.tableSchema(conf);
-      this.expectedSchema = readSchema(conf, tableSchema);
+      this.caseSensitive = conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT);
+      this.expectedSchema = readSchema(conf, tableSchema, caseSensitive);
       this.reuseContainers = conf.getBoolean(InputFormatConfig.REUSE_CONTAINERS, false);
-      this.caseSensitive = conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, true);
       this.inMemoryDataModel = conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL,
               InputFormatConfig.InMemoryDataModel.GENERIC);
       this.currentIterator = open(tasks.next(), expectedSchema).iterator();
@@ -373,7 +373,7 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
       }
     }
 
-    private static Schema readSchema(Configuration conf, Schema tableSchema) {
+    private static Schema readSchema(Configuration conf, Schema tableSchema, boolean caseSensitive) {
       Schema readSchema = InputFormatConfig.readSchema(conf);
 
       if (readSchema != null) {
@@ -381,7 +381,11 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
       }
 
       String[] selectedColumns = InputFormatConfig.selectedColumns(conf);
-      return selectedColumns != null ? tableSchema.select(selectedColumns) : tableSchema;
+      if (selectedColumns == null) {
+        return tableSchema;
+      }
+
+      return caseSensitive ? tableSchema.select(selectedColumns) : tableSchema.caseInsensitiveSelect(selectedColumns);
     }
   }
 
diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java
index 97e0cb9..a02a997 100644
--- a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java
+++ b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java
@@ -41,6 +41,12 @@ public class HiveIcebergStorageHandlerTestUtils {
           optional(3, "last_name", Types.StringType.get())
   );
 
+  static final Schema CUSTOMER_SCHEMA_WITH_UPPERCASE = new Schema(
+          optional(1, "CustomER_Id", Types.LongType.get()),
+          optional(2, "First_name", Types.StringType.get()),
+          optional(3, "Last_name", Types.StringType.get())
+  );
+
   static final List<Record> CUSTOMER_RECORDS = TestHelper.RecordsBuilder.newInstance(CUSTOMER_SCHEMA)
           .add(0L, "Alice", "Brown")
           .add(1L, "Bob", "Green")
diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java
index 33203f9..6a0a065 100644
--- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java
+++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java
@@ -139,6 +139,28 @@ public class TestHiveIcebergStorageHandlerLocalScan {
   }
 
   @Test
+  public void testScanTableCaseInsensitive() throws IOException {
+    testTables.createTable(shell, "customers",
+            HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA_WITH_UPPERCASE, fileFormat,
+            HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
+
+    List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
+
+    Assert.assertEquals(3, rows.size());
+    Assert.assertArrayEquals(new Object[] {0L, "Alice", "Brown"}, rows.get(0));
+    Assert.assertArrayEquals(new Object[] {1L, "Bob", "Green"}, rows.get(1));
+    Assert.assertArrayEquals(new Object[] {2L, "Trudy", "Pink"}, rows.get(2));
+
+    rows = shell.executeStatement("SELECT * FROM default.customers where CustomER_Id < 2 " +
+            "and first_name in ('Alice', 'Bob')");
+
+    Assert.assertEquals(2, rows.size());
+    Assert.assertArrayEquals(new Object[] {0L, "Alice", "Brown"}, rows.get(0));
+    Assert.assertArrayEquals(new Object[] {1L, "Bob", "Green"}, rows.get(1));
+  }
+
+
+  @Test
   public void testDecimalTableWithPredicateLiterals() throws IOException {
     Schema schema = new Schema(required(1, "decimal_field", Types.DecimalType.of(7, 2)));
     List<Record> records = TestHelper.RecordsBuilder.newInstance(schema)