You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2021/01/16 21:30:52 UTC
[iceberg] branch master updated: Hive: Queries should be case insensitive (#2053)
This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new b1de539 Hive: Queries should be case insensitive (#2053)
b1de539 is described below
commit b1de539cab4ff1241a4da492c2ad0edc0447235d
Author: dixingxing <di...@yeah.net>
AuthorDate: Sun Jan 17 05:30:43 2021 +0800
Hive: Queries should be case insensitive (#2053)
Co-authored-by: dixingxing <di...@autohome.com.cn>
---
.../org/apache/iceberg/mr/InputFormatConfig.java | 6 ++++--
.../apache/iceberg/mr/hive/HiveIcebergSerDe.java | 3 ++-
.../IcebergRecordObjectInspector.java | 4 +++-
.../iceberg/mr/mapreduce/IcebergInputFormat.java | 14 +++++++++-----
.../hive/HiveIcebergStorageHandlerTestUtils.java | 6 ++++++
.../TestHiveIcebergStorageHandlerLocalScan.java | 22 ++++++++++++++++++++++
6 files changed, 46 insertions(+), 9 deletions(-)
diff --git a/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java b/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java
index 4d3580a..e479610 100644
--- a/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java
+++ b/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java
@@ -34,7 +34,6 @@ public class InputFormatConfig {
// configuration values for Iceberg input formats
public static final String REUSE_CONTAINERS = "iceberg.mr.reuse.containers";
- public static final String CASE_SENSITIVE = "iceberg.mr.case.sensitive";
public static final String SKIP_RESIDUAL_FILTERING = "skip.residual.filtering";
public static final String AS_OF_TIMESTAMP = "iceberg.mr.as.of.time";
public static final String FILTER_EXPRESSION = "iceberg.mr.filter.expression";
@@ -60,6 +59,9 @@ public class InputFormatConfig {
public static final String COMMIT_THREAD_POOL_SIZE = "iceberg.mr.commit.thread.pool.size";
public static final int COMMIT_THREAD_POOL_SIZE_DEFAULT = 10;
+ public static final String CASE_SENSITIVE = "iceberg.mr.case.sensitive";
+ public static final boolean CASE_SENSITIVE_DEFAULT = true;
+
public static final String CATALOG_NAME = "iceberg.catalog";
public static final String HADOOP_CATALOG = "hadoop.catalog";
public static final String HADOOP_TABLES = "hadoop.tables";
@@ -81,7 +83,7 @@ public class InputFormatConfig {
this.conf = conf;
// defaults
conf.setBoolean(SKIP_RESIDUAL_FILTERING, false);
- conf.setBoolean(CASE_SENSITIVE, true);
+ conf.setBoolean(CASE_SENSITIVE, CASE_SENSITIVE_DEFAULT);
conf.setBoolean(REUSE_CONTAINERS, false);
conf.setBoolean(LOCALITY, false);
}
diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
index 452b1e5..268c314 100644
--- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
+++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
@@ -80,12 +80,13 @@ public class HiveIcebergSerDe extends AbstractSerDe {
}
}
+ configuration.setBoolean(InputFormatConfig.CASE_SENSITIVE, false);
String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(configuration);
// When same table is joined multiple times, it is possible some selected columns are duplicated,
// in this case wrong recordStructField position leads wrong value or ArrayIndexOutOfBoundException
String[] distinctSelectedColumns = Arrays.stream(selectedColumns).distinct().toArray(String[]::new);
Schema projectedSchema = distinctSelectedColumns.length > 0 ?
- tableSchema.select(distinctSelectedColumns) : tableSchema;
+ tableSchema.caseInsensitiveSelect(distinctSelectedColumns) : tableSchema;
// the input split mapper handles does not belong to this table
// it is necessary to ensure projectedSchema equals to tableSchema,
// or we cannot find selectOperator's column from inspector
diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java
index bc208c1..1f56b4c 100644
--- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java
+++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java
@@ -48,7 +48,9 @@ public final class IcebergRecordObjectInspector extends StructObjectInspector {
for (Types.NestedField field : structType.fields()) {
ObjectInspector oi = objectInspectors.get(position);
- IcebergRecordStructField structField = new IcebergRecordStructField(field, oi, position);
+ Types.NestedField fieldInLowercase = Types.NestedField.of(field.fieldId(), field.isOptional(),
+ field.name().toLowerCase(), field.type(), field.doc());
+ IcebergRecordStructField structField = new IcebergRecordStructField(fieldInLowercase, oi, position);
structFields.add(structField);
position++;
}
diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
index 116e729..e319269 100644
--- a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
+++ b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
@@ -100,7 +100,7 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
}
TableScan scan = table.newScan()
- .caseSensitive(conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, true));
+ .caseSensitive(conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT));
long snapshotId = conf.getLong(InputFormatConfig.SNAPSHOT_ID, -1);
if (snapshotId != -1) {
scan = scan.useSnapshot(snapshotId);
@@ -188,9 +188,9 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
this.encryptionManager = ((IcebergSplit) split).encryptionManager();
this.tasks = task.files().iterator();
this.tableSchema = InputFormatConfig.tableSchema(conf);
- this.expectedSchema = readSchema(conf, tableSchema);
+ this.caseSensitive = conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT);
+ this.expectedSchema = readSchema(conf, tableSchema, caseSensitive);
this.reuseContainers = conf.getBoolean(InputFormatConfig.REUSE_CONTAINERS, false);
- this.caseSensitive = conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, true);
this.inMemoryDataModel = conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL,
InputFormatConfig.InMemoryDataModel.GENERIC);
this.currentIterator = open(tasks.next(), expectedSchema).iterator();
@@ -373,7 +373,7 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
}
}
- private static Schema readSchema(Configuration conf, Schema tableSchema) {
+ private static Schema readSchema(Configuration conf, Schema tableSchema, boolean caseSensitive) {
Schema readSchema = InputFormatConfig.readSchema(conf);
if (readSchema != null) {
@@ -381,7 +381,11 @@ public class IcebergInputFormat<T> extends InputFormat<Void, T> {
}
String[] selectedColumns = InputFormatConfig.selectedColumns(conf);
- return selectedColumns != null ? tableSchema.select(selectedColumns) : tableSchema;
+ if (selectedColumns == null) {
+ return tableSchema;
+ }
+
+ return caseSensitive ? tableSchema.select(selectedColumns) : tableSchema.caseInsensitiveSelect(selectedColumns);
}
}
diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java
index 97e0cb9..a02a997 100644
--- a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java
+++ b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java
@@ -41,6 +41,12 @@ public class HiveIcebergStorageHandlerTestUtils {
optional(3, "last_name", Types.StringType.get())
);
+ static final Schema CUSTOMER_SCHEMA_WITH_UPPERCASE = new Schema(
+ optional(1, "CustomER_Id", Types.LongType.get()),
+ optional(2, "First_name", Types.StringType.get()),
+ optional(3, "Last_name", Types.StringType.get())
+ );
+
static final List<Record> CUSTOMER_RECORDS = TestHelper.RecordsBuilder.newInstance(CUSTOMER_SCHEMA)
.add(0L, "Alice", "Brown")
.add(1L, "Bob", "Green")
diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java
index 33203f9..6a0a065 100644
--- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java
+++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java
@@ -139,6 +139,28 @@ public class TestHiveIcebergStorageHandlerLocalScan {
}
@Test
+ public void testScanTableCaseInsensitive() throws IOException {
+ testTables.createTable(shell, "customers",
+ HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA_WITH_UPPERCASE, fileFormat,
+ HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
+
+ List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
+
+ Assert.assertEquals(3, rows.size());
+ Assert.assertArrayEquals(new Object[] {0L, "Alice", "Brown"}, rows.get(0));
+ Assert.assertArrayEquals(new Object[] {1L, "Bob", "Green"}, rows.get(1));
+ Assert.assertArrayEquals(new Object[] {2L, "Trudy", "Pink"}, rows.get(2));
+
+ rows = shell.executeStatement("SELECT * FROM default.customers where CustomER_Id < 2 " +
+ "and first_name in ('Alice', 'Bob')");
+
+ Assert.assertEquals(2, rows.size());
+ Assert.assertArrayEquals(new Object[] {0L, "Alice", "Brown"}, rows.get(0));
+ Assert.assertArrayEquals(new Object[] {1L, "Bob", "Green"}, rows.get(1));
+ }
+
+
+ @Test
public void testDecimalTableWithPredicateLiterals() throws IOException {
Schema schema = new Schema(required(1, "decimal_field", Types.DecimalType.of(7, 2)));
List<Record> records = TestHelper.RecordsBuilder.newInstance(schema)