You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sz...@apache.org on 2022/06/02 07:50:48 UTC
[hive] branch master updated: HIVE-25421: Fallback from vectorization when reading Iceberg's time columns from ORC files (#3334) (Adam Szita, reviewed by Laszlo Pinter)
This is an automated email from the ASF dual-hosted git repository.
szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 63326ff775 HIVE-25421: Fallback from vectorization when reading Iceberg's time columns from ORC files (#3334) (Adam Szita, reviewed by Laszlo Pinter)
63326ff775 is described below
commit 63326ff775206e59547b6b1332e25279e90ef5ee
Author: Adam Szita <40...@users.noreply.github.com>
AuthorDate: Thu Jun 2 09:50:40 2022 +0200
HIVE-25421: Fallback from vectorization when reading Iceberg's time columns from ORC files (#3334) (Adam Szita, reviewed by Laszlo Pinter)
---
.../apache/iceberg/mr/hive/HiveIcebergStorageHandler.java | 15 ++++++++++++++-
.../apache/iceberg/mr/hive/TestHiveIcebergSelects.java | 9 ++++-----
2 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index ba066ed0db..c79d344272 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -781,17 +781,30 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
* <li>iceberg format-version is "2"</li>
* <li>fileformat is set to avro</li>
* <li>querying metadata tables</li>
+ * <li>fileformat is set to ORC, and table schema has time type column</li>
* </ul>
* @param tableProps table properties, must be not null
*/
private void fallbackToNonVectorizedModeBasedOnProperties(Properties tableProps) {
if ("2".equals(tableProps.get(TableProperties.FORMAT_VERSION)) ||
FileFormat.AVRO.name().equalsIgnoreCase(tableProps.getProperty(TableProperties.DEFAULT_FILE_FORMAT)) ||
- (tableProps.containsKey("metaTable") && isValidMetadataTable(tableProps.getProperty("metaTable")))) {
+ (tableProps.containsKey("metaTable") && isValidMetadataTable(tableProps.getProperty("metaTable"))) ||
+ hasOrcTimeInSchema(tableProps)) {
conf.setBoolean(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.varname, false);
}
}
+ // Iceberg Time type columns are written as longs into ORC files. There is no Time type in Hive, so it is represented
+ // as String instead. Unlike Parquet (where the Time type is an int64 with a 'time' logical annotation), ORC has no
+ // automatic conversion from long to string during vectorized reading.
+ private static boolean hasOrcTimeInSchema(Properties tableProps) {
+ if (!FileFormat.ORC.name().equalsIgnoreCase(tableProps.getProperty(TableProperties.DEFAULT_FILE_FORMAT))) {
+ return false;
+ }
+ Schema tableSchema = SchemaParser.fromJson(tableProps.getProperty(InputFormatConfig.TABLE_SCHEMA));
+ return tableSchema.columns().stream().anyMatch(f -> Types.TimeType.get().typeId() == f.type().typeId());
+ }
+
/**
* Generates a JobContext for the OutputCommitter for the specific table.
* @param configuration The configuration used for as a base of the JobConf
diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java
index ff54a9b0e2..a9c692d12e 100644
--- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java
@@ -117,9 +117,8 @@ public class TestHiveIcebergSelects extends HiveIcebergStorageHandlerWithEngineB
public void testJoinTablesSupportedTypes() throws IOException {
for (int i = 0; i < SUPPORTED_TYPES.size(); i++) {
Type type = SUPPORTED_TYPES.get(i);
- if ((type == Types.TimestampType.withZone() || type == Types.TimeType.get()) &&
- isVectorized && fileFormat == FileFormat.ORC) {
- // ORC/TIMESTAMP_INSTANT and time are not supported vectorized types for Hive
+ if ((type == Types.TimestampType.withZone()) && isVectorized && fileFormat == FileFormat.ORC) {
+ // ORC/TIMESTAMP_INSTANT is not a supported vectorized type for Hive
continue;
}
// TODO: remove this filter when issue #1881 is resolved
@@ -145,9 +144,9 @@ public class TestHiveIcebergSelects extends HiveIcebergStorageHandlerWithEngineB
public void testSelectDistinctFromTable() throws IOException {
for (int i = 0; i < SUPPORTED_TYPES.size(); i++) {
Type type = SUPPORTED_TYPES.get(i);
- if ((type == Types.TimestampType.withZone() || type == Types.TimeType.get()) &&
+ if ((type == Types.TimestampType.withZone()) &&
isVectorized && fileFormat == FileFormat.ORC) {
- // ORC/TIMESTAMP_INSTANT and time are not supported vectorized types for Hive
+ // ORC/TIMESTAMP_INSTANT is not a supported vectorized type for Hive
continue;
}
// TODO: remove this filter when issue #1881 is resolved