You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sz...@apache.org on 2022/06/02 07:50:48 UTC

[hive] branch master updated: HIVE-25421: Fallback from vectorization when reading Iceberg's time columns from ORC files (#3334) (Adam Szita, reviewed by Laszlo Pinter)

This is an automated email from the ASF dual-hosted git repository.

szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 63326ff775 HIVE-25421: Fallback from vectorization when reading Iceberg's time columns from ORC files (#3334) (Adam Szita, reviewed by Laszlo Pinter)
63326ff775 is described below

commit 63326ff775206e59547b6b1332e25279e90ef5ee
Author: Adam Szita <40...@users.noreply.github.com>
AuthorDate: Thu Jun 2 09:50:40 2022 +0200

    HIVE-25421: Fallback from vectorization when reading Iceberg's time columns from ORC files (#3334) (Adam Szita, reviewed by Laszlo Pinter)
---
 .../apache/iceberg/mr/hive/HiveIcebergStorageHandler.java | 15 ++++++++++++++-
 .../apache/iceberg/mr/hive/TestHiveIcebergSelects.java    |  9 ++++-----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index ba066ed0db..c79d344272 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -781,17 +781,30 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
    *   <li>iceberg format-version is "2"</li>
    *   <li>fileformat is set to avro</li>
    *   <li>querying metadata tables</li>
+   *   <li>fileformat is set to ORC, and table schema has time type column</li>
    * </ul>
    * @param tableProps table properties, must be not null
    */
   private void fallbackToNonVectorizedModeBasedOnProperties(Properties tableProps) {
     if ("2".equals(tableProps.get(TableProperties.FORMAT_VERSION)) ||
         FileFormat.AVRO.name().equalsIgnoreCase(tableProps.getProperty(TableProperties.DEFAULT_FILE_FORMAT)) ||
-        (tableProps.containsKey("metaTable") && isValidMetadataTable(tableProps.getProperty("metaTable")))) {
+        (tableProps.containsKey("metaTable") && isValidMetadataTable(tableProps.getProperty("metaTable"))) ||
+        hasOrcTimeInSchema(tableProps)) {
       conf.setBoolean(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.varname, false);
     }
   }
 
+  // Iceberg Time type columns are written as longs into ORC files. There is no Time type in Hive, so it is represented
+  // as String instead. Unlike Parquet (where Time is an int64 with a 'time' logical annotation), ORC has no automatic
+  // conversion from long to string during vectorized reading, so vectorization is turned off for such tables.
+  private static boolean hasOrcTimeInSchema(Properties tableProps) {
+    if (!FileFormat.ORC.name().equalsIgnoreCase(tableProps.getProperty(TableProperties.DEFAULT_FILE_FORMAT))) {
+      return false;
+    }
+    Schema tableSchema = SchemaParser.fromJson(tableProps.getProperty(InputFormatConfig.TABLE_SCHEMA));
+    return tableSchema.columns().stream().anyMatch(f -> Types.TimeType.get().typeId() == f.type().typeId());
+  }
+
   /**
    * Generates a JobContext for the OutputCommitter for the specific table.
    * @param configuration The configuration used for as a base of the JobConf
diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java
index ff54a9b0e2..a9c692d12e 100644
--- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java
@@ -117,9 +117,8 @@ public class TestHiveIcebergSelects extends HiveIcebergStorageHandlerWithEngineB
   public void testJoinTablesSupportedTypes() throws IOException {
     for (int i = 0; i < SUPPORTED_TYPES.size(); i++) {
       Type type = SUPPORTED_TYPES.get(i);
-      if ((type == Types.TimestampType.withZone() || type == Types.TimeType.get()) &&
-          isVectorized && fileFormat == FileFormat.ORC) {
-        // ORC/TIMESTAMP_INSTANT and time are not supported vectorized types for Hive
+      if ((type == Types.TimestampType.withZone()) && isVectorized && fileFormat == FileFormat.ORC) {
+        // ORC/TIMESTAMP_INSTANT is not a supported vectorized type for Hive
         continue;
       }
       // TODO: remove this filter when issue #1881 is resolved
@@ -145,9 +144,9 @@ public class TestHiveIcebergSelects extends HiveIcebergStorageHandlerWithEngineB
   public void testSelectDistinctFromTable() throws IOException {
     for (int i = 0; i < SUPPORTED_TYPES.size(); i++) {
       Type type = SUPPORTED_TYPES.get(i);
-      if ((type == Types.TimestampType.withZone() || type == Types.TimeType.get()) &&
+      if ((type == Types.TimestampType.withZone()) &&
           isVectorized && fileFormat == FileFormat.ORC) {
-        // ORC/TIMESTAMP_INSTANT and time are not supported vectorized types for Hive
+        // ORC/TIMESTAMP_INSTANT is not a supported vectorized type for Hive
         continue;
       }
       // TODO: remove this filter when issue #1881 is resolved