You are viewing a plain text version of this content; the canonical hyperlink was present in the original HTML rendering of this page.
Posted to commits@hive.apache.org by sz...@apache.org on 2022/02/15 07:50:21 UTC
[hive] branch master updated: HIVE-25955: Partitioned tables migrated to Iceberg aren't cached in LLAP (#3026) (Adam Szita, reviewed by Peter Vary)
This is an automated email from the ASF dual-hosted git repository.
szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 62668e5 HIVE-25955: Partitioned tables migrated to Iceberg aren't cached in LLAP (#3026) (Adam Szita, reviewed by Peter Vary)
62668e5 is described below
commit 62668e5b5b65e92f87e8f0188621d21fe1a98426
Author: Adam Szita <40...@users.noreply.github.com>
AuthorDate: Tue Feb 15 08:49:58 2022 +0100
HIVE-25955: Partitioned tables migrated to Iceberg aren't cached in LLAP (#3026) (Adam Szita, reviewed by Peter Vary)
---
.../apache/iceberg/orc/VectorizedReadUtils.java | 16 ++++++-
.../test/queries/positive/llap_iceberg_read_orc.q | 18 +++++++-
.../positive/llap/llap_iceberg_read_orc.q.out | 52 ++++++++++++++++++++++
.../hive/llap/io/api/impl/LlapRecordReader.java | 2 +
4 files changed, 85 insertions(+), 3 deletions(-)
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/orc/VectorizedReadUtils.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/orc/VectorizedReadUtils.java
index 287dd04..2f6b3ab 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/orc/VectorizedReadUtils.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/orc/VectorizedReadUtils.java
@@ -21,6 +21,7 @@ package org.apache.iceberg.orc;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.io.CacheTag;
import org.apache.hadoop.hive.conf.HiveConf;
@@ -37,6 +38,7 @@ import org.apache.hive.iceberg.org.apache.orc.TypeDescription;
import org.apache.hive.iceberg.org.apache.orc.impl.OrcTail;
import org.apache.hive.iceberg.org.apache.orc.impl.ReaderImpl;
import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Binder;
import org.apache.iceberg.expressions.Expression;
@@ -134,15 +136,25 @@ public class VectorizedReadUtils {
// We need to map with the current (i.e. current Hive table columns) full schema (without projections),
// as OrcInputFormat will take care of the projections by the use of an include boolean array
- Schema currentSchema = task.spec().schema();
+ PartitionSpec spec = task.spec();
+ Schema currentSchema = spec.schema();
TypeDescription readOrcSchema;
if (ORCSchemaUtil.hasIds(fileSchema)) {
readOrcSchema = ORCSchemaUtil.buildOrcProjection(currentSchema, fileSchema);
} else {
+ Schema readSchemaForOriginalFile = currentSchema;
+ // In case of migrated, originally partitioned tables, partition values are not present in the file
+ if (spec.isPartitioned()) {
+ readSchemaForOriginalFile = currentSchema.select(currentSchema.columns().stream()
+ .filter(c -> !spec.identitySourceIds().contains(c.fieldId()))
+ .map(c -> c.name())
+ .collect(Collectors.toList()));
+ }
+
TypeDescription typeWithIds =
ORCSchemaUtil.applyNameMapping(fileSchema, MappingUtil.create(currentSchema));
- readOrcSchema = ORCSchemaUtil.buildOrcProjection(currentSchema, typeWithIds);
+ readOrcSchema = ORCSchemaUtil.buildOrcProjection(readSchemaForOriginalFile, typeWithIds);
}
job.set(ColumnProjectionUtils.ORC_SCHEMA_STRING, readOrcSchema.toString());
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/llap_iceberg_read_orc.q b/iceberg/iceberg-handler/src/test/queries/positive/llap_iceberg_read_orc.q
index c692dd4..b171157 100644
--- a/iceberg/iceberg-handler/src/test/queries/positive/llap_iceberg_read_orc.q
+++ b/iceberg/iceberg-handler/src/test/queries/positive/llap_iceberg_read_orc.q
@@ -4,6 +4,7 @@ set hive.vectorized.execution.enabled=true;
DROP TABLE IF EXISTS llap_orders PURGE;
DROP TABLE IF EXISTS llap_items PURGE;
+DROP TABLE IF EXISTS mig_source PURGE;
CREATE EXTERNAL TABLE llap_items (itemid INT, price INT, category STRING, name STRING, description STRING) STORED BY ICEBERG STORED AS ORC;
@@ -103,4 +104,19 @@ SELECT state, max(city), avg(itemid) from llap_orders WHERE region = 'EU' GROUP
--some more projections
SELECT o.city, i.name, min(i.cost), max(to60), sum(o.quantity) FROM llap_items i JOIN llap_orders o ON i.itemid = o.itemid WHERE region = 'EU' and i.cost >= 50000 and ordertime > timestamp('2010-01-01') GROUP BY o.city, i.name;
-SELECT i.name, i.description, SUM(o.quantity) FROM llap_items i JOIN llap_orders o ON i.itemid = o.itemid WHERE region = 'EU' and i.cost >= 50000 GROUP BY i.name, i.description;
\ No newline at end of file
+SELECT i.name, i.description, SUM(o.quantity) FROM llap_items i JOIN llap_orders o ON i.itemid = o.itemid WHERE region = 'EU' and i.cost >= 50000 GROUP BY i.name, i.description;
+
+---------------------------------------------
+--Test migrated partitioned table gets cached
+
+CREATE EXTERNAL TABLE mig_source (id int) partitioned by (region string) stored as ORC;
+INSERT INTO mig_source VALUES (1, 'EU'), (1, 'US'), (2, 'EU'), (3, 'EU'), (2, 'US');
+ALTER TABLE mig_source SET TBLPROPERTIES ('storage_handler'='org.apache.iceberg.mr.hive.HiveIcebergStorageHandler');
+
+-- Should miss, but fill cache
+SELECT region, SUM(id) from mig_source GROUP BY region;
+
+-- Should hit cache
+set hive.llap.io.cache.only=true;
+SELECT region, SUM(id) from mig_source GROUP BY region;
+set hive.llap.io.cache.only=false;
diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/llap_iceberg_read_orc.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/llap_iceberg_read_orc.q.out
index 23392e3..585d8a0 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/llap/llap_iceberg_read_orc.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/llap/llap_iceberg_read_orc.q.out
@@ -6,6 +6,10 @@ PREHOOK: query: DROP TABLE IF EXISTS llap_items PURGE
PREHOOK: type: DROPTABLE
POSTHOOK: query: DROP TABLE IF EXISTS llap_items PURGE
POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE IF EXISTS mig_source PURGE
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS mig_source PURGE
+POSTHOOK: type: DROPTABLE
PREHOOK: query: CREATE EXTERNAL TABLE llap_items (itemid INT, price INT, category STRING, name STRING, description STRING) STORED BY ICEBERG STORED AS ORC
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
@@ -411,3 +415,51 @@ Model 3 Performance 42
Model S Long range 389
Model S Plaid 221
Model Y Performance 163
+PREHOOK: query: CREATE EXTERNAL TABLE mig_source (id int) partitioned by (region string) stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@mig_source
+POSTHOOK: query: CREATE EXTERNAL TABLE mig_source (id int) partitioned by (region string) stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@mig_source
+PREHOOK: query: INSERT INTO mig_source VALUES (1, 'EU'), (1, 'US'), (2, 'EU'), (3, 'EU'), (2, 'US')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@mig_source
+POSTHOOK: query: INSERT INTO mig_source VALUES (1, 'EU'), (1, 'US'), (2, 'EU'), (3, 'EU'), (2, 'US')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@mig_source
+POSTHOOK: Output: default@mig_source@region=EU
+POSTHOOK: Output: default@mig_source@region=US
+POSTHOOK: Lineage: mig_source PARTITION(region=EU).id SCRIPT []
+POSTHOOK: Lineage: mig_source PARTITION(region=US).id SCRIPT []
+PREHOOK: query: ALTER TABLE mig_source SET TBLPROPERTIES ('storage_handler'='org.apache.iceberg.mr.hive.HiveIcebergStorageHandler')
+PREHOOK: type: ALTERTABLE_PROPERTIES
+PREHOOK: Input: default@mig_source
+PREHOOK: Output: default@mig_source
+POSTHOOK: query: ALTER TABLE mig_source SET TBLPROPERTIES ('storage_handler'='org.apache.iceberg.mr.hive.HiveIcebergStorageHandler')
+POSTHOOK: type: ALTERTABLE_PROPERTIES
+POSTHOOK: Input: default@mig_source
+POSTHOOK: Output: default@mig_source
+PREHOOK: query: SELECT region, SUM(id) from mig_source GROUP BY region
+PREHOOK: type: QUERY
+PREHOOK: Input: default@mig_source
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT region, SUM(id) from mig_source GROUP BY region
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@mig_source
+#### A masked pattern was here ####
+EU 6
+US 3
+PREHOOK: query: SELECT region, SUM(id) from mig_source GROUP BY region
+PREHOOK: type: QUERY
+PREHOOK: Input: default@mig_source
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT region, SUM(id) from mig_source GROUP BY region
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@mig_source
+#### A masked pattern was here ####
+EU 6
+US 3
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java
index c2005d6..315752d 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java
@@ -84,6 +84,7 @@ import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
import static java.util.stream.Collectors.toList;
+import static org.apache.hadoop.hive.llap.LlapHiveUtils.throwIfCacheOnlyRead;
class LlapRecordReader implements RecordReader<NullWritable, VectorizedRowBatch>, Consumer<ColumnVectorBatch> {
@@ -135,6 +136,7 @@ class LlapRecordReader implements RecordReader<NullWritable, VectorizedRowBatch>
cvp, executor, sourceInputFormat, sourceSerDe, reporter, daemonConf);
if (!rr.checkOrcSchemaEvolution()) {
rr.close();
+ throwIfCacheOnlyRead(HiveConf.getBoolVar(job, ConfVars.LLAP_IO_CACHE_ONLY));
return null;
}
return rr;