You are viewing a plain text version of this content. The canonical link is available in the original HTML version of this page.
Posted to commits@iceberg.apache.org by ao...@apache.org on 2021/03/24 18:41:27 UTC
[iceberg] 12/18: ORC: Fix vectorized reads with metadata columns
(#2241)
This is an automated email from the ASF dual-hosted git repository.
aokolnychyi pushed a commit to branch 0.11.x
in repository https://gitbox.apache.org/repos/asf/iceberg.git
commit 42018f97ff1949b3f11c512c15a36b07a84f9551
Author: Anton Okolnychyi <ao...@apple.com>
AuthorDate: Tue Feb 16 10:39:25 2021 -0800
ORC: Fix vectorized reads with metadata columns (#2241)
---
.../java/org/apache/iceberg/spark/source/BatchDataReader.java | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
index 6f49509..d48cf24 100644
--- a/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
+++ b/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java
@@ -20,11 +20,13 @@
package org.apache.iceberg.spark.source;
import java.util.Map;
+import java.util.Set;
import org.apache.arrow.vector.NullCheckingForGet;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Schema;
import org.apache.iceberg.encryption.EncryptionManager;
import org.apache.iceberg.io.CloseableIterable;
@@ -35,6 +37,7 @@ import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders;
import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
import org.apache.iceberg.types.TypeUtil;
@@ -90,9 +93,12 @@ class BatchDataReader extends BaseDataReader<ColumnarBatch> {
iter = builder.build();
} else if (task.file().format() == FileFormat.ORC) {
- Schema schemaWithoutConstants = TypeUtil.selectNot(expectedSchema, idToConstant.keySet());
+ Set<Integer> constantFieldIds = idToConstant.keySet();
+ Set<Integer> metadataFieldIds = MetadataColumns.metadataFieldIds();
+ Sets.SetView<Integer> constantAndMetadataFieldIds = Sets.union(constantFieldIds, metadataFieldIds);
+ Schema schemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(expectedSchema, constantAndMetadataFieldIds);
ORC.ReadBuilder builder = ORC.read(location)
- .project(schemaWithoutConstants)
+ .project(schemaWithoutConstantAndMetadataFields)
.split(task.start(), task.length())
.createBatchedReaderFunc(fileSchema -> VectorizedSparkOrcReaders.buildReader(expectedSchema, fileSchema,
idToConstant))