You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2015/05/17 09:42:30 UTC
spark git commit: [SPARK-7447] [SQL] Don't re-merge Parquet schema
when the relation is deserialized
Repository: spark
Updated Branches:
refs/heads/master edf09ea1b -> 339905578
[SPARK-7447] [SQL] Don't re-merge Parquet schema when the relation is deserialized
JIRA: https://issues.apache.org/jira/browse/SPARK-7447
`MetadataCache` in `ParquetRelation2` is annotated as `transient`. When `ParquetRelation2` is deserialized, we ask `MetadataCache` to refresh and perform schema merging again. This is time-consuming, especially when there are many Parquet files.
With the new `FSBasedParquetRelation`, although `MetadataCache` is not `transient` now, `MetadataCache.refresh()` still performs schema merging again when the relation is deserialized. This change skips the schema computation when `dataSchema` has already been set.
Author: Liang-Chi Hsieh <vi...@gmail.com>
Closes #6012 from viirya/without_remerge_schema and squashes the following commits:
2663957 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema
6ac7d93 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema
b0fc09b [Liang-Chi Hsieh] Don't generate and merge parquetSchema multiple times.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33990557
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33990557
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33990557
Branch: refs/heads/master
Commit: 339905578790fa37fcad9684b859b443313a5aa2
Parents: edf09ea
Author: Liang-Chi Hsieh <vi...@gmail.com>
Authored: Sun May 17 15:42:21 2015 +0800
Committer: Cheng Lian <li...@databricks.com>
Committed: Sun May 17 15:42:21 2015 +0800
----------------------------------------------------------------------
.../apache/spark/sql/parquet/newParquet.scala | 32 +++++++++++---------
1 file changed, 18 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/33990557/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 946062f..bcbdb1e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -340,7 +340,7 @@ private[sql] class ParquetRelation2(
// Schema of the actual Parquet files, without partition columns discovered from partition
// directory paths.
- var dataSchema: StructType = _
+ var dataSchema: StructType = null
// Schema of the whole table, including partition columns.
var schema: StructType = _
@@ -379,19 +379,23 @@ private[sql] class ParquetRelation2(
f -> new Footer(f.getPath, parquetMetadata)
}.seq.toMap
- dataSchema = {
- val dataSchema0 =
- maybeDataSchema
- .orElse(readSchema())
- .orElse(maybeMetastoreSchema)
- .getOrElse(sys.error("Failed to get the schema."))
-
- // If this Parquet relation is converted from a Hive Metastore table, must reconcile case
- // case insensitivity issue and possible schema mismatch (probably caused by schema
- // evolution).
- maybeMetastoreSchema
- .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0))
- .getOrElse(dataSchema0)
+ // If we already get the schema, don't need to re-compute it since the schema merging is
+ // time-consuming.
+ if (dataSchema == null) {
+ dataSchema = {
+ val dataSchema0 =
+ maybeDataSchema
+ .orElse(readSchema())
+ .orElse(maybeMetastoreSchema)
+ .getOrElse(sys.error("Failed to get the schema."))
+
+ // If this Parquet relation is converted from a Hive Metastore table, must reconcile case
+ // case insensitivity issue and possible schema mismatch (probably caused by schema
+ // evolution).
+ maybeMetastoreSchema
+ .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0))
+ .getOrElse(dataSchema0)
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org