Posted to commits@spark.apache.org by yu...@apache.org on 2021/01/29 00:08:33 UTC

[spark] branch master updated: [SPARK-26346][BUILD][SQL] Upgrade Parquet to 1.11.1

This is an automated email from the ASF dual-hosted git repository.

yumwang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new a7683af  [SPARK-26346][BUILD][SQL] Upgrade Parquet to 1.11.1
a7683af is described below

commit a7683afdf498c3ee09466dfa9635edacb5cc8f0c
Author: Yuming Wang <yu...@ebay.com>
AuthorDate: Fri Jan 29 08:07:49 2021 +0800

    [SPARK-26346][BUILD][SQL] Upgrade Parquet to 1.11.1
    
    ### What changes were proposed in this pull request?
    
    This PR upgrades Parquet to 1.11.1.
    
    New features in Parquet 1.11.1:
    
    - [PARQUET-1201](https://issues.apache.org/jira/browse/PARQUET-1201) - Column indexes
    - [PARQUET-1253](https://issues.apache.org/jira/browse/PARQUET-1253) - Support for new logical type representation
    - [PARQUET-1388](https://issues.apache.org/jira/browse/PARQUET-1388) - Nanosecond precision time and timestamp - parquet-mr
    
    More details:
    https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.1/CHANGES.md
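
    As an illustration (not part of this change), the new logical type API in
    parquet-column 1.11 replaces the old `OriginalType`/`ConvertedType` enums
    with `LogicalTypeAnnotation`, and PARQUET-1388 adds a `NANOS` time unit.
    A minimal Scala sketch; the field name `event_time` is just an example:

    ```scala
    import org.apache.parquet.schema.LogicalTypeAnnotation
    import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit
    import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
    import org.apache.parquet.schema.Types

    // Build an INT64 field annotated as a UTC-adjusted, nanosecond-precision timestamp.
    val ts = Types.required(PrimitiveTypeName.INT64)
      .as(LogicalTypeAnnotation.timestampType(true /* isAdjustedToUTC */, TimeUnit.NANOS))
      .named("event_time") // hypothetical field name

    // Prints something like: required int64 event_time (TIMESTAMP(NANOS,true))
    println(ts)
    ```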
    
    ### Why are the changes needed?
    Support column indexes to improve query performance.
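
    For example (hypothetical file path, not part of this change), column indexes
    store per-page min/max and null-count statistics that parquet-mr can use to
    skip pages on selective filters; they can be inspected with `ParquetFileReader`:

    ```scala
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path
    import org.apache.parquet.hadoop.ParquetFileReader
    import org.apache.parquet.hadoop.util.HadoopInputFile
    import scala.collection.JavaConverters._

    // "/tmp/data.parquet" is a placeholder; use any file written by Parquet 1.11+.
    val input = HadoopInputFile.fromPath(new Path("/tmp/data.parquet"), new Configuration())
    val reader = ParquetFileReader.open(input)
    try {
      for (block <- reader.getFooter.getBlocks.asScala;
           column <- block.getColumns.asScala) {
        // readColumnIndex returns null for files written without column indexes
        // (e.g. anything produced by Parquet <= 1.10).
        val index = reader.readColumnIndex(column)
        if (index != null) {
          println(s"${column.getPath}: ${index.getNullCounts.size} pages, " +
            s"nullCounts=${index.getNullCounts.asScala.mkString(",")}")
        }
      }
    } finally {
      reader.close()
    }
    ```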
    
    ### Does this PR introduce any user-facing change?
    No.
    
    ### How was this patch tested?
    Existing tests.
    
    Closes #26804 from wangyum/SPARK-26346.
    
    Authored-by: Yuming Wang <yu...@ebay.com>
    Signed-off-by: Yuming Wang <yu...@ebay.com>
---
 dev/deps/spark-deps-hadoop-2.7-hive-2.3            | 12 ++++++------
 dev/deps/spark-deps-hadoop-3.2-hive-2.3            | 12 ++++++------
 pom.xml                                            |  6 +++++-
 .../datasources/parquet/ParquetSchemaSuite.scala   | 22 +++++++++++-----------
 .../apache/spark/sql/streaming/StreamSuite.scala   |  4 +++-
 .../apache/spark/sql/hive/StatisticsSuite.scala    |  2 +-
 6 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3
index 2c468b8..179ab36 100644
--- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3
@@ -202,12 +202,12 @@ orc-shims/1.6.7//orc-shims-1.6.7.jar
 oro/2.0.8//oro-2.0.8.jar
 osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar
 paranamer/2.8//paranamer-2.8.jar
-parquet-column/1.10.1//parquet-column-1.10.1.jar
-parquet-common/1.10.1//parquet-common-1.10.1.jar
-parquet-encoding/1.10.1//parquet-encoding-1.10.1.jar
-parquet-format/2.4.0//parquet-format-2.4.0.jar
-parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar
-parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar
+parquet-column/1.11.1//parquet-column-1.11.1.jar
+parquet-common/1.11.1//parquet-common-1.11.1.jar
+parquet-encoding/1.11.1//parquet-encoding-1.11.1.jar
+parquet-format-structures/1.11.1//parquet-format-structures-1.11.1.jar
+parquet-hadoop/1.11.1//parquet-hadoop-1.11.1.jar
+parquet-jackson/1.11.1//parquet-jackson-1.11.1.jar
 protobuf-java/2.5.0//protobuf-java-2.5.0.jar
 py4j/0.10.9.1//py4j-0.10.9.1.jar
 pyrolite/4.30//pyrolite-4.30.jar
diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3
index 894fd6a..83c32c4 100644
--- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3
@@ -172,12 +172,12 @@ orc-shims/1.6.7//orc-shims-1.6.7.jar
 oro/2.0.8//oro-2.0.8.jar
 osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar
 paranamer/2.8//paranamer-2.8.jar
-parquet-column/1.10.1//parquet-column-1.10.1.jar
-parquet-common/1.10.1//parquet-common-1.10.1.jar
-parquet-encoding/1.10.1//parquet-encoding-1.10.1.jar
-parquet-format/2.4.0//parquet-format-2.4.0.jar
-parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar
-parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar
+parquet-column/1.11.1//parquet-column-1.11.1.jar
+parquet-common/1.11.1//parquet-common-1.11.1.jar
+parquet-encoding/1.11.1//parquet-encoding-1.11.1.jar
+parquet-format-structures/1.11.1//parquet-format-structures-1.11.1.jar
+parquet-hadoop/1.11.1//parquet-hadoop-1.11.1.jar
+parquet-jackson/1.11.1//parquet-jackson-1.11.1.jar
 protobuf-java/2.5.0//protobuf-java-2.5.0.jar
 py4j/0.10.9.1//py4j-0.10.9.1.jar
 pyrolite/4.30//pyrolite-4.30.jar
diff --git a/pom.xml b/pom.xml
index 84ec92e..05a2e04 100644
--- a/pom.xml
+++ b/pom.xml
@@ -136,7 +136,7 @@
     <kafka.version>2.6.0</kafka.version>
     <!-- After 10.15.1.3, the minimum required version is JDK9 -->
     <derby.version>10.14.2.0</derby.version>
-    <parquet.version>1.10.1</parquet.version>
+    <parquet.version>1.11.1</parquet.version>
     <orc.version>1.6.7</orc.version>
     <jetty.version>9.4.34.v20201102</jetty.version>
     <jakartaservlet.version>4.0.3</jakartaservlet.version>
@@ -2290,6 +2290,10 @@
             <groupId>commons-pool</groupId>
             <artifactId>commons-pool</artifactId>
           </exclusion>
+          <exclusion>
+            <groupId>javax.annotation</groupId>
+            <artifactId>javax.annotation-api</artifactId>
+          </exclusion>
         </exclusions>
       </dependency>
       <dependency>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
index e97c6cd..fcc08ee 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
@@ -251,7 +251,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
     """
       |message root {
       |  optional group _1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      optional binary value (UTF8);
       |    }
@@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
     """
       |message root {
       |  optional group _1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required group key {
       |        optional binary _1 (UTF8);
       |        optional binary _2 (UTF8);
@@ -300,7 +300,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
     """
       |message root {
       |  optional group _1 (MAP_KEY_VALUE) {
-      |    repeated group map {
+      |    repeated group key_value {
       |      required int32 key;
       |      optional group value {
       |        optional binary _1 (UTF8);
@@ -740,7 +740,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP_KEY_VALUE) {
-      |    repeated group map {
+      |    repeated group key_value {
       |      required int32 num;
       |      required binary str (UTF8);
       |    }
@@ -759,7 +759,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      required binary value (UTF8);
       |    }
@@ -797,7 +797,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP_KEY_VALUE) {
-      |    repeated group map {
+      |    repeated group key_value {
       |      required int32 num;
       |      optional binary str (UTF8);
       |    }
@@ -816,7 +816,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      optional binary value (UTF8);
       |    }
@@ -857,7 +857,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      required binary value (UTF8);
       |    }
@@ -893,7 +893,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      optional binary value (UTF8);
       |    }
@@ -1447,7 +1447,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     parquetSchema =
       """message root {
         |  required group f0 (MAP) {
-        |    repeated group map (MAP_KEY_VALUE) {
+        |    repeated group key_value (MAP_KEY_VALUE) {
         |      required int32 key;
         |      required group value {
         |        required int32 value_f0;
@@ -1472,7 +1472,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     expectedSchema =
       """message root {
         |  required group f0 (MAP) {
-        |    repeated group map (MAP_KEY_VALUE) {
+        |    repeated group key_value (MAP_KEY_VALUE) {
         |      required int32 key;
         |      required group value {
         |        required int64 value_f1;
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
index 440fe99..c4e43d2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
@@ -214,7 +214,9 @@ class StreamSuite extends StreamTest {
             .start(outputDir.getAbsolutePath)
           try {
             query.processAllAvailable()
-            val outputDf = spark.read.parquet(outputDir.getAbsolutePath).as[Long]
+            // Parquet now writes page-level CRC checksums, which change the file size and
+            // can affect the row order when reading these files back. See PARQUET-1746 for details.
+            val outputDf = spark.read.parquet(outputDir.getAbsolutePath).sort('a).as[Long]
             checkDataset[Long](outputDf, (0L to 10L).toArray: _*)
           } finally {
             query.stop()
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index 5357f4b..c91ee92 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -1528,7 +1528,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
         Seq(tbl, ext_tbl).foreach { tblName =>
           sql(s"INSERT INTO $tblName VALUES (1, 'a', '2019-12-13')")
 
-          val expectedSize = 601
+          val expectedSize = 651
           // analyze table
           sql(s"ANALYZE TABLE $tblName COMPUTE STATISTICS NOSCAN")
           var tableStats = getTableStats(tblName)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org