You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by hv...@apache.org on 2016/09/03 17:02:27 UTC

spark git commit: [SPARK-17335][SQL] Fix ArrayType and MapType CatalogString.

Repository: spark
Updated Branches:
  refs/heads/master a8a35b39b -> c2a1576c2


[SPARK-17335][SQL] Fix ArrayType and MapType CatalogString.

## What changes were proposed in this pull request?
The `catalogString` for `ArrayType` and `MapType` currently calls the `simpleString` method on its children. This is a problem when the child is a struct, because the `struct.simpleString` implementation truncates the number of fields it shows (25 at max). This breaks the generation of a proper `catalogString`, and has been shown to cause errors while writing to Hive.

This PR fixes this by providing proper `catalogString` implementations for `ArrayType` and `MapType`.

## How was this patch tested?
Added testing for `catalogString` to `DataTypeSuite`.

Author: Herman van Hovell <hv...@databricks.com>

Closes #14938 from hvanhovell/SPARK-17335.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c2a1576c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c2a1576c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c2a1576c

Branch: refs/heads/master
Commit: c2a1576c230697f56f282b6388c79835377e0f2f
Parents: a8a35b3
Author: Herman van Hovell <hv...@databricks.com>
Authored: Sat Sep 3 19:02:20 2016 +0200
Committer: Herman van Hovell <hv...@databricks.com>
Committed: Sat Sep 3 19:02:20 2016 +0200

----------------------------------------------------------------------
 .../org/apache/spark/sql/types/ArrayType.scala  |   2 +
 .../org/apache/spark/sql/types/MapType.scala    |   2 +
 .../apache/spark/sql/types/DataTypeSuite.scala  |  30 ++++
 .../benchmarks/WideSchemaBenchmark-results.txt  | 174 +++++++++++--------
 4 files changed, 133 insertions(+), 75 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/c2a1576c/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
index 520e344..82a03b0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
@@ -77,6 +77,8 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT
 
   override def simpleString: String = s"array<${elementType.simpleString}>"
 
+  override def catalogString: String = s"array<${elementType.catalogString}>"
+
   override def sql: String = s"ARRAY<${elementType.sql}>"
 
   override private[spark] def asNullable: ArrayType =

http://git-wip-us.apache.org/repos/asf/spark/blob/c2a1576c/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
index 454ea40..1789609 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
@@ -64,6 +64,8 @@ case class MapType(
 
   override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>"
 
+  override def catalogString: String = s"map<${keyType.catalogString},${valueType.catalogString}>"
+
   override def sql: String = s"MAP<${keyType.sql}, ${valueType.sql}>"
 
   override private[spark] def asNullable: MapType =

http://git-wip-us.apache.org/repos/asf/spark/blob/c2a1576c/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index 688bc3e..b8ab9a9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.types
 
 import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 
 class DataTypeSuite extends SparkFunSuite {
 
@@ -359,4 +360,33 @@ class DataTypeSuite extends SparkFunSuite {
       StructField("a", StringType, nullable = false) ::
       StructField("b", StringType, nullable = false) :: Nil),
     expected = false)
+
+  def checkCatalogString(dt: DataType): Unit = {
+    test(s"catalogString: $dt") {
+      val dt2 = CatalystSqlParser.parseDataType(dt.catalogString)
+      assert(dt === dt2)
+    }
+  }
+  def createStruct(n: Int): StructType = new StructType(Array.tabulate(n) {
+    i => StructField(s"col$i", IntegerType, nullable = true)
+  })
+
+  checkCatalogString(BooleanType)
+  checkCatalogString(ByteType)
+  checkCatalogString(ShortType)
+  checkCatalogString(IntegerType)
+  checkCatalogString(LongType)
+  checkCatalogString(FloatType)
+  checkCatalogString(DoubleType)
+  checkCatalogString(DecimalType(10, 5))
+  checkCatalogString(BinaryType)
+  checkCatalogString(StringType)
+  checkCatalogString(DateType)
+  checkCatalogString(TimestampType)
+  checkCatalogString(createStruct(4))
+  checkCatalogString(createStruct(40))
+  checkCatalogString(ArrayType(IntegerType))
+  checkCatalogString(ArrayType(createStruct(40)))
+  checkCatalogString(MapType(IntegerType, StringType))
+  checkCatalogString(MapType(IntegerType, createStruct(40)))
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/c2a1576c/sql/core/benchmarks/WideSchemaBenchmark-results.txt
----------------------------------------------------------------------
diff --git a/sql/core/benchmarks/WideSchemaBenchmark-results.txt b/sql/core/benchmarks/WideSchemaBenchmark-results.txt
index ea6a661..0b9f791 100644
--- a/sql/core/benchmarks/WideSchemaBenchmark-results.txt
+++ b/sql/core/benchmarks/WideSchemaBenchmark-results.txt
@@ -1,93 +1,117 @@
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
+
 parsing large select:                    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------
-1 select expressions                             3 /    5          0.0     2967064.0       1.0X
-100 select expressions                          11 /   12          0.0    11369518.0       0.3X
-2500 select expressions                        243 /  250          0.0   242561004.0       0.0X
+1 select expressions                             2 /    4          0.0     2050147.0       1.0X
+100 select expressions                           6 /    7          0.0     6123412.0       0.3X
+2500 select expressions                        135 /  141          0.0   134623148.0       0.0X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 many column field r/w:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------
-1 cols x 100000 rows (read in-mem)              28 /   40          3.6         278.8       1.0X
-1 cols x 100000 rows (exec in-mem)              28 /   42          3.5         284.0       1.0X
-1 cols x 100000 rows (read parquet)             23 /   35          4.4         228.8       1.2X
-1 cols x 100000 rows (write parquet)           163 /  182          0.6        1633.0       0.2X
-100 cols x 1000 rows (read in-mem)              27 /   39          3.7         266.9       1.0X
-100 cols x 1000 rows (exec in-mem)              48 /   79          2.1         481.7       0.6X
-100 cols x 1000 rows (read parquet)             25 /   36          3.9         254.3       1.1X
-100 cols x 1000 rows (write parquet)           182 /  196          0.5        1819.5       0.2X
-2500 cols x 40 rows (read in-mem)              280 /  315          0.4        2797.1       0.1X
-2500 cols x 40 rows (exec in-mem)              606 /  638          0.2        6064.3       0.0X
-2500 cols x 40 rows (read parquet)             836 /  843          0.1        8356.4       0.0X
-2500 cols x 40 rows (write parquet)            490 /  522          0.2        4900.6       0.1X
+1 cols x 100000 rows (read in-mem)              16 /   18          6.3         158.6       1.0X
+1 cols x 100000 rows (exec in-mem)              17 /   19          6.0         166.7       1.0X
+1 cols x 100000 rows (read parquet)             24 /   26          4.3         235.1       0.7X
+1 cols x 100000 rows (write parquet)            81 /   85          1.2         811.3       0.2X
+100 cols x 1000 rows (read in-mem)              17 /   19          6.0         166.2       1.0X
+100 cols x 1000 rows (exec in-mem)              25 /   27          4.0         249.2       0.6X
+100 cols x 1000 rows (read parquet)             23 /   25          4.4         226.0       0.7X
+100 cols x 1000 rows (write parquet)            83 /   87          1.2         831.0       0.2X
+2500 cols x 40 rows (read in-mem)              132 /  137          0.8        1322.9       0.1X
+2500 cols x 40 rows (exec in-mem)              326 /  330          0.3        3260.6       0.0X
+2500 cols x 40 rows (read parquet)             831 /  839          0.1        8305.8       0.0X
+2500 cols x 40 rows (write parquet)            237 /  245          0.4        2372.6       0.1X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 wide shallowly nested struct field r/w:  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------
-1 wide x 100000 rows (read in-mem)              22 /   35          4.6         216.0       1.0X
-1 wide x 100000 rows (exec in-mem)              40 /   63          2.5         400.6       0.5X
-1 wide x 100000 rows (read parquet)             93 /  134          1.1         933.9       0.2X
-1 wide x 100000 rows (write parquet)           133 /  174          0.7        1334.3       0.2X
-100 wide x 1000 rows (read in-mem)              22 /   44          4.5         223.3       1.0X
-100 wide x 1000 rows (exec in-mem)              88 /  138          1.1         878.6       0.2X
-100 wide x 1000 rows (read parquet)            117 /  186          0.9        1172.0       0.2X
-100 wide x 1000 rows (write parquet)           144 /  174          0.7        1441.6       0.1X
-2500 wide x 40 rows (read in-mem)               36 /   57          2.8         358.9       0.6X
-2500 wide x 40 rows (exec in-mem)             1466 / 1507          0.1       14656.6       0.0X
-2500 wide x 40 rows (read parquet)             690 /  802          0.1        6898.2       0.0X
-2500 wide x 40 rows (write parquet)            197 /  207          0.5        1970.9       0.1X
+1 wide x 100000 rows (read in-mem)              15 /   17          6.6         151.0       1.0X
+1 wide x 100000 rows (exec in-mem)              20 /   22          5.1         196.6       0.8X
+1 wide x 100000 rows (read parquet)             59 /   63          1.7         592.8       0.3X
+1 wide x 100000 rows (write parquet)            81 /   87          1.2         814.6       0.2X
+100 wide x 1000 rows (read in-mem)              21 /   25          4.8         208.7       0.7X
+100 wide x 1000 rows (exec in-mem)              72 /   81          1.4         718.5       0.2X
+100 wide x 1000 rows (read parquet)             75 /   85          1.3         752.6       0.2X
+100 wide x 1000 rows (write parquet)            88 /   95          1.1         876.7       0.2X
+2500 wide x 40 rows (read in-mem)               28 /   34          3.5         282.2       0.5X
+2500 wide x 40 rows (exec in-mem)             1269 / 1284          0.1       12688.1       0.0X
+2500 wide x 40 rows (read parquet)             549 /  578          0.2        5493.4       0.0X
+2500 wide x 40 rows (write parquet)             96 /  104          1.0         959.1       0.2X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 deeply nested struct field r/w:          Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------
-1 deep x 100000 rows (read in-mem)              22 /   35          4.5         223.9       1.0X
-1 deep x 100000 rows (exec in-mem)              28 /   52          3.6         280.6       0.8X
-1 deep x 100000 rows (read parquet)             41 /   65          2.4         410.5       0.5X
-1 deep x 100000 rows (write parquet)           163 /  173          0.6        1634.5       0.1X
-100 deep x 1000 rows (read in-mem)              43 /   63          2.3         425.9       0.5X
-100 deep x 1000 rows (exec in-mem)             232 /  280          0.4        2321.7       0.1X
-100 deep x 1000 rows (read parquet)           1989 / 2281          0.1       19886.6       0.0X
-100 deep x 1000 rows (write parquet)           144 /  184          0.7        1442.6       0.2X
-250 deep x 400 rows (read in-mem)               68 /   95          1.5         680.9       0.3X
-250 deep x 400 rows (exec in-mem)             1310 / 1403          0.1       13096.4       0.0X
-250 deep x 400 rows (read parquet)          41477 / 41847          0.0      414766.8       0.0X
-250 deep x 400 rows (write parquet)            243 /  272          0.4        2433.1       0.1X
+1 deep x 100000 rows (read in-mem)              14 /   16          7.0         143.8       1.0X
+1 deep x 100000 rows (exec in-mem)              17 /   19          5.9         169.7       0.8X
+1 deep x 100000 rows (read parquet)             33 /   35          3.1         327.0       0.4X
+1 deep x 100000 rows (write parquet)            79 /   84          1.3         786.9       0.2X
+100 deep x 1000 rows (read in-mem)              21 /   24          4.7         211.3       0.7X
+100 deep x 1000 rows (exec in-mem)             221 /  235          0.5        2214.5       0.1X
+100 deep x 1000 rows (read parquet)           1928 / 1952          0.1       19277.1       0.0X
+100 deep x 1000 rows (write parquet)            91 /   96          1.1         909.5       0.2X
+250 deep x 400 rows (read in-mem)               57 /   61          1.8         567.1       0.3X
+250 deep x 400 rows (exec in-mem)             1329 / 1385          0.1       13291.8       0.0X
+250 deep x 400 rows (read parquet)          36563 / 36750          0.0      365630.2       0.0X
+250 deep x 400 rows (write parquet)            126 /  130          0.8        1262.0       0.1X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 bushy struct field r/w:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------
-1 x 1 deep x 100000 rows (read in-mem)          23 /   36          4.4         229.8       1.0X
-1 x 1 deep x 100000 rows (exec in-mem)          27 /   48          3.7         269.6       0.9X
-1 x 1 deep x 100000 rows (read parquet)         25 /   33          4.0         247.5       0.9X
-1 x 1 deep x 100000 rows (write parquet)        82 /  134          1.2         821.1       0.3X
-128 x 8 deep x 1000 rows (read in-mem)          19 /   29          5.3         189.5       1.2X
-128 x 8 deep x 1000 rows (exec in-mem)         144 /  165          0.7        1440.4       0.2X
-128 x 8 deep x 1000 rows (read parquet)        117 /  159          0.9        1174.4       0.2X
-128 x 8 deep x 1000 rows (write parquet)       135 /  162          0.7        1349.0       0.2X
-1024 x 11 deep x 100 rows (read in-mem)         30 /   49          3.3         304.4       0.8X
-1024 x 11 deep x 100 rows (exec in-mem)       1146 / 1183          0.1       11457.6       0.0X
-1024 x 11 deep x 100 rows (read parquet)       712 /  758          0.1        7119.5       0.0X
-1024 x 11 deep x 100 rows (write parquet)       104 /  143          1.0        1037.3       0.2X
+1 x 1 deep x 100000 rows (read in-mem)          13 /   15          7.8         127.7       1.0X
+1 x 1 deep x 100000 rows (exec in-mem)          15 /   17          6.6         151.5       0.8X
+1 x 1 deep x 100000 rows (read parquet)         20 /   23          5.0         198.3       0.6X
+1 x 1 deep x 100000 rows (write parquet)        77 /   82          1.3         770.4       0.2X
+128 x 8 deep x 1000 rows (read in-mem)          12 /   14          8.2         122.5       1.0X
+128 x 8 deep x 1000 rows (exec in-mem)         124 /  140          0.8        1241.2       0.1X
+128 x 8 deep x 1000 rows (read parquet)         69 /   74          1.4         693.9       0.2X
+128 x 8 deep x 1000 rows (write parquet)        78 /   83          1.3         777.7       0.2X
+1024 x 11 deep x 100 rows (read in-mem)         25 /   29          4.1         246.1       0.5X
+1024 x 11 deep x 100 rows (exec in-mem)       1197 / 1223          0.1       11974.6       0.0X
+1024 x 11 deep x 100 rows (read parquet)       426 /  433          0.2        4263.7       0.0X
+1024 x 11 deep x 100 rows (write parquet)        91 /   98          1.1         913.5       0.1X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 wide array field r/w:                    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------
-1 wide x 100000 rows (read in-mem)              18 /   31          5.6         179.3       1.0X
-1 wide x 100000 rows (exec in-mem)              31 /   47          3.2         310.2       0.6X
-1 wide x 100000 rows (read parquet)             45 /   73          2.2         445.1       0.4X
-1 wide x 100000 rows (write parquet)           109 /  140          0.9        1085.9       0.2X
-100 wide x 1000 rows (read in-mem)              17 /   25          5.8         172.7       1.0X
-100 wide x 1000 rows (exec in-mem)              18 /   22          5.4         184.6       1.0X
-100 wide x 1000 rows (read parquet)             26 /   42          3.8         261.8       0.7X
-100 wide x 1000 rows (write parquet)           150 /  164          0.7        1499.4       0.1X
-2500 wide x 40 rows (read in-mem)               19 /   31          5.1         194.7       0.9X
-2500 wide x 40 rows (exec in-mem)               19 /   24          5.3         188.5       1.0X
-2500 wide x 40 rows (read parquet)              33 /   47          3.0         334.4       0.5X
-2500 wide x 40 rows (write parquet)            153 /  164          0.7        1528.2       0.1X
+1 wide x 100000 rows (read in-mem)              14 /   16          7.0         143.2       1.0X
+1 wide x 100000 rows (exec in-mem)              17 /   19          5.9         170.9       0.8X
+1 wide x 100000 rows (read parquet)             43 /   46          2.3         434.1       0.3X
+1 wide x 100000 rows (write parquet)            78 /   83          1.3         777.6       0.2X
+100 wide x 1000 rows (read in-mem)              11 /   13          9.0         111.5       1.3X
+100 wide x 1000 rows (exec in-mem)              13 /   15          7.8         128.3       1.1X
+100 wide x 1000 rows (read parquet)             24 /   27          4.1         245.0       0.6X
+100 wide x 1000 rows (write parquet)            74 /   80          1.4         740.5       0.2X
+2500 wide x 40 rows (read in-mem)               11 /   13          9.1         109.5       1.3X
+2500 wide x 40 rows (exec in-mem)               13 /   15          7.7         129.4       1.1X
+2500 wide x 40 rows (read parquet)              24 /   26          4.1         241.3       0.6X
+2500 wide x 40 rows (write parquet)             75 /   81          1.3         751.8       0.2X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
+
+wide map field r/w:                      Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+1 wide x 100000 rows (read in-mem)              16 /   18          6.2         162.6       1.0X
+1 wide x 100000 rows (exec in-mem)              21 /   23          4.8         208.2       0.8X
+1 wide x 100000 rows (read parquet)             54 /   59          1.8         543.6       0.3X
+1 wide x 100000 rows (write parquet)            80 /   86          1.2         804.5       0.2X
+100 wide x 1000 rows (read in-mem)              11 /   13          8.7         114.5       1.4X
+100 wide x 1000 rows (exec in-mem)              14 /   16          7.0         143.5       1.1X
+100 wide x 1000 rows (read parquet)             30 /   32          3.3         300.4       0.5X
+100 wide x 1000 rows (write parquet)            75 /   80          1.3         749.9       0.2X
+2500 wide x 40 rows (read in-mem)               13 /   15          7.8         128.1       1.3X
+2500 wide x 40 rows (exec in-mem)               15 /   18          6.5         153.6       1.1X
+2500 wide x 40 rows (read parquet)              30 /   33          3.3         304.4       0.5X
+2500 wide x 40 rows (write parquet)             77 /   83          1.3         768.5       0.2X
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org