You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2016/09/07 00:13:17 UTC
spark git commit: [SPARK-17408][TEST] Flaky test:
org.apache.spark.sql.hive.StatisticsSuite
Repository: spark
Updated Branches:
refs/heads/master c07cbb353 -> a40657bfd
[SPARK-17408][TEST] Flaky test: org.apache.spark.sql.hive.StatisticsSuite
### What changes were proposed in this pull request?
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/64956/testReport/junit/org.apache.spark.sql.hive/StatisticsSuite/test_statistics_of_LogicalRelation_converted_from_MetastoreRelation/
```
org.apache.spark.sql.hive.StatisticsSuite.test statistics of LogicalRelation converted from MetastoreRelation
Failing for the past 1 build (Since Failed#64956 )
Took 1.4 sec.
Error Message
org.scalatest.exceptions.TestFailedException: 6871 did not equal 4236
Stacktrace
sbt.ForkMain$ForkError: org.scalatest.exceptions.TestFailedException: 6871 did not equal 4236
at org.scalatest.Assertions$class.newAssertionFailedException(Assertions.scala:500)
```
This fix no longer checks the exact value of `sizeInBytes`. Instead, we check that it is larger than zero and compare the values fetched at different points in the same test (for example, before and after inserting more data).
In addition, we combine `checkMetastoreRelationStats` and `checkLogicalRelationStats` into a single checking function, `checkStats`.
### How was this patch tested?
N/A
Author: gatorsmile <ga...@gmail.com>
Closes #14978 from gatorsmile/spark17408.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a40657bf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a40657bf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a40657bf
Branch: refs/heads/master
Commit: a40657bfd375bd27d65204bb42ed0cbd7bd1ebf2
Parents: c07cbb3
Author: gatorsmile <ga...@gmail.com>
Authored: Wed Sep 7 08:13:12 2016 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Wed Sep 7 08:13:12 2016 +0800
----------------------------------------------------------------------
.../apache/spark/sql/hive/StatisticsSuite.scala | 141 +++++++++++--------
1 file changed, 80 insertions(+), 61 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/a40657bf/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index 33ed675..9956706 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -171,23 +171,37 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
TableIdentifier("tempTable"), ignoreIfNotExists = true, purge = false)
}
- private def checkMetastoreRelationStats(
+ private def checkStats(
+ stats: Option[Statistics],
+ hasSizeInBytes: Boolean,
+ expectedRowCounts: Option[Int]): Unit = {
+ if (hasSizeInBytes || expectedRowCounts.nonEmpty) {
+ assert(stats.isDefined)
+ assert(stats.get.sizeInBytes > 0)
+ assert(stats.get.rowCount === expectedRowCounts)
+ } else {
+ assert(stats.isEmpty)
+ }
+ }
+
+ private def checkStats(
tableName: String,
- expectedStats: Option[Statistics]): Unit = {
+ isDataSourceTable: Boolean,
+ hasSizeInBytes: Boolean,
+ expectedRowCounts: Option[Int]): Option[Statistics] = {
val df = sql(s"SELECT * FROM $tableName")
- val relations = df.queryExecution.analyzed.collect { case rel: MetastoreRelation =>
- expectedStats match {
- case Some(es) =>
- assert(rel.catalogTable.stats.isDefined)
- val stats = rel.catalogTable.stats.get
- assert(stats.sizeInBytes === es.sizeInBytes)
- assert(stats.rowCount === es.rowCount)
- case None =>
- assert(rel.catalogTable.stats.isEmpty)
- }
- rel
+ val stats = df.queryExecution.analyzed.collect {
+ case rel: MetastoreRelation =>
+ checkStats(rel.catalogTable.stats, hasSizeInBytes, expectedRowCounts)
+ assert(!isDataSourceTable, "Expected a data source table, but got a Hive serde table")
+ rel.catalogTable.stats
+ case rel: LogicalRelation =>
+ checkStats(rel.catalogTable.get.stats, hasSizeInBytes, expectedRowCounts)
+ assert(isDataSourceTable, "Expected a Hive serde table, but got a data source table")
+ rel.catalogTable.get.stats
}
- assert(relations.size === 1)
+ assert(stats.size == 1)
+ stats.head
}
test("test table-level statistics for hive tables created in HiveExternalCatalog") {
@@ -196,19 +210,28 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
// Currently Spark's statistics are self-contained, we don't have statistics until we use
// the `ANALYZE TABLE` command.
sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
- checkMetastoreRelationStats(textTable, expectedStats = None)
+ checkStats(
+ textTable,
+ isDataSourceTable = false,
+ hasSizeInBytes = false,
+ expectedRowCounts = None)
sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
- checkMetastoreRelationStats(textTable, expectedStats = None)
+ checkStats(
+ textTable,
+ isDataSourceTable = false,
+ hasSizeInBytes = false,
+ expectedRowCounts = None)
// noscan won't count the number of rows
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 5812, rowCount = None)))
+ val fetchedStats1 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None)
// without noscan, we count the number of rows
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+ val fetchedStats2 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500))
+ assert(fetchedStats1.get.sizeInBytes == fetchedStats2.get.sizeInBytes)
}
}
@@ -218,40 +241,22 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+ val fetchedStats1 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500))
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
// when the total size is not changed, the old row count is kept
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+ val fetchedStats2 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500))
+ assert(fetchedStats1 == fetchedStats2)
sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
// update total size and remove the old and invalid row count
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 11624, rowCount = None)))
- }
- }
-
- private def checkLogicalRelationStats(
- tableName: String,
- expectedStats: Option[Statistics]): Unit = {
- val df = sql(s"SELECT * FROM $tableName")
- val relations = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
- assert(rel.catalogTable.isDefined)
- expectedStats match {
- case Some(es) =>
- assert(rel.catalogTable.get.stats.isDefined)
- val stats = rel.catalogTable.get.stats.get
- assert(stats.sizeInBytes === es.sizeInBytes)
- assert(stats.rowCount === es.rowCount)
- case None =>
- assert(rel.catalogTable.get.stats.isEmpty)
- }
- rel
+ val fetchedStats3 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None)
+ assert(fetchedStats3.get.sizeInBytes > fetchedStats2.get.sizeInBytes)
}
- assert(relations.size === 1)
}
test("test statistics of LogicalRelation converted from MetastoreRelation") {
@@ -266,16 +271,21 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
// the default value for `spark.sql.hive.convertMetastoreParquet` is true, here we just set it
// for robustness
withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "true") {
- checkLogicalRelationStats(parquetTable, expectedStats = None)
+ checkStats(
+ parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
- checkLogicalRelationStats(parquetTable, expectedStats =
- Some(Statistics(sizeInBytes = 4236, rowCount = Some(500))))
+ checkStats(
+ parquetTable,
+ isDataSourceTable = true,
+ hasSizeInBytes = true,
+ expectedRowCounts = Some(500))
}
withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "true") {
- checkLogicalRelationStats(orcTable, expectedStats = None)
+ checkStats(
+ orcTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS")
- checkLogicalRelationStats(orcTable, expectedStats =
- Some(Statistics(sizeInBytes = 3023, rowCount = Some(500))))
+ checkStats(
+ orcTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = Some(500))
}
}
}
@@ -288,22 +298,28 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
assert(DDLUtils.isDatasourceTable(catalogTable))
sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
- checkLogicalRelationStats(parquetTable, expectedStats = None)
+ checkStats(
+ parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
// noscan won't count the number of rows
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
- checkLogicalRelationStats(parquetTable, expectedStats =
- Some(Statistics(sizeInBytes = 4236, rowCount = None)))
+ val fetchedStats1 = checkStats(
+ parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
- checkLogicalRelationStats(parquetTable, expectedStats =
- Some(Statistics(sizeInBytes = 8472, rowCount = None)))
+ val fetchedStats2 = checkStats(
+ parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
+ assert(fetchedStats2.get.sizeInBytes > fetchedStats1.get.sizeInBytes)
// without noscan, we count the number of rows
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
- checkLogicalRelationStats(parquetTable, expectedStats =
- Some(Statistics(sizeInBytes = 8472, rowCount = Some(1000))))
+ val fetchedStats3 = checkStats(
+ parquetTable,
+ isDataSourceTable = true,
+ hasSizeInBytes = true,
+ expectedRowCounts = Some(1000))
+ assert(fetchedStats3.get.sizeInBytes == fetchedStats2.get.sizeInBytes)
}
}
@@ -314,8 +330,11 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
val dfNoCols = spark.createDataFrame(rddNoCols, StructType(Seq.empty))
dfNoCols.write.format("json").saveAsTable(table_no_cols)
sql(s"ANALYZE TABLE $table_no_cols COMPUTE STATISTICS")
- checkLogicalRelationStats(table_no_cols, expectedStats =
- Some(Statistics(sizeInBytes = 30, rowCount = Some(10))))
+ checkStats(
+ table_no_cols,
+ isDataSourceTable = true,
+ hasSizeInBytes = true,
+ expectedRowCounts = Some(10))
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org