Posted to commits@spark.apache.org by we...@apache.org on 2016/09/07 00:13:17 UTC

spark git commit: [SPARK-17408][TEST] Flaky test: org.apache.spark.sql.hive.StatisticsSuite

Repository: spark
Updated Branches:
  refs/heads/master c07cbb353 -> a40657bfd


[SPARK-17408][TEST] Flaky test: org.apache.spark.sql.hive.StatisticsSuite

### What changes were proposed in this pull request?
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/64956/testReport/junit/org.apache.spark.sql.hive/StatisticsSuite/test_statistics_of_LogicalRelation_converted_from_MetastoreRelation/
```
org.apache.spark.sql.hive.StatisticsSuite.test statistics of LogicalRelation converted from MetastoreRelation

Failing for the past 1 build (Since Failed#64956 )
Took 1.4 sec.
Error Message

org.scalatest.exceptions.TestFailedException: 6871 did not equal 4236
Stacktrace

sbt.ForkMain$ForkError: org.scalatest.exceptions.TestFailedException: 6871 did not equal 4236
	at org.scalatest.Assertions$class.newAssertionFailedException(Assertions.scala:500)
```

This fix no longer checks the exact value of `sizeInBytes`, which can vary across environments and makes the test flaky. Instead, we verify that it is larger than zero and compare the values fetched before and after each operation (for example, inserting more data should strictly increase the size).
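
As a minimal sketch of this relative-comparison pattern (the table name and DDL here are hypothetical, not part of the patch; `checkStats` is the helper introduced in the diff below):
```
// Hypothetical illustration: assert stats exist and are positive, then
// compare values fetched before and after an operation instead of
// hard-coding exact byte sizes.
val table = "sizeTest"  // hypothetical table name
sql(s"CREATE TABLE $table (key STRING, value STRING) STORED AS TEXTFILE")
sql(s"INSERT INTO TABLE $table SELECT * FROM src")

sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan")
val statsBefore = checkStats(
  table, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None)
assert(statsBefore.get.sizeInBytes > 0)  // no exact-byte assertion

// Doubling the data should strictly grow sizeInBytes, whatever its exact value.
sql(s"INSERT INTO TABLE $table SELECT * FROM src")
sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan")
val statsAfter = checkStats(
  table, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None)
assert(statsAfter.get.sizeInBytes > statsBefore.get.sizeInBytes)
```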

In addition, we combine `checkMetastoreRelationStats` and `checkLogicalRelationStats` into a single checking function, `checkStats`.

### How was this patch tested?
N/A

Author: gatorsmile <ga...@gmail.com>

Closes #14978 from gatorsmile/spark17408.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a40657bf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a40657bf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a40657bf

Branch: refs/heads/master
Commit: a40657bfd375bd27d65204bb42ed0cbd7bd1ebf2
Parents: c07cbb3
Author: gatorsmile <ga...@gmail.com>
Authored: Wed Sep 7 08:13:12 2016 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Wed Sep 7 08:13:12 2016 +0800

----------------------------------------------------------------------
 .../apache/spark/sql/hive/StatisticsSuite.scala | 141 +++++++++++--------
 1 file changed, 80 insertions(+), 61 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a40657bf/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index 33ed675..9956706 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -171,23 +171,37 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
       TableIdentifier("tempTable"), ignoreIfNotExists = true, purge = false)
   }
 
-  private def checkMetastoreRelationStats(
+  private def checkStats(
+      stats: Option[Statistics],
+      hasSizeInBytes: Boolean,
+      expectedRowCounts: Option[Int]): Unit = {
+    if (hasSizeInBytes || expectedRowCounts.nonEmpty) {
+      assert(stats.isDefined)
+      assert(stats.get.sizeInBytes > 0)
+      assert(stats.get.rowCount === expectedRowCounts)
+    } else {
+      assert(stats.isEmpty)
+    }
+  }
+
+  private def checkStats(
       tableName: String,
-      expectedStats: Option[Statistics]): Unit = {
+      isDataSourceTable: Boolean,
+      hasSizeInBytes: Boolean,
+      expectedRowCounts: Option[Int]): Option[Statistics] = {
     val df = sql(s"SELECT * FROM $tableName")
-    val relations = df.queryExecution.analyzed.collect { case rel: MetastoreRelation =>
-      expectedStats match {
-        case Some(es) =>
-          assert(rel.catalogTable.stats.isDefined)
-          val stats = rel.catalogTable.stats.get
-          assert(stats.sizeInBytes === es.sizeInBytes)
-          assert(stats.rowCount === es.rowCount)
-        case None =>
-          assert(rel.catalogTable.stats.isEmpty)
-      }
-      rel
+    val stats = df.queryExecution.analyzed.collect {
+      case rel: MetastoreRelation =>
+        checkStats(rel.catalogTable.stats, hasSizeInBytes, expectedRowCounts)
+        assert(!isDataSourceTable, "Expected a data source table, but got a Hive serde table")
+        rel.catalogTable.stats
+      case rel: LogicalRelation =>
+        checkStats(rel.catalogTable.get.stats, hasSizeInBytes, expectedRowCounts)
+        assert(isDataSourceTable, "Expected a Hive serde table, but got a data source table")
+        rel.catalogTable.get.stats
     }
-    assert(relations.size === 1)
+    assert(stats.size == 1)
+    stats.head
   }
 
   test("test table-level statistics for hive tables created in HiveExternalCatalog") {
@@ -196,19 +210,28 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
       // Currently Spark's statistics are self-contained, we don't have statistics until we use
       // the `ANALYZE TABLE` command.
       sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
-      checkMetastoreRelationStats(textTable, expectedStats = None)
+      checkStats(
+        textTable,
+        isDataSourceTable = false,
+        hasSizeInBytes = false,
+        expectedRowCounts = None)
       sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
-      checkMetastoreRelationStats(textTable, expectedStats = None)
+      checkStats(
+        textTable,
+        isDataSourceTable = false,
+        hasSizeInBytes = false,
+        expectedRowCounts = None)
 
       // noscan won't count the number of rows
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
-      checkMetastoreRelationStats(textTable, expectedStats =
-        Some(Statistics(sizeInBytes = 5812, rowCount = None)))
+      val fetchedStats1 = checkStats(
+        textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None)
 
       // without noscan, we count the number of rows
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
-      checkMetastoreRelationStats(textTable, expectedStats =
-          Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+      val fetchedStats2 = checkStats(
+        textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500))
+      assert(fetchedStats1.get.sizeInBytes == fetchedStats2.get.sizeInBytes)
     }
   }
 
@@ -218,40 +241,22 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
       sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
       sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
-      checkMetastoreRelationStats(textTable, expectedStats =
-        Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+      val fetchedStats1 = checkStats(
+        textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500))
 
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
       // when the total size is not changed, the old row count is kept
-      checkMetastoreRelationStats(textTable, expectedStats =
-        Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+      val fetchedStats2 = checkStats(
+        textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500))
+      assert(fetchedStats1 == fetchedStats2)
 
       sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
       // update total size and remove the old and invalid row count
-      checkMetastoreRelationStats(textTable, expectedStats =
-        Some(Statistics(sizeInBytes = 11624, rowCount = None)))
-    }
-  }
-
-  private def checkLogicalRelationStats(
-      tableName: String,
-      expectedStats: Option[Statistics]): Unit = {
-    val df = sql(s"SELECT * FROM $tableName")
-    val relations = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
-      assert(rel.catalogTable.isDefined)
-      expectedStats match {
-        case Some(es) =>
-          assert(rel.catalogTable.get.stats.isDefined)
-          val stats = rel.catalogTable.get.stats.get
-          assert(stats.sizeInBytes === es.sizeInBytes)
-          assert(stats.rowCount === es.rowCount)
-        case None =>
-          assert(rel.catalogTable.get.stats.isEmpty)
-      }
-      rel
+      val fetchedStats3 = checkStats(
+        textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None)
+      assert(fetchedStats3.get.sizeInBytes > fetchedStats2.get.sizeInBytes)
     }
-    assert(relations.size === 1)
   }
 
   test("test statistics of LogicalRelation converted from MetastoreRelation") {
@@ -266,16 +271,21 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
       // the default value for `spark.sql.hive.convertMetastoreParquet` is true, here we just set it
       // for robustness
       withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "true") {
-        checkLogicalRelationStats(parquetTable, expectedStats = None)
+        checkStats(
+          parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
         sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
-        checkLogicalRelationStats(parquetTable, expectedStats =
-          Some(Statistics(sizeInBytes = 4236, rowCount = Some(500))))
+        checkStats(
+          parquetTable,
+          isDataSourceTable = true,
+          hasSizeInBytes = true,
+          expectedRowCounts = Some(500))
       }
       withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "true") {
-        checkLogicalRelationStats(orcTable, expectedStats = None)
+        checkStats(
+          orcTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
         sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS")
-        checkLogicalRelationStats(orcTable, expectedStats =
-          Some(Statistics(sizeInBytes = 3023, rowCount = Some(500))))
+        checkStats(
+          orcTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = Some(500))
       }
     }
   }
@@ -288,22 +298,28 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
       assert(DDLUtils.isDatasourceTable(catalogTable))
 
       sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
-      checkLogicalRelationStats(parquetTable, expectedStats = None)
+      checkStats(
+        parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
 
       // noscan won't count the number of rows
       sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
-      checkLogicalRelationStats(parquetTable, expectedStats =
-        Some(Statistics(sizeInBytes = 4236, rowCount = None)))
+      val fetchedStats1 = checkStats(
+        parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
 
       sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
       sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
-      checkLogicalRelationStats(parquetTable, expectedStats =
-        Some(Statistics(sizeInBytes = 8472, rowCount = None)))
+      val fetchedStats2 = checkStats(
+        parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
+      assert(fetchedStats2.get.sizeInBytes > fetchedStats1.get.sizeInBytes)
 
       // without noscan, we count the number of rows
       sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
-      checkLogicalRelationStats(parquetTable, expectedStats =
-        Some(Statistics(sizeInBytes = 8472, rowCount = Some(1000))))
+      val fetchedStats3 = checkStats(
+        parquetTable,
+        isDataSourceTable = true,
+        hasSizeInBytes = true,
+        expectedRowCounts = Some(1000))
+      assert(fetchedStats3.get.sizeInBytes == fetchedStats2.get.sizeInBytes)
     }
   }
 
@@ -314,8 +330,11 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
       val dfNoCols = spark.createDataFrame(rddNoCols, StructType(Seq.empty))
       dfNoCols.write.format("json").saveAsTable(table_no_cols)
       sql(s"ANALYZE TABLE $table_no_cols COMPUTE STATISTICS")
-      checkLogicalRelationStats(table_no_cols, expectedStats =
-        Some(Statistics(sizeInBytes = 30, rowCount = Some(10))))
+      checkStats(
+        table_no_cols,
+        isDataSourceTable = true,
+        hasSizeInBytes = true,
+        expectedRowCounts = Some(10))
     }
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org