Posted to issues@spark.apache.org by "Tom Ogle (JIRA)" <ji...@apache.org> on 2017/06/21 14:56:00 UTC

[jira] [Created] (SPARK-21162) Cannot count rows in an empty Hive table stored as parquet when spark.sql.parquet.cacheMetadata is set to false

Tom Ogle created SPARK-21162:
--------------------------------

             Summary: Cannot count rows in an empty Hive table stored as parquet when spark.sql.parquet.cacheMetadata is set to false
                 Key: SPARK-21162
                 URL: https://issues.apache.org/jira/browse/SPARK-21162
             Project: Spark
          Issue Type: Bug
          Components: SQL
    Affects Versions: 1.6.3, 1.6.2
            Reporter: Tom Ogle


With spark.sql.parquet.cacheMetadata set to false, creating an empty Hive table stored as Parquet and then counting its rows with Spark SQL throws an IOException. The issue does not affect Spark 2.x. It is particularly inconvenient in environments running Spark 1.6.x where spark.sql.parquet.cacheMetadata is explicitly set to false, such as Google DataProc 1.0.
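
Where overriding the environment's setting is acceptable, re-enabling the default metadata caching appears to avoid the failing code path. This is only a workaround sketch based on the behaviour described above, not a fix:

{code}
// Workaround sketch (assumes overriding the platform default is acceptable):
// re-enable Parquet metadata caching so counting the empty table succeeds.
hiveContext.setConf("spark.sql.parquet.cacheMetadata", "true")
{code}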

Here is the stacktrace:

{code}
17/06/21 15:30:10 INFO ParquetRelation: Reading Parquet file(s) from 
Exception in thread "main" org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
TungstenAggregate(key=[], functions=[(count(1),mode=Final,isDistinct=false)], output=[count#30L])
+- TungstenExchange SinglePartition, None
   +- TungstenAggregate(key=[], functions=[(count(1),mode=Partial,isDistinct=false)], output=[count#33L])
      +- Scan ParquetRelation: my_test_db.test_table[] InputPaths: <snip>/my_test_db.db/test_table

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
	at org.apache.spark.sql.execution.aggregate.TungstenAggregate.doExecute(TungstenAggregate.scala:80)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:166)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:174)
	at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1500)
	at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1500)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
	at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2087)
	at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$execute$1(DataFrame.scala:1499)
	at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$collect(DataFrame.scala:1506)
	at org.apache.spark.sql.DataFrame$$anonfun$count$1.apply(DataFrame.scala:1516)
	at org.apache.spark.sql.DataFrame$$anonfun$count$1.apply(DataFrame.scala:1515)
	at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2100)
	at org.apache.spark.sql.DataFrame.count(DataFrame.scala:1515)
	at App$.main(App.scala:23)
	at App.main(App.scala)
Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
TungstenExchange SinglePartition, None
+- TungstenAggregate(key=[], functions=[(count(1),mode=Partial,isDistinct=false)], output=[count#33L])
   +- Scan ParquetRelation: my_test_db.test_table[] InputPaths: <snip>/my_test_db.db/test_table

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
	at org.apache.spark.sql.execution.Exchange.doExecute(Exchange.scala:247)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
	at org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1.apply(TungstenAggregate.scala:86)
	at org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1.apply(TungstenAggregate.scala:80)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
	... 19 more
Caused by: java.io.IOException: No input paths specified in job
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:231)
	at org.apache.parquet.hadoop.ParquetInputFormat.listStatus(ParquetInputFormat.java:339)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$buildInternalScan$1$$anon$1$$anon$4.listStatus(ParquetRelation.scala:358)
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:340)
	at org.apache.parquet.hadoop.ParquetInputFormat.getSplits(ParquetInputFormat.java:294)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anonfun$buildInternalScan$1$$anon$1.getPartitions(ParquetRelation.scala:363)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
	at org.apache.spark.sql.execution.Exchange.prepareShuffleDependency(Exchange.scala:220)
	at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:254)
	at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:248)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
	... 27 more
{code}
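
The root cause ("No input paths specified in job" from FileInputFormat.listStatus) suggests that, with metadata caching disabled, the Parquet scan is built with an empty input-path list for the newly created, still-empty table directory. As an illustration only (not a proposed fix for Spark itself), a caller-side guard along the following lines avoids triggering the scan; the warehouse directory layout is assumed from the InputPaths line above, and safeCount is a hypothetical helper:

{code}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.hive.HiveContext

// Hypothetical caller-side guard, illustration only: skip the Parquet scan
// when the table directory contains no data files. The layout
// <warehouseDir>/my_test_db.db/test_table is assumed from the stacktrace.
def safeCount(hiveContext: HiveContext, warehouseDir: String,
              databaseName: String, tableName: String): Long = {
  val fs = FileSystem.get(hiveContext.sparkContext.hadoopConfiguration)
  val tableDir = new Path(s"$warehouseDir/$databaseName.db/$tableName")
  val hasDataFiles = fs.exists(tableDir) && fs.listStatus(tableDir).nonEmpty
  if (hasDataFiles) hiveContext.table(s"$databaseName.$tableName").count() else 0L
}
{code}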

Here is some Scala code to reproduce the issue locally:

App.scala:
{code}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object App {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Testing Issue").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val hiveContext = new HiveContext(sc)
    // Disabling Parquet metadata caching is what triggers the failure below.
    hiveContext.setConf("spark.sql.parquet.cacheMetadata", "false")

    val databaseName = "my_test_db"
    val tableName = "test_table"
    val fullTableName = databaseName + "." + tableName

    // Start from a clean state, then create an empty Parquet-backed table.
    hiveContext.sql("DROP TABLE IF EXISTS " + fullTableName)
    hiveContext.sql("DROP DATABASE IF EXISTS " + databaseName)
    hiveContext.sql("CREATE DATABASE IF NOT EXISTS " + databaseName)
    hiveContext.sql(
      s"""CREATE TABLE IF NOT EXISTS $fullTableName
         | (x string) stored as parquet
       """.stripMargin)

    // Counting the rows of the empty table throws the IOException above.
    hiveContext.table(fullTableName).count()
    sc.stop()
  }
}
{code}
build.sbt:
{code}
name := "test-issue"

version := "1.0"

scalaVersion := "2.10.5"

val sparkVersion = "1.6.3"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion,
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-hive" % sparkVersion
)
{code}
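
With the two files above in a standard sbt project layout, running the application (for example via sbt run) should reproduce the stacktrace; removing the setConf call, or setting spark.sql.parquet.cacheMetadata to true, should make the count return 0 as expected.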
