You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Shixiong Zhu (JIRA)" <ji...@apache.org> on 2017/03/16 02:10:42 UTC
[jira] [Created] (SPARK-19965) DataFrame batch reader may fail to infer partitions when reading FileStreamSink's output

Shixiong Zhu created SPARK-19965:
------------------------------------

             Summary: DataFrame batch reader may fail to infer partitions when reading FileStreamSink's output
                 Key: SPARK-19965
                 URL: https://issues.apache.org/jira/browse/SPARK-19965
             Project: Spark
          Issue Type: Bug
          Components: Structured Streaming
    Affects Versions: 2.1.0
            Reporter: Shixiong Zhu


Reproducer
{code}
  test("partitioned writing and batch reading with 'basePath'") {
    val inputData = MemoryStream[Int]
    val ds = inputData.toDS()

    val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath
    val checkpointDir = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath

    var query: StreamingQuery = null

    try {
      query =
        ds.map(i => (i, i * 1000))
          .toDF("id", "value")
          .writeStream
          .partitionBy("id")
          .option("checkpointLocation", checkpointDir)
          .format("parquet")
          .start(outputDir)

      inputData.addData(1, 2, 3)
      failAfter(streamingTimeout) {
        query.processAllAvailable()
      }

      spark.read.option("basePath", outputDir).parquet(outputDir + "/*").show()
    } finally {
      if (query != null) {
        query.stop()
      }
    }
  }
{code}

Stack trace
{code}
[info] - partitioned writing and batch reading with 'basePath' *** FAILED *** (3 seconds, 928 milliseconds)
[info]   java.lang.AssertionError: assertion failed: Conflicting directory structures detected. Suspicious paths:
[info] 	***/stream.output-65e3fa45-595a-4d29-b3df-4c001e321637
[info] 	***/stream.output-65e3fa45-595a-4d29-b3df-4c001e321637/_spark_metadata
[info] 
[info] If provided paths are partition directories, please set "basePath" in the options of the data source to specify the root directory of the table. If there are multiple root directories, please load them separately and then union them.
[info]   at scala.Predef$.assert(Predef.scala:170)
[info]   at org.apache.spark.sql.execution.datasources.PartitioningUtils$.parsePartitions(PartitioningUtils.scala:133)
[info]   at org.apache.spark.sql.execution.datasources.PartitioningUtils$.parsePartitions(PartitioningUtils.scala:98)
[info]   at org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex.inferPartitioning(PartitioningAwareFileIndex.scala:156)
[info]   at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.partitionSpec(InMemoryFileIndex.scala:54)
[info]   at org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex.partitionSchema(PartitioningAwareFileIndex.scala:55)
[info]   at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:133)
[info]   at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:361)
[info]   at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:160)
[info]   at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:536)
[info]   at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:520)
[info]   at org.apache.spark.sql.streaming.FileStreamSinkSuite$$anonfun$8.apply$mcV$sp(FileStreamSinkSuite.scala:292)
[info]   at org.apache.spark.sql.streaming.FileStreamSinkSuite$$anonfun$8.apply(FileStreamSinkSuite.scala:268)
[info]   at org.apache.spark.sql.streaming.FileStreamSinkSuite$$anonfun$8.apply(FileStreamSinkSuite.scala:268)
{code}



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org