You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Hossein Falaki (JIRA)" <jira@apache.org> on 2014/10/29 20:16:34 UTC
[jira] [Updated] (SPARK-4135) Error reading Parquet file generated
with SparkSQL
[ https://issues.apache.org/jira/browse/SPARK-4135?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Hossein Falaki updated SPARK-4135:
----------------------------------
Attachment: _metadata
part-r-1.parquet
Files generated by SparkSQL that cannot be read.
> Error reading Parquet file generated with SparkSQL
> --------------------------------------------------
>
> Key: SPARK-4135
> URL: https://issues.apache.org/jira/browse/SPARK-4135
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 1.1.0
> Reporter: Hossein Falaki
> Attachments: _metadata, part-r-1.parquet
>
>
> I read a tsv version of the Million Song Dataset (available here: http://tbmmsd.s3.amazonaws.com/)
> After reading it I create a SchemaRDD with following schema:
> {code}
> root
> |-- track_id: string (nullable = true)
> |-- analysis_sample_rate: string (nullable = true)
> |-- artist_7digitalid: string (nullable = true)
> |-- artist_familiarity: double (nullable = true)
> |-- artist_hotness: double (nullable = true)
> |-- artist_id: string (nullable = true)
> |-- artist_latitude: string (nullable = true)
> |-- artist_location: string (nullable = true)
> |-- artist_longitude: string (nullable = true)
> |-- artist_mbid: string (nullable = true)
> |-- artist_mbtags: array (nullable = true)
> | |-- element: string (containsNull = true)
> |-- artist_mbtags_count: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- artist_name: string (nullable = true)
> |-- artist_playmeid: string (nullable = true)
> |-- artist_terms: array (nullable = true)
> | |-- element: string (containsNull = true)
> |-- artist_terms_freq: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- artist_terms_weight: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- audio_md5: string (nullable = true)
> |-- bars_confidence: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- bars_start: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- beats_confidence: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- beats_start: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- danceability: double (nullable = true)
> |-- duration: double (nullable = true)
> |-- end_of_fade_in: double (nullable = true)
> |-- energy: double (nullable = true)
> |-- key: string (nullable = true)
> |-- key_confidence: double (nullable = true)
> |-- loudness: double (nullable = true)
> |-- mode: double (nullable = true)
> |-- mode_confidence: double (nullable = true)
> |-- release: string (nullable = true)
> |-- release_7digitalid: string (nullable = true)
> |-- sections_confidence: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- sections_start: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- segments_confidence: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- segments_loudness_max: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- segments_loudness_max_time: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- segments_loudness_start: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- segments_pitches: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- segments_start: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- segments_timbre: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- similar_artists: array (nullable = true)
> | |-- element: string (containsNull = true)
> |-- song_hotness: double (nullable = true)
> |-- song_id: string (nullable = true)
> |-- start_of_fade_out: double (nullable = true)
> |-- tatums_confidence: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- tatums_start: array (nullable = true)
> | |-- element: double (containsNull = true)
> |-- tempo: double (nullable = true)
> |-- time_signature: double (nullable = true)
> |-- time_signature_confidence: double (nullable = true)
> |-- title: string (nullable = true)
> |-- track_7digitalid: string (nullable = true)
> |-- year: double (nullable = true)
> {code}
> I select a single record from it and save it using saveAsParquetFile().
> When I read it later and try to query it I get the following exception:
> {code}
> Error in SQL statement: java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
> at sun.reflect.GeneratedMethodAccessor208.invoke(Unknown Source)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at org.apache.spark.sql.parquet.FilteringParquetRowInputFormat$$anonfun$getSplits$1.apply(ParquetTableOperations.scala:472)
> at org.apache.spark.sql.parquet.FilteringParquetRowInputFormat$$anonfun$getSplits$1.apply(ParquetTableOperations.scala:457)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
> at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
> at org.apache.spark.sql.parquet.FilteringParquetRowInputFormat.getSplits(ParquetTableOperations.scala:457)
> at parquet.hadoop.ParquetInputFormat.getSplits(ParquetInputFormat.java:344)
> at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:95)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
> at scala.Option.getOrElse(Option.scala:120)
> at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
> at org.apache.spark.rdd.MappedRDD.getPartitions(MappedRDD.scala:28)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
> at scala.Option.getOrElse(Option.scala:120)
> at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
> at org.apache.spark.rdd.FilteredRDD.getPartitions(FilteredRDD.scala:29)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
> at scala.Option.getOrElse(Option.scala:120)
> at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
> at org.apache.spark.rdd.MappedRDD.getPartitions(MappedRDD.scala:28)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
> at scala.Option.getOrElse(Option.scala:120)
> at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
> at org.apache.spark.sql.execution.Limit.executeCollect(basicOperators.scala:118)
> at org.apache.spark.sql.SchemaRDD.collect(SchemaRDD.scala:454)
> at org.apache.spark.sql.SchemaRDD.take(SchemaRDD.scala:456)
> at com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation0(OutputAggregator.scala:67)
> at com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation(OutputAggregator.scala:40)
> at com.databricks.backend.daemon.driver.DriverLocal.executeSql(DriverLocal.scala:214)
> at com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:83)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper$$anonfun$2.apply(DriverWrapper.scala:398)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper$$anonfun$2.apply(DriverWrapper.scala:398)
> at scala.util.Try$.apply(Try.scala:161)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper.executeCommand(DriverWrapper.scala:395)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper.runInner(DriverWrapper.scala:301)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper.run(DriverWrapper.scala:186)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: parquet.io.ParquetDecodingException: the row group is not in hdfs blocks in the file: midpoint of row groups is 324413162, the end of the hdfs block is 209035672
> at parquet.hadoop.ParquetInputFormat$HDFSBlocks.checkBelongingToANewHDFSBlock(ParquetInputFormat.java:193)
> at parquet.hadoop.ParquetInputFormat$HDFSBlocks.access$100(ParquetInputFormat.java:158)
> at parquet.hadoop.ParquetInputFormat.generateSplits(ParquetInputFormat.java:304)
> ... 45 more
> at com.databricks.backend.daemon.driver.DriverLocal.executeSql(DriverLocal.scala:224)
> at com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:83)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper$$anonfun$2.apply(DriverWrapper.scala:398)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper$$anonfun$2.apply(DriverWrapper.scala:398)
> at scala.util.Try$.apply(Try.scala:161)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper.executeCommand(DriverWrapper.scala:395)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper.runInner(DriverWrapper.scala:301)
> at com.databricks.backend.daemon.chauffeur.DriverWrapper.run(DriverWrapper.scala:186)
> at java.lang.Thread.run(Thread.java:745)
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org