Posted to user@spark.apache.org by Pei Sun <pe...@alluxio.com> on 2016/07/16 21:01:25 UTC

Fwd: Failed to read sharded (2-level) parquet files

 Hi Spark experts,
     Spark version: 2.0.0-preview
     Hadoop version: 2.4 and 2.7 (tried both; neither works)

     The data is in Parquet format and stored in HDFS under a partitioned layout:
/root/file/partition1/file-xxx.parquet
/root/file/partition2/file-xxx.parquet
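
(The partition directories are really Hive-style key=value names; the actual paths look like .../catalog_sales/cs_sold_date_sk=2450815, as in the stack trace below.) For context, this is the layout Spark's own partitioned writer produces; a minimal write-side sketch, where df and the column name "partitionCol" are placeholders, not my real schema:

  // Sketch only: "partitionCol" stands in for the real partition column.
  // This write yields /root/file/partitionCol=<value>/part-xxxx.parquet files.
  df.write.partitionBy("partitionCol").parquet("hdfs://Master:port/root/file")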

Then I did:
sqlContext.read.format("parquet").load("hdfs://Master:port/root/file")
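
which, on 2.0, should be the same as going through the new SparkSession entry point (an equivalent sketch, assuming the shell's predefined spark session):

  // Equivalent read via the Spark 2.0 SparkSession entry point.
  val df = spark.read.format("parquet").load("hdfs://Master:port/root/file")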

It failed with the exception below. The actual command:

val a = sqlContext.read.parquet("hdfs://ec2-54-191-19-229.us-west-2.compute.amazonaws.com:9000/alluxio_storage/tpcds1_r2/catalog_sales")

java.io.FileNotFoundException: Path is not a file: /alluxio_storage/tpcds1_r2/catalog_sales/cs_sold_date_sk=2450815
  at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:75)
  at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:61)
  at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocationsInt(FSNamesystem.java:1828)
  at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1799)
  at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1712)
  at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:587)
  at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:365)
  at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
  at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:616)
  at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:969)
  at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2049)
  at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2045)
  at java.security.AccessController.doPrivileged(Native Method)
  at javax.security.auth.Subject.doAs(Subject.java:415)
  at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
  at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2043)

  at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
  at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
  at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
  at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
  at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106)
  at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:73)
  at org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:1239)
  at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:1224)
  at org.apache.hadoop.hdfs.DFSClient.getBlockLocations(DFSClient.java:1282)
  at org.apache.hadoop.hdfs.DistributedFileSystem$1.doCall(DistributedFileSystem.java:221)
  at org.apache.hadoop.hdfs.DistributedFileSystem$1.doCall(DistributedFileSystem.java:217)
  at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
  at org.apache.hadoop.hdfs.DistributedFileSystem.getFileBlockLocations(DistributedFileSystem.java:228)
  at org.apache.hadoop.hdfs.DistributedFileSystem.getFileBlockLocations(DistributedFileSystem.java:209)
  at org.apache.spark.sql.execution.datasources.ListingFileCatalog$$anonfun$1$$anonfun$apply$2.apply(ListingFileCatalog.scala:104)
  at org.apache.spark.sql.execution.datasources.ListingFileCatalog$$anonfun$1$$anonfun$apply$2.apply(ListingFileCatalog.scala:92)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
  at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
  at org.apache.spark.sql.execution.datasources.ListingFileCatalog$$anonfun$1.apply(ListingFileCatalog.scala:92)
  at org.apache.spark.sql.execution.datasources.ListingFileCatalog$$anonfun$1.apply(ListingFileCatalog.scala:80)
  at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
  at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
  at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
  at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
  at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
  at scala.collection.AbstractTraversable.flatMap(Traversable.scala:104)
  at org.apache.spark.sql.execution.datasources.ListingFileCatalog.listLeafFiles(ListingFileCatalog.scala:80)
  at org.apache.spark.sql.execution.datasources.ListingFileCatalog.refresh(ListingFileCatalog.scala:69)
  at org.apache.spark.sql.execution.datasources.ListingFileCatalog.<init>(ListingFileCatalog.scala:50)
  at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:307)
  at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:160)
  at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:419)
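
For completeness, two variations of the read that might sidestep the directory listing (sketches only; I have not verified them against this error):

  // 1) Glob one level down so only the partition directories are listed:
  val b = sqlContext.read.parquet("hdfs://Master:port/root/file/*")

  // 2) The same glob, keeping partition discovery rooted at the table
  //    directory via the basePath option:
  val c = sqlContext.read
    .option("basePath", "hdfs://Master:port/root/file")
    .parquet("hdfs://Master:port/root/file/*")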

Can anyone help me resolve this?

-- 
Pei Sun