Posted to dev@hudi.apache.org by Gurudatt Kulkarni <gu...@gmail.com> on 2019/11/18 07:08:49 UTC

Issue while querying Hive table after updates

Hi All,

I am facing an issue where an aggregate query fails on partitions that
have more than one parquet file, but if I run a select * query, it displays
all results properly. Here's the stack trace of the error that I am
getting. I checked the HDFS directory for the particular file and it exists
in the directory, but somehow Hive is not able to find it after the update.
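
For illustration, the failing pattern is roughly the following (a sketch only;
datestr is the Hive partition column from the hive_sync configs shared later
in this thread, and the exact aggregate does not matter):

  -- works: returns all rows
  SELECT * FROM default.tbl_test;

  -- fails with the IOException below on any partition that holds more than one parquet file
  SELECT datestr, COUNT(*) FROM default.tbl_test GROUP BY datestr;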

java.io.IOException: cannot find dir = hdfs://hadoop-host:8020/tmp/hoodie/tables/tbl_test/2019/11/09/6f864d6d-40a6-4eb7-9ee8-6133a16aa9e5-0_59-22-256_20191115185447.parquet in pathToPartitionInfo: [hdfs:/tmp/hoodie/tables/tbl_test/2019/11/09]
  at org.apache.hadoop.hive.ql.io.HiveFileFormatUtils.getPartitionDescFromPathRecursively(HiveFileFormatUtils.java:368)
  at org.apache.hadoop.hive.ql.io.HiveFileFormatUtils.getPartitionDescFromPathRecursively(HiveFileFormatUtils.java:330)
  at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat$CombineHiveInputSplit.<init>(CombineHiveInputFormat.java:166)
  at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getCombineSplits(CombineHiveInputFormat.java:460)
  at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getSplits(CombineHiveInputFormat.java:547)
  at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:202)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
  at scala.Option.getOrElse(Option.scala:120)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
  at scala.Option.getOrElse(Option.scala:120)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
  at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
  at org.apache.spark.rdd.ShuffledRDD.getDependencies(ShuffledRDD.scala:80)
  at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:226)
  at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:224)
  at scala.Option.getOrElse(Option.scala:120)
  at org.apache.spark.rdd.RDD.dependencies(RDD.scala:224)
  at org.apache.spark.scheduler.DAGScheduler.visit$1(DAGScheduler.scala:386)
  at org.apache.spark.scheduler.DAGScheduler.getParentStages(DAGScheduler.scala:398)
  at org.apache.spark.scheduler.DAGScheduler.getParentStagesAndId(DAGScheduler.scala:299)
  at org.apache.spark.scheduler.DAGScheduler.newResultStage(DAGScheduler.scala:334)
  at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:837)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1635)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1627)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1616)
  at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
19/11/18 12:19:16 ERROR impl.RemoteSparkJobStatus: Failed to run job 2
java.util.concurrent.ExecutionException: Exception thrown by job
  at org.apache.spark.JavaFutureActionWrapper.getImpl(FutureAction.scala:311)
  at org.apache.spark.JavaFutureActionWrapper.get(FutureAction.scala:316)
  at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobStatus$GetJobInfoJob.call(RemoteSparkJobStatus.java:195)
  at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobStatus$GetJobInfoJob.call(RemoteSparkJobStatus.java:171)
  at org.apache.hive.spark.client.RemoteDriver$DriverProtocol.handle(RemoteDriver.java:327)
  at sun.reflect.GeneratedMethodAccessor1.invoke(Unknown Source)
  at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
  at java.lang.reflect.Method.invoke(Method.java:498)
  at org.apache.hive.spark.client.rpc.RpcDispatcher.handleCall(RpcDispatcher.java:120)
  at org.apache.hive.spark.client.rpc.RpcDispatcher.channelRead0(RpcDispatcher.java:79)
  at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105)
  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
  at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
  at io.netty.channel.ChannelInboundHandlerAdapter.channelRead(ChannelInboundHandlerAdapter.java:86)
  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
  at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
  at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:244)
  at io.netty.handler.codec.ByteToMessageCodec.channelRead(ByteToMessageCodec.java:103)
  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
  at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
  at io.netty.channel.ChannelInboundHandlerAdapter.channelRead(ChannelInboundHandlerAdapter.java:86)
  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
  at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
  at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:846)
  at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
  at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:511)
  at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
  at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
  at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
  at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
  at java.lang.Thread.run(Thread.java:748)

Regards,
Gurudatt

Re: Issue while querying Hive table after updates

Posted by Gurudatt Kulkarni <gu...@gmail.com>.
Hi Balaji,

I just saw a PR [1] that addresses the same error. I will check once it is
merged.

[1] https://github.com/apache/incubator-hudi/pull/1001

Regards,
Gurudatt

On Thu, Nov 21, 2019 at 5:16 AM Balaji Varadarajan <vb...@apache.org>
wrote:


Re: Issue while querying Hive table after updates

Posted by Balaji Varadarajan <vb...@apache.org>.
Hi Gurudatt,

From the stack trace, it looks like you are using CombineHiveInputFormat as
the default input format for the Hive session. If your intention is to use a
combined input format, can you instead try setting it (set
hive.input.format=) to
org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat?

https://github.com/apache/incubator-hudi/blob/master/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java
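
For example, something along these lines in the Hive session (a sketch only;
default.tbl_test and datestr are the table and partition column from the
hive_sync configs shared earlier in this thread):

  SET hive.input.format=org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat;
  -- then re-run the failing aggregate
  SELECT datestr, COUNT(*) FROM default.tbl_test GROUP BY datestr;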

Thanks,
Balaji.V


On Mon, Nov 18, 2019 at 11:15 PM Gurudatt Kulkarni <gu...@gmail.com>
wrote:


Re: Issue while querying Hive table after updates

Posted by Gurudatt Kulkarni <gu...@gmail.com>.
Hi Bhavani Sudha,

>> Are you using spark sql or Hive query?
    This happens on all of them: Hive, Hive on Spark, and Spark SQL.

>> the table type ,
   This happens for both copy-on-write and merge-on-read tables.

>> configs,

hoodie.upsert.shuffle.parallelism=2
hoodie.insert.shuffle.parallelism=2
hoodie.bulkinsert.shuffle.parallelism=2

# Key fields, for kafka example
hoodie.datasource.write.storage.type=MERGE_ON_READ
hoodie.datasource.write.recordkey.field=record_key
hoodie.datasource.write.partitionpath.field=timestamp
hoodie.deltastreamer.keygen.timebased.timestamp.type=EPOCHMILLISECONDS
hoodie.datasource.write.keygenerator.class=org.apache.hudi.utilities.keygen.TimestampBasedKeyGenerator
hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd

# schema provider configs
hoodie.deltastreamer.schemaprovider.registry.url=http://schema-registry:8082/subjects/tbl_test-value/versions/latest

hoodie.datasource.hive_sync.database=default
hoodie.datasource.hive_sync.table=tbl_test
hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hive-server:10000
hoodie.datasource.hive_sync.partition_fields=datestr
hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor

#Kafka props
hoodie.deltastreamer.source.kafka.topic=tbl_test
metadata.broker.list=kafka-1:9092,kafka-2:9092
auto.offset.reset=smallest
schema.registry.url=http://schema-registry:8082

Spark Submit Command

spark-submit --master yarn --deploy-mode cluster --name "Test Job Hoodie" \
  --executor-memory 8g --driver-memory 2g \
  --jars hdfs:///tmp/hudi/hudi-hive-bundle-0.5.1-SNAPSHOT.jar,hdfs:///tmp/hudi/hudi-spark-bundle-0.5.1-SNAPSHOT.jar,hdfs:///tmp/hudi/hudi-hadoop-mr-bundle-0.5.1-SNAPSHOT.jar \
  --files hdfs:///tmp/hudi/hive-site.xml \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  hdfs:///tmp/hudi/hudi-utilities-bundle-0.5.1-SNAPSHOT.jar \
  --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \
  --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \
  --source-ordering-field timestamp \
  --target-base-path hdfs:///tmp/hoodie/tables/tbl_test \
  --filter-dupes --target-table tbl_test --storage-type MERGE_ON_READ \
  --props hdfs:///tmp/hudi/config.properties --enable-hive-sync

Regards,
Gurudatt



On Tue, Nov 19, 2019 at 1:11 AM Bhavani Sudha <bh...@gmail.com>
wrote:


Re: Issue while querying Hive table after updates

Posted by Bhavani Sudha <bh...@gmail.com>.
Hi Gurudatt,

Can you share more context on the table and the query? Are you using Spark
SQL or a Hive query? What is the table type, etc.? Also, if you can provide a
small snippet to reproduce with the configs that you used, it would be useful
for debugging.

Thanks,
Sudha
