You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2021/01/26 03:39:04 UTC
[GitHub] [hudi] lshg opened a new issue #2490: spark read hudi data from hive
lshg opened a new issue #2490:
URL: https://github.com/apache/hudi/issues/2490
package com.gjr.recommend
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
object DWDTenderLog {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[2]").set("spark.executor.memory", "512m")
val sc: SparkContext = new SparkContext(conf)
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
val hc = new HiveContext(sc)
hc.setConf("spark.sql.crossJoin.enabled","true");
val tenderLog: Array[Row] = hc.sql(
"""
| SELECT
|projectid,
|provinceid,
|typeId,
|tender_tag
|FROM
|(
|SELECT
|projectid,
|provinceid,
|typeId,
|antistop
|FROM
|app.dwd_recommend_tender_ds
|WHERE
|createTime >= 1608280608479 AND createTime <= 1611628847000
|AND antistop != ''
|GROUP BY
|projectid,
|provinceid,
|typeId,
|antistop
|) AS a lateral VIEW explode (split(antistop, "#")) table_tmp AS tender_tag
""".stripMargin).collect()
println(tenderLog.toBuffer)
sc.stop()
}
}
0 [main] INFO org.apache.spark.SparkContext - Running Spark version 2.4.7
346 [main] INFO org.apache.spark.SparkContext - Submitted application: DWDTenderLog$
390 [main] INFO org.apache.spark.SecurityManager - Changing view acls to: lsh
390 [main] INFO org.apache.spark.SecurityManager - Changing modify acls to: lsh
390 [main] INFO org.apache.spark.SecurityManager - Changing view acls groups to:
390 [main] INFO org.apache.spark.SecurityManager - Changing modify acls groups to:
391 [main] INFO org.apache.spark.SecurityManager - SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(lsh); groups with view permissions: Set(); users with modify permissions: Set(lsh); groups with modify permissions: Set()
2533 [main] INFO org.apache.spark.util.Utils - Successfully started service 'sparkDriver' on port 54347.
2575 [main] INFO org.apache.spark.SparkEnv - Registering MapOutputTracker
2588 [main] INFO org.apache.spark.SparkEnv - Registering BlockManagerMaster
2589 [main] INFO org.apache.spark.storage.BlockManagerMasterEndpoint - Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
2590 [main] INFO org.apache.spark.storage.BlockManagerMasterEndpoint - BlockManagerMasterEndpoint up
2596 [main] INFO org.apache.spark.storage.DiskBlockManager - Created local directory at C:\Users\lsh\AppData\Local\Temp\blockmgr-d134fb11-0552-4b4b-8f20-ea7e04fd086d
2609 [main] INFO org.apache.spark.storage.memory.MemoryStore - MemoryStore started with capacity 1979.1 MB
2619 [main] INFO org.apache.spark.SparkEnv - Registering OutputCommitCoordinator
2675 [main] INFO org.spark_project.jetty.util.log - Logging initialized @23630ms
2720 [main] INFO org.spark_project.jetty.server.Server - jetty-9.3.z-SNAPSHOT, build timestamp: 2019-02-16T00:53:49+08:00, git hash: eb70b240169fcf1abbd86af36482d1c49826fa0b
2731 [main] INFO org.spark_project.jetty.server.Server - Started @23687ms
2747 [main] INFO org.spark_project.jetty.server.AbstractConnector - Started ServerConnector@4d63b624{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
2747 [main] INFO org.apache.spark.util.Utils - Successfully started service 'SparkUI' on port 4040.
2767 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@27eb3298{/jobs,null,AVAILABLE,@Spark}
2768 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@1b58ff9e{/jobs/json,null,AVAILABLE,@Spark}
2768 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@2f66e802{/jobs/job,null,AVAILABLE,@Spark}
2769 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@76318a7d{/jobs/job/json,null,AVAILABLE,@Spark}
2770 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@2a492f2a{/stages,null,AVAILABLE,@Spark}
2770 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@3277e499{/stages/json,null,AVAILABLE,@Spark}
2771 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@585811a4{/stages/stage,null,AVAILABLE,@Spark}
2772 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@4c4d362a{/stages/stage/json,null,AVAILABLE,@Spark}
2773 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@5400db36{/stages/pool,null,AVAILABLE,@Spark}
2773 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@76b74e9c{/stages/pool/json,null,AVAILABLE,@Spark}
2774 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@2d72f75e{/storage,null,AVAILABLE,@Spark}
2775 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@8ab78bc{/storage/json,null,AVAILABLE,@Spark}
2776 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@5aa0dbf4{/storage/rdd,null,AVAILABLE,@Spark}
2776 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@16afbd92{/storage/rdd/json,null,AVAILABLE,@Spark}
2777 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@2c5d601e{/environment,null,AVAILABLE,@Spark}
2777 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@7fe083b1{/environment/json,null,AVAILABLE,@Spark}
2777 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@23c388c2{/executors,null,AVAILABLE,@Spark}
2778 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@486be205{/executors/json,null,AVAILABLE,@Spark}
2778 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@f713686{/executors/threadDump,null,AVAILABLE,@Spark}
2778 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@74f7d1d2{/executors/threadDump/json,null,AVAILABLE,@Spark}
2783 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@4b4dd216{/static,null,AVAILABLE,@Spark}
2784 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@54afd745{/,null,AVAILABLE,@Spark}
2786 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@677dbd89{/api,null,AVAILABLE,@Spark}
2787 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@2ca47471{/jobs/job/kill,null,AVAILABLE,@Spark}
2788 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@5a021cb9{/stages/stage/kill,null,AVAILABLE,@Spark}
2790 [main] INFO org.apache.spark.ui.SparkUI - Bound SparkUI to 0.0.0.0, and started at http://DESKTOP-E6TA5L3:4040
2873 [main] INFO org.apache.spark.executor.Executor - Starting executor ID driver on host localhost
2921 [main] INFO org.apache.spark.util.Utils - Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 54366.
2922 [main] INFO org.apache.spark.network.netty.NettyBlockTransferService - Server created on DESKTOP-E6TA5L3:54366
2923 [main] INFO org.apache.spark.storage.BlockManager - Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
2939 [main] INFO org.apache.spark.storage.BlockManagerMaster - Registering BlockManager BlockManagerId(driver, DESKTOP-E6TA5L3, 54366, None)
2942 [dispatcher-event-loop-0] INFO org.apache.spark.storage.BlockManagerMasterEndpoint - Registering block manager DESKTOP-E6TA5L3:54366 with 1979.1 MB RAM, BlockManagerId(driver, DESKTOP-E6TA5L3, 54366, None)
2945 [main] INFO org.apache.spark.storage.BlockManagerMaster - Registered BlockManager BlockManagerId(driver, DESKTOP-E6TA5L3, 54366, None)
2945 [main] INFO org.apache.spark.storage.BlockManager - Initialized BlockManager: BlockManagerId(driver, DESKTOP-E6TA5L3, 54366, None)
3086 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@466d49f0{/metrics/json,null,AVAILABLE,@Spark}
3109 [main] WARN org.apache.spark.SparkContext - Using an existing SparkContext; some configuration may not take effect.
3244 [main] INFO org.apache.spark.sql.internal.SharedState - loading hive config file: file:/D:/GJR_PROJECT/tt/tenderRecommend/target/classes/hive-site.xml
3279 [main] INFO org.apache.spark.sql.internal.SharedState - spark.sql.warehouse.dir is not set, but hive.metastore.warehouse.dir is set. Setting spark.sql.warehouse.dir to the value of hive.metastore.warehouse.dir ('/user/hive/warehouse').
3279 [main] INFO org.apache.spark.sql.internal.SharedState - Warehouse path is '/user/hive/warehouse'.
3285 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@796065aa{/SQL,null,AVAILABLE,@Spark}
3286 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@28a6301f{/SQL/json,null,AVAILABLE,@Spark}
3286 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@1436a7ab{/SQL/execution,null,AVAILABLE,@Spark}
3286 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@3b7b05a8{/SQL/execution/json,null,AVAILABLE,@Spark}
3287 [main] INFO org.spark_project.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@336365bc{/static/sql,null,AVAILABLE,@Spark}
3707 [main] INFO org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef - Registered StateStoreCoordinator endpoint
4072 [main] INFO org.apache.spark.sql.hive.HiveUtils - Initializing HiveMetastoreConnection version 1.2.1 using Spark classes.
4479 [main] WARN org.apache.hadoop.hive.conf.HiveConf - HiveConf of name hive.server2.webui.port does not exist
4479 [main] WARN org.apache.hadoop.hive.conf.HiveConf - HiveConf of name hive.server2.webui.host does not exist
4584 [main] INFO hive.metastore - Trying to connect to metastore with URI thrift://t1:9083
4759 [main] INFO hive.metastore - Connected to metastore.
8437 [main] INFO org.apache.hadoop.hive.ql.session.SessionState - Created local directory: C:/Users/lsh/AppData/Local/Temp/99f90c54-0932-45d6-924a-b4cdd357db61_resources
8477 [main] INFO org.apache.hadoop.hive.ql.session.SessionState - Created HDFS directory: /user/hive/tmp/lsh/99f90c54-0932-45d6-924a-b4cdd357db61
8496 [main] INFO org.apache.hadoop.hive.ql.session.SessionState - Created local directory: C:/Users/lsh/AppData/Local/Temp/lsh/99f90c54-0932-45d6-924a-b4cdd357db61
8534 [main] INFO org.apache.hadoop.hive.ql.session.SessionState - Created HDFS directory: /user/hive/tmp/lsh/99f90c54-0932-45d6-924a-b4cdd357db61/_tmp_space.db
8560 [main] INFO org.apache.spark.sql.hive.client.HiveClientImpl - Warehouse location for Hive client (version 1.2.2) is /user/hive/warehouse
10106 [main] INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy - Pruning directories with:
10108 [main] INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy - Post-Scan Filters: isnotnull(createTime#25L),isnotnull(antistop#8),(createTime#25L >= 1608280608479),(createTime#25L <= 1611628847000),NOT (antistop#8 = )
10110 [main] INFO org.apache.spark.sql.execution.datasources.FileSourceStrategy - Output Data Schema: struct<projectId: int, antistop: string, provinceId: int, typeId: int, createTime: bigint ... 3 more fields>
10118 [main] INFO org.apache.spark.sql.execution.FileSourceScanExec - Pushed Filters: IsNotNull(createTime),IsNotNull(antistop),GreaterThanOrEqual(createTime,1608280608479),LessThanOrEqual(createTime,1611628847000),Not(EqualTo(antistop,))
10165 [main] WARN org.apache.spark.util.Utils - Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.
11262 [main] INFO org.apache.spark.sql.execution.datasources.PrunedInMemoryFileIndex - It took 738 ms to list leaf files for 10 paths.
11568 [Spark Context Cleaner] INFO org.apache.spark.ContextCleaner - Cleaned accumulator 0
11593 [main] INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 223.244 ms
11751 [main] INFO org.apache.spark.sql.execution.aggregate.HashAggregateExec - spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but current version of codegened fast hashmap does not support this aggregate.
11842 [main] INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 52.0291 ms
11843 [main] INFO org.apache.spark.sql.execution.aggregate.HashAggregateExec - spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but current version of codegened fast hashmap does not support this aggregate.
11906 [main] INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 38.8988 ms
11968 [main] INFO org.apache.spark.storage.memory.MemoryStore - Block broadcast_0 stored as values in memory (estimated size 242.1 KB, free 1978.9 MB)
12016 [main] INFO org.apache.spark.storage.memory.MemoryStore - Block broadcast_0_piece0 stored as bytes in memory (estimated size 23.6 KB, free 1978.8 MB)
12018 [dispatcher-event-loop-1] INFO org.apache.spark.storage.BlockManagerInfo - Added broadcast_0_piece0 in memory on DESKTOP-E6TA5L3:54366 (size: 23.6 KB, free: 1979.1 MB)
12020 [main] INFO org.apache.spark.SparkContext - Created broadcast 0 from collect at DWDTenderLog.scala:54
12024 [main] INFO org.apache.spark.sql.execution.FileSourceScanExec - Planning scan with bin packing, max size: 49910044 bytes, open cost is considered as scanning 4194304 bytes.
12175 [main] INFO org.apache.spark.SparkContext - Starting job: collect at DWDTenderLog.scala:54
12189 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Registering RDD 2 (collect at DWDTenderLog.scala:54) as input to shuffle 0
12191 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Got job 0 (collect at DWDTenderLog.scala:54) with 200 output partitions
12191 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Final stage: ResultStage 1 (collect at DWDTenderLog.scala:54)
12192 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Parents of final stage: List(ShuffleMapStage 0)
12194 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Missing parents: List(ShuffleMapStage 0)
12198 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Submitting ShuffleMapStage 0 (MapPartitionsRDD[2] at collect at DWDTenderLog.scala:54), which has no missing parents
12214 [dag-scheduler-event-loop] INFO org.apache.spark.storage.memory.MemoryStore - Block broadcast_1 stored as values in memory (estimated size 25.4 KB, free 1978.8 MB)
12217 [dag-scheduler-event-loop] INFO org.apache.spark.storage.memory.MemoryStore - Block broadcast_1_piece0 stored as bytes in memory (estimated size 10.8 KB, free 1978.8 MB)
12218 [dispatcher-event-loop-0] INFO org.apache.spark.storage.BlockManagerInfo - Added broadcast_1_piece0 in memory on DESKTOP-E6TA5L3:54366 (size: 10.8 KB, free: 1979.1 MB)
12218 [dag-scheduler-event-loop] INFO org.apache.spark.SparkContext - Created broadcast 1 from broadcast at DAGScheduler.scala:1184
12227 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Submitting 2 missing tasks from ShuffleMapStage 0 (MapPartitionsRDD[2] at collect at DWDTenderLog.scala:54) (first 15 tasks are for partitions Vector(0, 1))
12228 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Adding task set 0.0 with 2 tasks
12253 [dispatcher-event-loop-1] INFO org.apache.spark.scheduler.TaskSetManager - Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, ANY, 10221 bytes)
12254 [dispatcher-event-loop-1] INFO org.apache.spark.scheduler.TaskSetManager - Starting task 1.0 in stage 0.0 (TID 1, localhost, executor driver, partition 1, ANY, 10288 bytes)
12260 [Executor task launch worker for task 1] INFO org.apache.spark.executor.Executor - Running task 1.0 in stage 0.0 (TID 1)
12260 [Executor task launch worker for task 0] INFO org.apache.spark.executor.Executor - Running task 0.0 in stage 0.0 (TID 0)
12341 [Executor task launch worker for task 1] INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 20.998 ms
12361 [Executor task launch worker for task 0] INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 7.7712 ms
12375 [Executor task launch worker for task 0] INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 10.5504 ms
12388 [Executor task launch worker for task 1] INFO org.apache.spark.sql.execution.datasources.FileScanRDD - Reading File path: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3536800-8517888_20210126111108.parquet, range: 0-791226, partition values: [2021-01-26]
12388 [Executor task launch worker for task 0] INFO org.apache.spark.sql.execution.datasources.FileScanRDD - Reading File path: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/23/bf784cc0-1918-4ddd-8145-9de3f840c558-0_0-1553263-3646025_20210124000006.parquet, range: 0-1673011, partition values: [2021-01-23]
14219 [Executor task launch worker for task 1] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
14219 [Executor task launch worker for task 0] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
15102 [Executor task launch worker for task 0] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
15262 [Executor task launch worker for task 0] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
15594 [Executor task launch worker for task 0] INFO org.apache.hadoop.io.compress.zlib.ZlibFactory - Successfully loaded & initialized native-zlib library
15594 [Executor task launch worker for task 0] INFO org.apache.hadoop.io.compress.CodecPool - Got brand-new decompressor [.gz]
16218 [Executor task launch worker for task 1] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
16439 [Executor task launch worker for task 1] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
16943 [Executor task launch worker for task 1] INFO org.apache.hadoop.io.compress.CodecPool - Got brand-new decompressor [.gz]
18239 [Executor task launch worker for task 0] INFO org.apache.spark.sql.execution.datasources.FileScanRDD - Reading File path: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/24/7a98c96a-fdeb-47a3-98a3-e2c4b4d6ec81-0_0-2634501-6263163_20210125000002.parquet, range: 0-1538381, partition values: [2021-01-24]
18679 [Executor task launch worker for task 1] INFO org.apache.spark.sql.execution.datasources.FileScanRDD - Reading File path: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3536707-8517665_20210126111054.parquet, range: 0-791122, partition values: [2021-01-26]
20217 [Executor task launch worker for task 0] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
20400 [Executor task launch worker for task 1] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
21685 [Executor task launch worker for task 0] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
21988 [Executor task launch worker for task 0] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
22055 [Executor task launch worker for task 1] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
22148 [Executor task launch worker for task 1] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
24301 [Executor task launch worker for task 1] INFO org.apache.spark.sql.execution.datasources.FileScanRDD - Reading File path: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3536622-8517450_20210126111049.parquet, range: 0-791045, partition values: [2021-01-26]
24639 [Executor task launch worker for task 0] INFO org.apache.spark.sql.execution.datasources.FileScanRDD - Reading File path: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/25/3baf7087-95d2-4836-9a4c-3f5b4ced568c-0_0-3188990-7636378_20210125235941.parquet, range: 0-994567, partition values: [2021-01-25]
25837 [Executor task launch worker for task 0] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
26843 [Executor task launch worker for task 1] ERROR org.apache.spark.executor.Executor - Exception in task 1.0 in stage 0.0 (TID 1)
java.io.FileNotFoundException: File does not exist: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3536622-8517450_20210126111049.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.scan_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
26859 [task-result-getter-0] WARN org.apache.spark.scheduler.TaskSetManager - Lost task 1.0 in stage 0.0 (TID 1, localhost, executor driver): java.io.FileNotFoundException: File does not exist: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3536622-8517450_20210126111049.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.scan_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
26861 [task-result-getter-0] ERROR org.apache.spark.scheduler.TaskSetManager - Task 1 in stage 0.0 failed 1 times; aborting job
26866 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Cancelling stage 0
26867 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Killing all running tasks in stage 0: Stage cancelled
26869 [dispatcher-event-loop-0] INFO org.apache.spark.executor.Executor - Executor is trying to kill task 0.0 in stage 0.0 (TID 0), reason: Stage cancelled
26869 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Stage 0 was cancelled
26870 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - ShuffleMapStage 0 (collect at DWDTenderLog.scala:54) failed in 14.660 s due to Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost, executor driver): java.io.FileNotFoundException: File does not exist: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3536622-8517450_20210126111049.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.scan_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
26873 [main] INFO org.apache.spark.scheduler.DAGScheduler - Job 0 failed: collect at DWDTenderLog.scala:54, took 14.697048 s
26879 [Thread-1] INFO org.apache.spark.SparkContext - Invoking stop() from shutdown hook
26885 [Thread-1] INFO org.spark_project.jetty.server.AbstractConnector - Stopped Spark@4d63b624{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
26887 [Thread-1] INFO org.apache.spark.ui.SparkUI - Stopped Spark web UI at http://DESKTOP-E6TA5L3:4040
26896 [dispatcher-event-loop-1] INFO org.apache.spark.MapOutputTrackerMasterEndpoint - MapOutputTrackerMasterEndpoint stopped!
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost, executor driver): java.io.FileNotFoundException: File does not exist: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3536622-8517450_20210126111049.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.scan_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2788)
at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2788)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2788)
at com.gjr.recommend.DWDTenderLog$.main(DWDTenderLog.scala:54)
at com.gjr.recommend.DWDTenderLog.main(DWDTenderLog.scala)
Caused by: java.io.FileNotFoundException: File does not exist: hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3536622-8517450_20210126111049.parquet
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.scan_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
26908 [Thread-1] INFO org.apache.spark.storage.memory.MemoryStore - MemoryStore cleared
26908 [Thread-1] INFO org.apache.spark.storage.BlockManager - BlockManager stopped
26910 [Thread-1] INFO org.apache.spark.storage.BlockManagerMaster - BlockManagerMaster stopped
26911 [dispatcher-event-loop-0] INFO org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint - OutputCommitCoordinator stopped!
26915 [Thread-1] INFO org.apache.spark.SparkContext - Successfully stopped SparkContext
26916 [Thread-1] INFO org.apache.spark.util.ShutdownHookManager - Shutdown hook called
26916 [Thread-1] INFO org.apache.spark.util.ShutdownHookManager - Deleting directory C:\Users\lsh\AppData\Local\Temp\spark-b9c30c48-0f2d-46ff-adb2-44e702a4dc5a
26919 [Executor task launch worker for task 0] INFO org.apache.parquet.filter2.compat.FilterCompat - Filtering using predicate: and(and(and(and(noteq(createTime, null), noteq(antistop, null)), gteq(createTime, 1608280608479)), lteq(createTime, 1611628847000)), noteq(antistop, Binary{""}))
Process finished with exit code 1
hdfs data~~~~~~~~~~~~~~~~~~~~~~~~~
[root@t1 ~]# hdfs dfs -ls hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/modules/hadoop-2.8.5/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/local/software/apache-tez-0.9.2-bin/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Found 11 items
-rw-r--r-- 3 root supergroup 93 2021-01-26 00:00 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/.hoodie_partition_metadata
-rw-r--r-- 3 root supergroup 781714 2021-01-26 10:53 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3526941-8492659_20210126105334.parquet
-rw-r--r-- 3 root supergroup 781786 2021-01-26 10:53 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3527031-8492879_20210126105340.parquet
-rw-r--r-- 3 root supergroup 781872 2021-01-26 10:53 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3527113-8493091_20210126105354.parquet
-rw-r--r-- 3 root supergroup 781938 2021-01-26 10:54 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3527261-8493515_20210126105409.parquet
-rw-r--r-- 3 root supergroup 782011 2021-01-26 10:54 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3527346-8493730_20210126105414.parquet
-rw-r--r-- 3 root supergroup 782106 2021-01-26 10:54 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3527439-8493953_20210126105420.parquet
-rw-r--r-- 3 root supergroup 782214 2021-01-26 10:54 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3527532-8494176_20210126105434.parquet
-rw-r--r-- 3 root supergroup 782287 2021-01-26 10:54 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3527617-8494391_20210126105444.parquet
-rw-r--r-- 3 root supergroup 782368 2021-01-26 10:54 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3527710-8494614_20210126105454.parquet
-rw-r--r-- 3 root supergroup 782465 2021-01-26 10:55 hdfs://gongjiangren/data/app/dwd_recommend_tender_ds/partitionpath=2021/01/26/a685bd51-614f-48b0-a360-f09f28baae84-0_0-3527795-8494829_20210126105500.parquet
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [hudi] nsivabalan edited a comment on issue #2490: spark read hudi data from hive
Posted by GitBox <gi...@apache.org>.
nsivabalan edited a comment on issue #2490:
URL: https://github.com/apache/hudi/issues/2490#issuecomment-774506409
@lshg : Can you please take a look at this ticket and respond when you can?
Also, few more quick questions as we triage this issue.
- Were you running older version of Hudi and encountered this after upgrade? in other words, older Hudi version you were able to run successfully and with 0.7.0 there is a bug.
- Is this affecting your production? trying to gauge the severity.
- Or you are trying out a POC ? and this is the first time trying out Hudi.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [hudi] nsivabalan commented on issue #2490: spark read hudi data from hive
Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #2490:
URL: https://github.com/apache/hudi/issues/2490#issuecomment-822572667
Closing this out due to inactivity.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [hudi] n3nash commented on issue #2490: spark read hudi data from hive
Posted by GitBox <gi...@apache.org>.
n3nash commented on issue #2490:
URL: https://github.com/apache/hudi/issues/2490#issuecomment-771411738
@Ishg Any update ?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [hudi] bvaradar commented on issue #2490: spark read hudi data from hive
Posted by GitBox <gi...@apache.org>.
bvaradar commented on issue #2490:
URL: https://github.com/apache/hudi/issues/2490#issuecomment-767390704
It could be that there are more than 1 Hudi writer writing to the same dataset. Can you check if this is the case ?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [hudi] nsivabalan edited a comment on issue #2490: spark read hudi data from hive
Posted by GitBox <gi...@apache.org>.
nsivabalan edited a comment on issue #2490:
URL: https://github.com/apache/hudi/issues/2490#issuecomment-774506409
@lshg : Can you please take a look at this ticket and respond when you can?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [hudi] nsivabalan closed issue #2490: spark read hudi data from hive
Posted by GitBox <gi...@apache.org>.
nsivabalan closed issue #2490:
URL: https://github.com/apache/hudi/issues/2490
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [hudi] nsivabalan commented on issue #2490: spark read hudi data from hive
Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #2490:
URL: https://github.com/apache/hudi/issues/2490#issuecomment-774506409
@lshg : Can you please take a look at this ticket and response when you can?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [hudi] n3nash commented on issue #2490: spark read hudi data from hive
Posted by GitBox <gi...@apache.org>.
n3nash commented on issue #2490:
URL: https://github.com/apache/hudi/issues/2490#issuecomment-771411738
@Ishg Any update ?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [hudi] nsivabalan commented on issue #2490: spark read hudi data from hive
Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #2490:
URL: https://github.com/apache/hudi/issues/2490#issuecomment-810437410
Please respond when you get time. Will close it out due to inactivity in some time. But definitely reach out to us if you need any help.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org