You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by "sivabalan narayanan (Jira)" <ji...@apache.org> on 2022/06/02 21:27:00 UTC

[jira] [Commented] (HUDI-4156) AsyncIndexer fails for column stats partition

    [ https://issues.apache.org/jira/browse/HUDI-4156?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17545663#comment-17545663 ] 

sivabalan narayanan commented on HUDI-4156:
-------------------------------------------

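Local fix to unblock myself for now (it derives the file group count from the file slices actually found and comments out the file-group-count validation):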
{code:java}
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
index f5a96fb676..1e67020810 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
@@ -955,7 +955,6 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
     HoodieTableFileSystemView fsView = HoodieTableMetadataUtil.getFileSystemView(metadataMetaClient);
     for (Map.Entry<MetadataPartitionType, HoodieData<HoodieRecord>> entry : partitionRecordsMap.entrySet()) {
       final String partitionName = entry.getKey().getPartitionPath();
-      final int fileGroupCount = entry.getKey().getFileGroupCount();
       HoodieData<HoodieRecord> records = entry.getValue();
 
       List<FileSlice> fileSlices =
@@ -965,9 +964,10 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
         // so if there are no committed file slices, look for inflight slices
         fileSlices = HoodieTableMetadataUtil.getPartitionLatestFileSlicesIncludingInflight(metadataMetaClient, Option.ofNullable(fsView), partitionName);
       }
-      ValidationUtils.checkArgument(fileSlices.size() == fileGroupCount,
+      final int fileGroupCount = fileSlices.size();
+      /*ValidationUtils.checkArgument(fileSlices.size() == fileGroupCount,
           String.format("Invalid number of file groups for partition:%s, found=%d, required=%d",
-              partitionName, fileSlices.size(), fileGroupCount));
+              partitionName, fileSlices.size(), fileGroupCount));*/
 
       List<FileSlice> finalFileSlices = fileSlices;
       HoodieData<HoodieRecord> rddSinglePartitionRecords = records.map(r -> { {code}

> AsyncIndexer fails for column stats partition 
> ----------------------------------------------
>
>                 Key: HUDI-4156
>                 URL: https://issues.apache.org/jira/browse/HUDI-4156
>             Project: Apache Hudi
>          Issue Type: Bug
>          Components: metadata
>            Reporter: sivabalan narayanan
>            Assignee: Sagar Sumit
>            Priority: Major
>             Fix For: 0.11.1
>
>
> Tried to build column stats for a Hudi table with the async indexer and ran into the exception below.
>  
> The configs I had set are:
> {code:java}
> hoodie.metadata.enable=true
> hoodie.metadata.index.async=true
> hoodie.metadata.index.column.stats.enable=true
> hoodie.write.concurrency.mode=optimistic_concurrency_control
> hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.InProcessLockProvider {code}
> Command:
> {code:java}
> ./bin/spark-submit --class org.apache.hudi.utilities.HoodieIndexer /home/hadoop/hudi-utilities-bundle_2.12-0.12.0-SNAPSHOT.jar --props file:///home/hadoop/indexer.properties --mode scheduleandexecute --base-path TBL_PATH --table-name call_center --index-types COLUMN_STATS --parallelism 1 --spark-memory 10g {code}
>  
>  
> {code:java}
> 2022-05-26 00:14:27,936 INFO util.ClusteringUtils: Found 0 files in pending clustering operations
> 2022-05-26 00:14:27,937 INFO client.BaseHoodieClient: Stopping Timeline service !!
> 2022-05-26 00:14:27,937 INFO embedded.EmbeddedTimelineService: Closing Timeline server
> 2022-05-26 00:14:27,937 INFO service.TimelineService: Closing Timeline Service
> 2022-05-26 00:14:27,937 INFO javalin.Javalin: Stopping Javalin ...
> 2022-05-26 00:14:27,945 INFO javalin.Javalin: Javalin has stopped
> 2022-05-26 00:14:27,945 INFO service.TimelineService: Closed Timeline Service
> 2022-05-26 00:14:27,945 INFO embedded.EmbeddedTimelineService: Closed Timeline server
> 2022-05-26 00:14:27,945 INFO transaction.TransactionManager: Transaction manager closed
> 2022-05-26 00:14:27,946 ERROR utilities.UtilHelpers: Indexer failed
> java.lang.IllegalArgumentException: Invalid number of file groups for partition:column_stats, found=2, required=1
> 	at org.apache.hudi.common.util.ValidationUtils.checkArgument(ValidationUtils.java:40)
> 	at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.prepRecords(HoodieBackedTableMetadataWriter.java:968)
> 	at org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter.commit(SparkHoodieBackedTableMetadataWriter.java:132)
> 	at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.initialCommit(HoodieBackedTableMetadataWriter.java:1087)
> 	at org.apache.hudi.metadata.HoodieBackedTableMetadataWriter.buildMetadataPartitions(HoodieBackedTableMetadataWriter.java:858)
> 	at org.apache.hudi.table.action.index.RunIndexActionExecutor.execute(RunIndexActionExecutor.java:140)
> 	at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.index(HoodieSparkCopyOnWriteTable.java:291)
> 	at org.apache.hudi.client.BaseHoodieWriteClient.index(BaseHoodieWriteClient.java:1027)
> 	at org.apache.hudi.utilities.HoodieIndexer.scheduleAndRunIndexing(HoodieIndexer.java:278)
> 	at org.apache.hudi.utilities.HoodieIndexer.lambda$start$1(HoodieIndexer.java:198)
> 	at org.apache.hudi.utilities.UtilHelpers.retry(UtilHelpers.java:541)
> 	at org.apache.hudi.utilities.HoodieIndexer.start(HoodieIndexer.java:185)
> 	at org.apache.hudi.utilities.HoodieIndexer.main(HoodieIndexer.java:154)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> 	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> 	at java.lang.reflect.Method.invoke(Method.java:498)
> 	at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
> 	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:955)
> 	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
> 	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
> 	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
> 	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1043)
> 	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1052)
> 	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
> 2022-05-26 00:14:27,947 ERROR utilities.HoodieIndexer: Indexing with basePath: s3a://sagars-testlake/TPC-DS/1TB/hudi_hand_tuned_may20_1/call_center, tableName: call_center, runningMode: scheduleandexecute failed
> 2022-05-26 00:14:27,954 INFO server.AbstractConnector: Stopped Spark@450794b4{HTTP/1.1, (http/1.1)}{0.0.0.0:8090}
> 2022-05-26 00:14:27,954 INFO ui.SparkUI: Stopped Spark web UI at http://ip-172-31-39-68.us-east-2.compute.internal:8090
> 2022-05-26 00:14:27,964 INFO spark.MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped! {code}
>  
>  



--
This message was sent by Atlassian Jira
(v8.20.7#820007)