You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@dolphinscheduler.apache.org by zh...@apache.org on 2022/12/28 06:21:53 UTC
[dolphinscheduler] 08/11: Solve the deadlock problem caused by queuing (#13191)
This is an automated email from the ASF dual-hosted git repository.
zhongjiajie pushed a commit to branch 3.0.4-prepare
in repository https://gitbox.apache.org/repos/asf/dolphinscheduler.git
commit bc1cf25f4d705efd5fe500a9b27691b1b83fb771
Author: sssqhai <35...@users.noreply.github.com>
AuthorDate: Fri Dec 16 19:55:02 2022 +0800
Solve the deadlock problem caused by queuing (#13191)
* Solve the deadlock problem caused by queuing
* Solve the deadlock problem caused by queuing
* Solve the deadlock problem caused by queuing
* Solve the deadlock problem caused by queuing, move the event to the tail by throwing an exception
Co-authored-by: wfs <wa...@cdqcp.cpm>
(cherry picked from commit 7a0a2c2a46a0224d0c9fb1f649c1f901d11e814b)
---
...ateHandler.java => StateEventHandleFailure.java} | 21 +++++++++------------
.../server/master/event/StateEventHandler.java | 5 +++--
.../master/event/TaskWaitTaskGroupStateHandler.java | 15 +++++++++++++--
.../master/runner/WorkflowExecuteRunnable.java | 8 ++++++++
4 files changed, 33 insertions(+), 16 deletions(-)
diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandleFailure.java
similarity index 59%
copy from dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java
copy to dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandleFailure.java
index 9a3c59a949..5e757c7858 100644
--- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java
+++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandleFailure.java
@@ -17,20 +17,17 @@
package org.apache.dolphinscheduler.server.master.event;
-import org.apache.dolphinscheduler.common.enums.StateEventType;
-import org.apache.dolphinscheduler.server.master.runner.WorkflowExecuteRunnable;
-
-import com.google.auto.service.AutoService;
+/**
+ * This exception represents a recoverable failure; when we get this exception,
+ * we will move the event to the tail of the queue.
+ */
+public class StateEventHandleFailure extends Exception {
-@AutoService(StateEventHandler.class)
-public class TaskWaitTaskGroupStateHandler implements StateEventHandler {
- @Override
- public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent) {
- return workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent);
+ public StateEventHandleFailure(String message) {
+ super(message);
}
- @Override
- public StateEventType getEventType() {
- return StateEventType.WAIT_TASK_GROUP;
+ public StateEventHandleFailure(String message, Throwable throwable) {
+ super(message, throwable);
}
}
diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java
index 00808b2e29..377ea71f62 100644
--- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java
+++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java
@@ -28,9 +28,10 @@ public interface StateEventHandler {
* @param stateEvent given state event.
* @throws StateEventHandleException this exception means it can be recovered.
* @throws StateEventHandleError this exception means it cannot be recovered, so the event need to drop.
+ * @throws StateEventHandleFailure this exception means it can be recovered, the event will be moved to the tail of the queue.
*/
- boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent)
- throws StateEventHandleException, StateEventHandleError;
+ boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
+ StateEvent stateEvent) throws StateEventHandleException, StateEventHandleError, StateEventHandleFailure;
StateEventType getEventType();
}
diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java
index 9a3c59a949..bb0c7b8b70 100644
--- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java
+++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java
@@ -20,13 +20,24 @@ package org.apache.dolphinscheduler.server.master.event;
import org.apache.dolphinscheduler.common.enums.StateEventType;
import org.apache.dolphinscheduler.server.master.runner.WorkflowExecuteRunnable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import com.google.auto.service.AutoService;
@AutoService(StateEventHandler.class)
public class TaskWaitTaskGroupStateHandler implements StateEventHandler {
+
+ private static final Logger logger = LoggerFactory.getLogger(TaskWaitTaskGroupStateHandler.class);
+
@Override
- public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent) {
- return workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent);
+ public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
+ StateEvent stateEvent) throws StateEventHandleFailure {
+ logger.info("Handle task instance wait task group event, taskInstanceId: {}", stateEvent.getTaskInstanceId());
+ if (!workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent)) {
+ throw new StateEventHandleFailure("Task state event handle failed due to robing taskGroup resource failed");
+ }
+ return true;
}
@Override
diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java
index 5c0fec018d..fda109e1f4 100644
--- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java
+++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java
@@ -70,6 +70,7 @@ import org.apache.dolphinscheduler.server.master.dispatch.executor.NettyExecutor
import org.apache.dolphinscheduler.server.master.event.StateEvent;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleError;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleException;
+import org.apache.dolphinscheduler.server.master.event.StateEventHandleFailure;
import org.apache.dolphinscheduler.server.master.event.StateEventHandler;
import org.apache.dolphinscheduler.server.master.event.StateEventHandlerManager;
import org.apache.dolphinscheduler.server.master.metrics.TaskMetrics;
@@ -279,6 +280,13 @@ public class WorkflowExecuteRunnable implements Callable<WorkflowSubmitStatue> {
stateEvent,
stateEventHandleException);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
+ } catch (StateEventHandleFailure stateEventHandleFailure) {
+ logger.error("State event handle failed, will move event to the tail: {}",
+ stateEvent,
+ stateEventHandleFailure);
+ this.stateEvents.remove(stateEvent);
+ this.stateEvents.offer(stateEvent);
+ ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (Exception e) {
// we catch the exception here, since if the state event handle failed, the state event will still keep in the stateEvents queue.
logger.error("State event handle error, get a unknown exception, will retry this event: {}",