You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@submarine.apache.org by pi...@apache.org on 2021/08/07 04:41:12 UTC
[submarine] branch master updated: SUBMARINE-952. add backoffLimit
and Failed status for experiment object
This is an automated email from the ASF dual-hosted git repository.
pingsutw pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/submarine.git
The following commit(s) were added to refs/heads/master by this push:
new d75f4f2 SUBMARINE-952. add backoffLimit and Failed status for experiment object
d75f4f2 is described below
commit d75f4f2b429f96e49042991a3af9f1fff153f740
Author: Brandon Lin <fa...@gmail.com>
AuthorDate: Tue Aug 3 23:24:35 2021 +0800
SUBMARINE-952. add backoffLimit and Failed status for experiment object
### What is this PR for?
As mentioned in the Jira ticket, Submarine currently retries retryable jobs endlessly, even when those jobs never have a chance to succeed. This is an obvious waste of resources, so I added an MLJob property, backoffLimit, to prevent this situation. At the same time, I changed MLJobSpec from an interface into an abstract class so that the property is shared by TFJobSpec and PyTorchJobSpec.
I also fixed a bug so that the correct experiment status is reported in the failure case.
### What type of PR is it?
Improvement
### Todos
N/A
### What is the Jira issue?
https://issues.apache.org/jira/browse/SUBMARINE-952
### How should this be tested?
Modify the test case (https://github.com/apache/submarine/blob/master/submarine-test/test-e2e/src/test/java/org/apache/submarine/integration/experimentIT.java#L90) from {1024, 1024} to {512, 512}.
The experiment will then hit an OOM failure, and its status will change to Failed after it has been retried 3 times.
### Screenshots (if appropriate)
<img width="1380" alt="截圖 2021-08-01 下午5 03 43" src="https://user-images.githubusercontent.com/5687317/128044592-e2cee95c-2ee9-4702-88ff-d41950e003ec.png">
<img width="1394" alt="截圖 2021-08-03 下午11 10 39" src="https://user-images.githubusercontent.com/5687317/128044618-454afdb5-c1b8-4395-a75e-f470a7c41625.png">
### Questions:
* Do the license files need updating? No
* Are there breaking changes for older versions? No
* Does this need new documentation? No
Author: Brandon Lin <fa...@gmail.com>
Signed-off-by: Kevin <pi...@apache.org>
Closes #694 from FatalLin/SUBMARINE-952 and squashes the following commits:
a52fd849 [Brandon Lin] SUBMARINE-952. add backoffLimit and Failed status for experiment object
---
.../submarine/server/api/experiment/Experiment.java | 3 ++-
.../submarine/server/submitter/k8s/K8sSubmitter.java | 3 +++
.../server/submitter/k8s/model/MLJobSpec.java | 20 +++++++++++++++++---
.../k8s/model/pytorchjob/PyTorchJobSpec.java | 5 +++--
.../server/submitter/k8s/model/tfjob/TFJobSpec.java | 4 +++-
.../server/submitter/k8s/util/MLJobConverter.java | 6 +++++-
6 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/submarine-server/server-api/src/main/java/org/apache/submarine/server/api/experiment/Experiment.java b/submarine-server/server-api/src/main/java/org/apache/submarine/server/api/experiment/Experiment.java
index ff080b7..74c8786 100644
--- a/submarine-server/server-api/src/main/java/org/apache/submarine/server/api/experiment/Experiment.java
+++ b/submarine-server/server-api/src/main/java/org/apache/submarine/server/api/experiment/Experiment.java
@@ -123,7 +123,8 @@ public class Experiment {
STATUS_CREATED("Created"),
STATUS_RUNNING("Running"),
STATUS_SUCCEEDED("Succeeded"),
- STATUS_DELETED("Deleted");
+ STATUS_DELETED("Deleted"),
+ STATUS_FAILED("Failed");
private String value;
Status(String value) {
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
index a530d16..23a862d 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
@@ -138,6 +138,7 @@ public class K8sSubmitter implements Submitter {
Experiment experiment;
try {
MLJob mlJob = ExperimentSpecParser.parseJob(spec);
+
Object object = api.createNamespacedCustomObject(mlJob.getGroup(), mlJob.getVersion(),
mlJob.getMetadata().getNamespace(), mlJob.getPlural(), mlJob, "true");
experiment = parseExperimentResponseObject(object, ParseOp.PARSE_OP_RESULT);
@@ -160,11 +161,13 @@ public class K8sSubmitter implements Submitter {
Object object = api.getNamespacedCustomObject(mlJob.getGroup(), mlJob.getVersion(),
mlJob.getMetadata().getNamespace(), mlJob.getPlural(), mlJob.getMetadata().getName());
experiment = parseExperimentResponseObject(object, ParseOp.PARSE_OP_RESULT);
+
} catch (InvalidSpecException e) {
throw new SubmarineRuntimeException(200, e.getMessage());
} catch (ApiException e) {
throw new SubmarineRuntimeException(e.getCode(), e.getMessage());
}
+
return experiment;
}
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/MLJobSpec.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/MLJobSpec.java
index 3048a04..a98ed92 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/MLJobSpec.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/MLJobSpec.java
@@ -21,7 +21,21 @@ package org.apache.submarine.server.submitter.k8s.model;
import java.util.Map;
-public interface MLJobSpec {
- Map<MLJobReplicaType, MLJobReplicaSpec> getReplicaSpecs();
- void setReplicaSpecs(Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs);
+import com.google.gson.annotations.SerializedName;
+
+public abstract class MLJobSpec {
+
+ @SerializedName("backoffLimit")
+ private Integer backoffLimit = 3;
+
+ public abstract Map<MLJobReplicaType, MLJobReplicaSpec> getReplicaSpecs();
+ public abstract void setReplicaSpecs(Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs);
+ public Integer getBackoffLimit() {
+ return backoffLimit;
+ }
+
+ public void setBackoffLimit(Integer backoffLimit) {
+ this.backoffLimit = backoffLimit;
+ }
+
}
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/pytorchjob/PyTorchJobSpec.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/pytorchjob/PyTorchJobSpec.java
index 33aa89d..bd0819f 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/pytorchjob/PyTorchJobSpec.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/pytorchjob/PyTorchJobSpec.java
@@ -26,13 +26,13 @@ import org.apache.submarine.server.submitter.k8s.model.MLJobSpec;
import java.util.Map;
-public class PyTorchJobSpec implements MLJobSpec {
+public class PyTorchJobSpec extends MLJobSpec {
/**
* Key: Master, Worker
*/
@SerializedName("pytorchReplicaSpecs")
- private Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs;
+ private Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs;
/**
* Get the replica specs.
@@ -54,4 +54,5 @@ public class PyTorchJobSpec implements MLJobSpec {
Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs) {
this.replicaSpecs = replicaSpecs;
}
+
}
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/tfjob/TFJobSpec.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/tfjob/TFJobSpec.java
index a74f533..e2f3d01 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/tfjob/TFJobSpec.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/tfjob/TFJobSpec.java
@@ -29,13 +29,14 @@ import java.util.Map;
/**
* The replica spec of TFJob.
*/
-public class TFJobSpec implements MLJobSpec {
+public class TFJobSpec extends MLJobSpec {
/**
* Key: Chief, Ps, Worker, Evaluator
*/
@SerializedName("tfReplicaSpecs")
private Map<MLJobReplicaType, MLJobReplicaSpec> tfReplicaSpecs;
+
/**
* Get the replica specs.
*
@@ -56,4 +57,5 @@ public class TFJobSpec implements MLJobSpec {
Map<MLJobReplicaType, MLJobReplicaSpec> tfReplicaSpecs) {
this.tfReplicaSpecs = tfReplicaSpecs;
}
+
}
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/util/MLJobConverter.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/util/MLJobConverter.java
index 725491c..44a4c17 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/util/MLJobConverter.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/util/MLJobConverter.java
@@ -68,7 +68,11 @@ public class MLJobConverter {
dateTime = status.getCompletionTime();
if (dateTime != null) {
experiment.setFinishedTime(dateTime.toString());
- experiment.setStatus(Experiment.Status.STATUS_SUCCEEDED.getValue());
+ if ("Succeeded".equalsIgnoreCase(conditions.get(conditions.size() - 1).getType())) {
+ experiment.setStatus(Experiment.Status.STATUS_SUCCEEDED.getValue());
+ } else if ("Failed".equalsIgnoreCase(conditions.get(conditions.size() - 1).getType())) {
+ experiment.setStatus(Experiment.Status.STATUS_FAILED.getValue());
+ }
}
}
return experiment;
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@submarine.apache.org
For additional commands, e-mail: dev-help@submarine.apache.org