You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@submarine.apache.org by pi...@apache.org on 2021/08/07 04:41:12 UTC

[submarine] branch master updated: SUBMARINE-952. add backoffLimit and Failed status for experiment object

This is an automated email from the ASF dual-hosted git repository.

pingsutw pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/submarine.git


The following commit(s) were added to refs/heads/master by this push:
     new d75f4f2  SUBMARINE-952. add backoffLimit and Failed status for experiment object
d75f4f2 is described below

commit d75f4f2b429f96e49042991a3af9f1fff153f740
Author: Brandon Lin <fa...@gmail.com>
AuthorDate: Tue Aug 3 23:24:35 2021 +0800

    SUBMARINE-952. add backoffLimit and Failed status for experiment object
    
    ### What is this PR for?
    As mentioned in the Jira ticket, Submarine currently retries retryable jobs endlessly, even when those jobs never have a chance to succeed. This is obviously a waste of resources, so I added an MLJob property, backoffLimit, to prevent this situation. At the same time, I changed MLJobSpec from an interface into an abstract class so that the property is shared by TFJobSpec and PyTorchJobSpec.
    I also fixed a bug so that the correct experiment status is reported when a job fails.
    
    ### What type of PR is it?
    Improvement
    
    ### Todos
    N/A
    
    ### What is the Jira issue?
    https://issues.apache.org/jira/browse/SUBMARINE-952
    
    ### How should this be tested?
    Modify the test case (https://github.com/apache/submarine/blob/master/submarine-test/test-e2e/src/test/java/org/apache/submarine/integration/experimentIT.java#L90) from {1024, 1024} to {512, 512}.
    The experiment will then hit an OOM failure, and its status will change to Failed after 3 retries.
    ### Screenshots (if appropriate)
    <img width="1380" alt="截圖 2021-08-01 下午5 03 43" src="https://user-images.githubusercontent.com/5687317/128044592-e2cee95c-2ee9-4702-88ff-d41950e003ec.png">
    <img width="1394" alt="截圖 2021-08-03 下午11 10 39" src="https://user-images.githubusercontent.com/5687317/128044618-454afdb5-c1b8-4395-a75e-f470a7c41625.png">
    
    ### Questions:
    * Do the license files need updating? No
    * Are there breaking changes for older versions? No
    * Does this need new documentation? No
    
    Author: Brandon Lin <fa...@gmail.com>
    
    Signed-off-by: Kevin <pi...@apache.org>
    
    Closes #694 from FatalLin/SUBMARINE-952 and squashes the following commits:
    
    a52fd849 [Brandon Lin] SUBMARINE-952. add backoffLimit and Failed status for experiment object
---
 .../submarine/server/api/experiment/Experiment.java  |  3 ++-
 .../submarine/server/submitter/k8s/K8sSubmitter.java |  3 +++
 .../server/submitter/k8s/model/MLJobSpec.java        | 20 +++++++++++++++++---
 .../k8s/model/pytorchjob/PyTorchJobSpec.java         |  5 +++--
 .../server/submitter/k8s/model/tfjob/TFJobSpec.java  |  4 +++-
 .../server/submitter/k8s/util/MLJobConverter.java    |  6 +++++-
 6 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/submarine-server/server-api/src/main/java/org/apache/submarine/server/api/experiment/Experiment.java b/submarine-server/server-api/src/main/java/org/apache/submarine/server/api/experiment/Experiment.java
index ff080b7..74c8786 100644
--- a/submarine-server/server-api/src/main/java/org/apache/submarine/server/api/experiment/Experiment.java
+++ b/submarine-server/server-api/src/main/java/org/apache/submarine/server/api/experiment/Experiment.java
@@ -123,7 +123,8 @@ public class Experiment {
     STATUS_CREATED("Created"),
     STATUS_RUNNING("Running"),
     STATUS_SUCCEEDED("Succeeded"),
-    STATUS_DELETED("Deleted");
+    STATUS_DELETED("Deleted"),
+    STATUS_FAILED("Failed");
 
     private String value;
     Status(String value) {
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
index a530d16..23a862d 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
@@ -138,6 +138,7 @@ public class K8sSubmitter implements Submitter {
     Experiment experiment;
     try {
       MLJob mlJob = ExperimentSpecParser.parseJob(spec);
+
       Object object = api.createNamespacedCustomObject(mlJob.getGroup(), mlJob.getVersion(),
           mlJob.getMetadata().getNamespace(), mlJob.getPlural(), mlJob, "true");
       experiment = parseExperimentResponseObject(object, ParseOp.PARSE_OP_RESULT);
@@ -160,11 +161,13 @@ public class K8sSubmitter implements Submitter {
       Object object = api.getNamespacedCustomObject(mlJob.getGroup(), mlJob.getVersion(),
           mlJob.getMetadata().getNamespace(), mlJob.getPlural(), mlJob.getMetadata().getName());
       experiment = parseExperimentResponseObject(object, ParseOp.PARSE_OP_RESULT);
+
     } catch (InvalidSpecException e) {
       throw new SubmarineRuntimeException(200, e.getMessage());
     } catch (ApiException e) {
       throw new SubmarineRuntimeException(e.getCode(), e.getMessage());
     }
+
     return experiment;
   }
 
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/MLJobSpec.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/MLJobSpec.java
index 3048a04..a98ed92 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/MLJobSpec.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/MLJobSpec.java
@@ -21,7 +21,21 @@ package org.apache.submarine.server.submitter.k8s.model;
 
 import java.util.Map;
 
-public interface MLJobSpec {
-  Map<MLJobReplicaType, MLJobReplicaSpec> getReplicaSpecs();
-  void setReplicaSpecs(Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs);
+import com.google.gson.annotations.SerializedName;
+
+public abstract class MLJobSpec {
+  
+  @SerializedName("backoffLimit")
+  private Integer backoffLimit = 3;
+  
+  public abstract Map<MLJobReplicaType, MLJobReplicaSpec> getReplicaSpecs();
+  public abstract void setReplicaSpecs(Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs);
+  public Integer getBackoffLimit() {
+    return backoffLimit;
+  }
+  
+  public void setBackoffLimit(Integer backoffLimit) {
+    this.backoffLimit = backoffLimit;
+  }
+
 }
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/pytorchjob/PyTorchJobSpec.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/pytorchjob/PyTorchJobSpec.java
index 33aa89d..bd0819f 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/pytorchjob/PyTorchJobSpec.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/pytorchjob/PyTorchJobSpec.java
@@ -26,13 +26,13 @@ import org.apache.submarine.server.submitter.k8s.model.MLJobSpec;
 
 import java.util.Map;
 
-public class PyTorchJobSpec implements MLJobSpec {
+public class PyTorchJobSpec extends MLJobSpec {
 
   /**
    * Key: Master, Worker
    */
   @SerializedName("pytorchReplicaSpecs")
-  private Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs;
+  private Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs; 
 
   /**
    * Get the replica specs.
@@ -54,4 +54,5 @@ public class PyTorchJobSpec implements MLJobSpec {
       Map<MLJobReplicaType, MLJobReplicaSpec> replicaSpecs) {
     this.replicaSpecs = replicaSpecs;
   }
+
 }
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/tfjob/TFJobSpec.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/tfjob/TFJobSpec.java
index a74f533..e2f3d01 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/tfjob/TFJobSpec.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/tfjob/TFJobSpec.java
@@ -29,13 +29,14 @@ import java.util.Map;
 /**
  * The replica spec of TFJob.
  */
-public class TFJobSpec implements MLJobSpec {
+public class TFJobSpec extends MLJobSpec {
   /**
    * Key: Chief, Ps, Worker, Evaluator
    */
   @SerializedName("tfReplicaSpecs")
   private Map<MLJobReplicaType, MLJobReplicaSpec> tfReplicaSpecs;
 
+
   /**
    * Get the replica specs.
    *
@@ -56,4 +57,5 @@ public class TFJobSpec implements MLJobSpec {
       Map<MLJobReplicaType, MLJobReplicaSpec> tfReplicaSpecs) {
     this.tfReplicaSpecs = tfReplicaSpecs;
   }
+
 }
diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/util/MLJobConverter.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/util/MLJobConverter.java
index 725491c..44a4c17 100644
--- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/util/MLJobConverter.java
+++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/util/MLJobConverter.java
@@ -68,7 +68,11 @@ public class MLJobConverter {
       dateTime = status.getCompletionTime();
       if (dateTime != null) {
         experiment.setFinishedTime(dateTime.toString());
-        experiment.setStatus(Experiment.Status.STATUS_SUCCEEDED.getValue());
+        if ("Succeeded".equalsIgnoreCase(conditions.get(conditions.size() - 1).getType())) {
+          experiment.setStatus(Experiment.Status.STATUS_SUCCEEDED.getValue());
+        } else if ("Failed".equalsIgnoreCase(conditions.get(conditions.size() - 1).getType())) {
+          experiment.setStatus(Experiment.Status.STATUS_FAILED.getValue());
+        }
       }
     }
     return experiment;

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@submarine.apache.org
For additional commands, e-mail: dev-help@submarine.apache.org