You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by GitBox <gi...@apache.org> on 2019/05/05 11:55:55 UTC

[GitHub] [airflow] chaimt commented on a change in pull request #4633: AIRFLOW-3791: Dataflow

chaimt commented on a change in pull request #4633: AIRFLOW-3791: Dataflow
URL: https://github.com/apache/airflow/pull/4633#discussion_r281017999
 
 

 ##########
 File path: airflow/contrib/hooks/gcp_dataflow_hook.py
 ##########
 @@ -31,88 +31,111 @@
 # This is the default location
 # https://cloud.google.com/dataflow/pipelines/specifying-exec-params
 DEFAULT_DATAFLOW_LOCATION = 'us-central1'
+FAILED_END_STATES = ['JOB_STATE_FAILED', 'JOB_STATE_CANCELLED']
+SUCCEEDED_END_STATES = ['JOB_STATE_DONE']
+END_STATES = SUCCEEDED_END_STATES + FAILED_END_STATES
 
 
 class _DataflowJob(LoggingMixin):
     def __init__(self, dataflow, project_number, name, location, poll_sleep=10,
-                 job_id=None):
+                 job_id=None, multiple_jobs=None):
         self._dataflow = dataflow
         self._project_number = project_number
         self._job_name = name
         self._job_location = location
+        self.multiple_jobs = multiple_jobs
         self._job_id = job_id
-        self._job = self._get_job()
+        self._jobs = self._get_jobs()
         self._poll_sleep = poll_sleep
 
+    def is_job_running(self):
+        for job in self._jobs:
+            if job['currentState'] not in END_STATES:
+                return True
+        return False
+
     def _get_job_id_from_name(self):
         jobs = self._dataflow.projects().locations().jobs().list(
             projectId=self._project_number,
             location=self._job_location
         ).execute(num_retries=5)
+        dataflow_jobs = []
         for job in jobs['jobs']:
-            if job['name'] == self._job_name:
-                self._job_id = job['id']
-                return job
-        return None
-
-    def _get_job(self):
-        if self._job_id:
-            job = self._dataflow.projects().locations().jobs().get(
+            if job['name'].startswith(self._job_name):
+                dataflow_jobs.append(job)
+        if len(dataflow_jobs) == 1:
+            self._job_id = dataflow_jobs[0]['id']
+        return dataflow_jobs
+
+    def _get_jobs(self):
+        if not self.multiple_jobs and self._job_id:
+            self._jobs = []
+            self._jobs.append(self._dataflow.projects().locations().jobs().get(
                 projectId=self._project_number,
                 location=self._job_location,
-                jobId=self._job_id).execute(num_retries=5)
+                jobId=self._job_id).execute(num_retries=5))
         elif self._job_name:
-            job = self._get_job_id_from_name()
+            self._jobs = self._get_job_id_from_name()
         else:
             raise Exception('Missing both dataflow job ID and name.')
 
-        if job and 'currentState' in job:
-            self.log.info(
-                'Google Cloud DataFlow job %s is %s',
-                job['name'], job['currentState']
-            )
-        elif job:
-            self.log.info(
-                'Google Cloud DataFlow with job_id %s has name %s',
-                self._job_id, job['name']
-            )
-        else:
-            self.log.info(
-                'Google Cloud DataFlow job not available yet..'
-            )
+        for job in self._jobs:
+            if job and 'currentState' in job:
+                self._job_state = job['currentState']
+                self.log.info(
+                    'Google Cloud DataFlow job %s is %s',
+                    job['name'], job['currentState']
+                )
+            elif job:
+                self.log.info(
+                    'Google Cloud DataFlow with job_id %s has name %s',
+                    self._job_id, job['name']
+                )
+            else:
+                self.log.info(
+                    'Google Cloud DataFlow job not available yet..'
+                )
 
-        return job
+        return self._jobs
 
     def wait_for_done(self):
         while True:
-            if self._job and 'currentState' in self._job:
-                if 'JOB_STATE_DONE' == self._job['currentState']:
-                    return True
-                elif 'JOB_STATE_RUNNING' == self._job['currentState'] and \
-                     'JOB_TYPE_STREAMING' == self._job['type']:
-                    return True
-                elif 'JOB_STATE_FAILED' == self._job['currentState']:
-                    raise Exception("Google Cloud Dataflow job {} has failed.".format(
-                        self._job['name']))
-                elif 'JOB_STATE_CANCELLED' == self._job['currentState']:
-                    raise Exception("Google Cloud Dataflow job {} was cancelled.".format(
-                        self._job['name']))
-                elif 'JOB_STATE_RUNNING' == self._job['currentState']:
-                    time.sleep(self._poll_sleep)
-                elif 'JOB_STATE_PENDING' == self._job['currentState']:
-                    time.sleep(15)
-                else:
-                    self.log.debug(str(self._job))
-                    raise Exception(
-                        "Google Cloud Dataflow job {} was unknown state: {}".format(
-                            self._job['name'], self._job['currentState']))
+            for job in self._jobs:
+                if job and 'currentState' in job:
+                    if 'JOB_STATE_DONE' == job['currentState']:
+                        # check all jobs are done
+                        count_not_done = 0
+                        for inner_jobs in self._jobs:
+                            if inner_jobs and 'currentState' in job:
+                                if not 'JOB_STATE_DONE' == inner_jobs['currentState']:
+                                    count_not_done += 1
+                        if count_not_done == 0:
+                            return True
+                    elif 'JOB_STATE_RUNNING' == job['currentState'] and \
+                            'JOB_TYPE_STREAMING' == job['type']:
+                        return True
+                    elif 'JOB_STATE_FAILED' == job['currentState']:
+                        raise Exception("Google Cloud Dataflow job {} has failed.".format(
+                            job['name']))
+                    elif 'JOB_STATE_CANCELLED' == job['currentState']:
+                        raise Exception("Google Cloud Dataflow job {} was cancelled.".format(
+                            job['name']))
+                    elif 'JOB_STATE_RUNNING' == job['currentState']:
+                        None
 
 Review comment:
   sure
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services