Posted to commits@airflow.apache.org by GitBox <gi...@apache.org> on 2020/08/04 21:21:21 UTC

[GitHub] [airflow] jaketf commented on a change in pull request #9590: Improve idempotency of BigQueryInsertJobOperator

jaketf commented on a change in pull request #9590:
URL: https://github.com/apache/airflow/pull/9590#discussion_r465336820



##########
File path: airflow/providers/google/cloud/operators/bigquery.py
##########
@@ -1677,39 +1697,77 @@ def __init__(
         self.project_id = project_id
         self.gcp_conn_id = gcp_conn_id
         self.delegate_to = delegate_to
+        self.force_rerun = force_rerun
+        self.reattach_states: Set[str] = reattach_states or set()
 
     def prepare_template(self) -> None:
         # If .json is passed then we have to read the file
         if isinstance(self.configuration, str) and self.configuration.endswith('.json'):
             with open(self.configuration, 'r') as file:
                 self.configuration = json.loads(file.read())
 
+    def _submit_job(
+        self,
+        hook: BigQueryHook,
+        job_id: str,
+    ) -> BigQueryJob:
+        # Submit a new job
+        job = hook.insert_job(
+            configuration=self.configuration,
+            project_id=self.project_id,
+            location=self.location,
+            job_id=job_id,
+        )
+        # Start the job and wait for it to complete and get the result.
+        job.result()
+        return job
+
+    @staticmethod
+    def _handle_job_error(job: BigQueryJob) -> None:
+        if job.error_result:
+            raise AirflowException(f"BigQuery job {job.job_id} failed: {job.error_result}")
+
+    def _job_id(self, context):
+        if self.force_rerun:
+            hash_base = str(uuid.uuid4())
+        else:
+            hash_base = json.dumps(self.configuration, sort_keys=True)
+
+        uniqueness_suffix = hashlib.md5(hash_base.encode()).hexdigest()
+
+        if self.job_id:
+            return f"{self.job_id}_{uniqueness_suffix}"
+
+        exec_date = re.sub(r"\:|-|\+", "_", context['execution_date'].isoformat())
+        return f"airflow_{self.dag_id}_{self.task_id}_{exec_date}_{uniqueness_suffix}"
+
     def execute(self, context: Any):
         hook = BigQueryHook(
             gcp_conn_id=self.gcp_conn_id,
             delegate_to=self.delegate_to,
         )
 
-        job_id = self.job_id or f"airflow_{self.task_id}_{int(time())}"
+        job_id = self._job_id(context)
+
         try:
-            job = hook.insert_job(
-                configuration=self.configuration,
-                project_id=self.project_id,
-                location=self.location,
-                job_id=job_id,
-            )
-            # Start the job and wait for it to complete and get the result.
-            job.result()
+            job = self._submit_job(hook, job_id)
+            self._handle_job_error(job)
         except Conflict:
+            # If the job already exists retrieve it
             job = hook.get_job(
                 project_id=self.project_id,
                 location=self.location,
                 job_id=job_id,
             )
-            # Get existing job and wait for it to be ready
-            for time_to_wait in exponential_sleep_generator(initial=10, maximum=120):
-                sleep(time_to_wait)
-                job.reload()
-                if job.done():
-                    break
+            if job.state in self.reattach_states and not job.done():
+                # The job is still running so wait for it to be ready
+                job.result()
+                self._handle_job_error(job)
+            elif job.done():
+                # Same job configuration so we need force_rerun
+                raise AirflowException(
+                    f"Job with id: {job_id} already exists and is in {job.state} state. If you "
+                    f"want to force rerun it consider setting `force_rerun=True`."

Review comment:
       ```suggestion
                       f"want to force rerun it consider setting `force_rerun=True`."
                       f"Or, if you want to reattach in this scenario add {job.state} to `reattach_states`"
       ```
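
For context on when this Conflict/reattach branch can fire at all: the job id in this diff is derived deterministically from the job configuration. Below is a minimal standalone sketch of that hashing logic; `build_job_id` is a hypothetical helper name used only for illustration, not part of the operator.

```python
import hashlib
import json
import re
import uuid
from datetime import datetime


def build_job_id(configuration: dict, dag_id: str, task_id: str,
                 execution_date: datetime, force_rerun: bool = False) -> str:
    # force_rerun salts the hash with a random UUID so every run gets a fresh job id.
    # Otherwise the configuration itself is hashed, so re-running the same task
    # instance with an unchanged config reproduces the same id and hits the
    # Conflict / reattach path instead of submitting a duplicate job.
    if force_rerun:
        hash_base = str(uuid.uuid4())
    else:
        hash_base = json.dumps(configuration, sort_keys=True)
    uniqueness_suffix = hashlib.md5(hash_base.encode()).hexdigest()
    # Normalise the timestamp separators to underscores, mirroring the diff above.
    exec_date = re.sub(r"\:|-|\+", "_", execution_date.isoformat())
    return f"airflow_{dag_id}_{task_id}_{exec_date}_{uniqueness_suffix}"


# Same configuration + same execution date -> same job id on every retry.
cfg = {"query": {"query": "SELECT 1", "useLegacySql": False}}
print(build_job_id(cfg, "my_dag", "my_task", datetime(2020, 8, 4)))
```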

##########
File path: airflow/providers/google/cloud/operators/bigquery.py
##########
@@ -1666,6 +1684,8 @@ def __init__(
         project_id: Optional[str] = None,
         location: Optional[str] = None,
         job_id: Optional[str] = None,
+        force_rerun: bool = False,

Review comment:
       Hmm, these defaults don't seem to make sense to me.
   Why is `force_rerun=False` but `reattach_states=None` (eventually an empty set)? With those defaults, if there is an existing job the user could reattach to, they instead get this Airflow exception telling them they "should've set force_rerun to true".
   We should either default `force_rerun=True` or provide a default set of reattach states.
   IMO user expectation is `force_rerun=True`.
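
To make the trade-off concrete, a hedged usage sketch of how a user would have to opt in under the defaults as written in this diff; the DAG id, query, and state names below are illustrative, and `force_rerun` / `reattach_states` are the parameters this PR introduces.

```python
from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

with DAG("bq_insert_job_example", start_date=datetime(2020, 8, 1), schedule_interval=None) as dag:
    # Reattach instead of failing when a previous attempt left a job with the
    # same id in one of these states (state names are illustrative, not exhaustive).
    reattach_job = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration={"query": {"query": "SELECT 1", "useLegacySql": False}},
        location="US",
        reattach_states={"PENDING", "RUNNING"},
    )

    # Alternatively, always submit under a brand-new job id and never reattach.
    rerun_job = BigQueryInsertJobOperator(
        task_id="insert_query_job_forced",
        configuration={"query": {"query": "SELECT 1", "useLegacySql": False}},
        location="US",
        force_rerun=True,
    )
```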




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org