You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by fo...@apache.org on 2018/01/31 11:52:19 UTC

incubator-airflow git commit: [AIRFLOW-2037] Add methods to get Hash values of a GCS object

Repository: incubator-airflow
Updated Branches:
  refs/heads/master 48202ad5b -> 80d2ee8ac


[AIRFLOW-2037] Add methods to get Hash values of a GCS object

- Added `get_md5hash` and `get_crc32c` to
`gcs_hook` to aid in data integrity validation.

Closes #2977 from kaxil/hashing_gcs_hook


Project: http://git-wip-us.apache.org/repos/asf/incubator-airflow/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-airflow/commit/80d2ee8a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-airflow/tree/80d2ee8a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-airflow/diff/80d2ee8a

Branch: refs/heads/master
Commit: 80d2ee8acc671fafa049f86b127eb65a1d9699f8
Parents: 48202ad
Author: Kaxil Naik <ka...@gmail.com>
Authored: Wed Jan 31 12:52:13 2018 +0100
Committer: Fokko Driesprong <fo...@godatadriven.com>
Committed: Wed Jan 31 12:52:13 2018 +0100

----------------------------------------------------------------------
 airflow/contrib/hooks/gcs_hook.py | 52 ++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/80d2ee8a/airflow/contrib/hooks/gcs_hook.py
----------------------------------------------------------------------
diff --git a/airflow/contrib/hooks/gcs_hook.py b/airflow/contrib/hooks/gcs_hook.py
index 3103a5a..ac8f2e0 100644
--- a/airflow/contrib/hooks/gcs_hook.py
+++ b/airflow/contrib/hooks/gcs_hook.py
@@ -297,3 +297,55 @@ class GoogleCloudStorageHook(GoogleCloudBaseHook):
         except errors.HttpError as ex:
             if ex.resp['status'] == '404':
                 raise ValueError('Object Not Found')
+
+    def get_crc32c(self, bucket, object):
+        """
+        Gets the CRC32c checksum of an object in Google Cloud Storage.
+        :param bucket: The Google cloud storage bucket where the object is.
+        :type bucket: string
+        :param object: The name of the object to check in the Google cloud
+            storage bucket.
+        :type object: string
+        :return: The ``crc32c`` value from the ``objects().get`` response.
+        :raises ValueError: On HTTP 404; other ``HttpError``s are re-raised.
+        """
+        self.log.info('Retrieving the crc32c checksum of '
+                      'object: %s in bucket: %s', object, bucket)
+        service = self.get_conn()
+        try:
+            response = service.objects().get(
+                bucket=bucket, object=object).execute()
+        except errors.HttpError as ex:
+            if ex.resp['status'] == '404':
+                raise ValueError('Object Not Found')
+            # Re-raise other HTTP errors rather than silently returning None.
+            raise
+        crc32c = response['crc32c']
+        self.log.info('The crc32c checksum of %s is %s', object, crc32c)
+        return crc32c
+
+    def get_md5hash(self, bucket, object):
+        """
+        Gets the MD5 hash of an object in Google Cloud Storage.
+        :param bucket: The Google cloud storage bucket where the object is.
+        :type bucket: string
+        :param object: The name of the object to check in the Google cloud
+            storage bucket.
+        :type object: string
+        :return: The ``md5Hash`` value from the ``objects().get`` response.
+        :raises ValueError: On HTTP 404; other ``HttpError``s are re-raised.
+        """
+        self.log.info('Retrieving the MD5 hash of '
+                      'object: %s in bucket: %s', object, bucket)
+        service = self.get_conn()
+        try:
+            response = service.objects().get(
+                bucket=bucket, object=object).execute()
+        except errors.HttpError as ex:
+            if ex.resp['status'] == '404':
+                raise ValueError('Object Not Found')
+            # Re-raise other HTTP errors rather than silently returning None.
+            raise
+        md5hash = response['md5Hash']
+        self.log.info('The md5Hash of %s is %s', object, md5hash)
+        return md5hash