You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by fo...@apache.org on 2018/01/31 11:52:19 UTC
incubator-airflow git commit: [AIRFLOW-2037] Add methods to get Hash
values of a GCS object
Repository: incubator-airflow
Updated Branches:
refs/heads/master 48202ad5b -> 80d2ee8ac
[AIRFLOW-2037] Add methods to get Hash values of a GCS object
- Added `get_md5hash` and `get_crc32c` in
`gcs_hook` to aid in Data integrity validations.
Closes #2977 from kaxil/hashing_gcs_hook
Project: http://git-wip-us.apache.org/repos/asf/incubator-airflow/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-airflow/commit/80d2ee8a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-airflow/tree/80d2ee8a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-airflow/diff/80d2ee8a
Branch: refs/heads/master
Commit: 80d2ee8acc671fafa049f86b127eb65a1d9699f8
Parents: 48202ad
Author: Kaxil Naik <ka...@gmail.com>
Authored: Wed Jan 31 12:52:13 2018 +0100
Committer: Fokko Driesprong <fo...@godatadriven.com>
Committed: Wed Jan 31 12:52:13 2018 +0100
----------------------------------------------------------------------
airflow/contrib/hooks/gcs_hook.py | 52 ++++++++++++++++++++++++++++++++++
1 file changed, 52 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/80d2ee8a/airflow/contrib/hooks/gcs_hook.py
----------------------------------------------------------------------
diff --git a/airflow/contrib/hooks/gcs_hook.py b/airflow/contrib/hooks/gcs_hook.py
index 3103a5a..ac8f2e0 100644
--- a/airflow/contrib/hooks/gcs_hook.py
+++ b/airflow/contrib/hooks/gcs_hook.py
@@ -297,3 +297,55 @@ class GoogleCloudStorageHook(GoogleCloudBaseHook):
except errors.HttpError as ex:
if ex.resp['status'] == '404':
raise ValueError('Object Not Found')
+
def get_crc32c(self, bucket, object):
    """
    Gets the CRC32c checksum of an object in Google Cloud Storage.

    :param bucket: The Google cloud storage bucket where the object is.
    :type bucket: string
    :param object: The name of the object to check in the Google cloud
        storage bucket.
    :type object: string
    :return: The value of the ``crc32c`` field of the object's metadata
        response, returned unchanged (presumably the base64-encoded
        checksum string described by the GCS JSON API -- confirm).
    :raises ValueError: If the API responds with HTTP status 404.
    :raises errors.HttpError: If the API responds with any other HTTP
        error; it is propagated unchanged.
    """
    self.log.info('Retrieving the crc32c checksum of '
                  'object: %s in bucket: %s', object, bucket)
    service = self.get_conn()
    try:
        # Only the remote call can raise HttpError, so keep the try narrow.
        response = service.objects().get(
            bucket=bucket,
            object=object
        ).execute()
    except errors.HttpError as ex:
        if ex.resp['status'] == '404':
            raise ValueError('Object Not Found')
        # Any other HTTP failure (permissions, server error, ...) must
        # surface to the caller instead of silently yielding None.
        raise

    crc32c = response['crc32c']
    self.log.info('The crc32c checksum of %s is %s', object, crc32c)
    return crc32c
+
def get_md5hash(self, bucket, object):
    """
    Gets the MD5 hash of an object in Google Cloud Storage.

    :param bucket: The Google cloud storage bucket where the object is.
    :type bucket: string
    :param object: The name of the object to check in the Google cloud
        storage bucket.
    :type object: string
    :return: The value of the ``md5Hash`` field of the object's metadata
        response, returned unchanged (presumably the base64-encoded
        hash string described by the GCS JSON API -- confirm).
    :raises ValueError: If the API responds with HTTP status 404.
    :raises errors.HttpError: If the API responds with any other HTTP
        error; it is propagated unchanged.
    """
    self.log.info('Retrieving the MD5 hash of '
                  'object: %s in bucket: %s', object, bucket)
    service = self.get_conn()
    try:
        # Only the remote call can raise HttpError, so keep the try narrow.
        response = service.objects().get(
            bucket=bucket,
            object=object
        ).execute()
    except errors.HttpError as ex:
        if ex.resp['status'] == '404':
            raise ValueError('Object Not Found')
        # Any other HTTP failure (permissions, server error, ...) must
        # surface to the caller instead of silently yielding None.
        raise

    md5hash = response['md5Hash']
    self.log.info('The md5Hash of %s is %s', object, md5hash)
    return md5hash