You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@libcloud.apache.org by to...@apache.org on 2018/01/08 11:11:25 UTC

[1/4] libcloud git commit: storage: Fix hash computation performance on upload

Repository: libcloud
Updated Branches:
  refs/heads/trunk ad689715e -> 484e02a63


storage: Fix hash computation performance on upload

The storage base driver computes the hash of uploaded files: individual
drivers use it to ensure the reported hash is correct. Before libcloud
2.x, this was done efficiently: the file was only read once, and we were
using `hash.update()` to avoid keeping the whole file in memory. With
the switch to the requests module, both of these optimizations were
removed inadvertently. It turns out the important one is using
`hash.update()`: computing the hash on the whole file in memory is
orders of magnitude slower.

Closes #1135

Signed-off-by: Tomaz Muraus <to...@tomaz.me>


Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/1e955af3
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/1e955af3
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/1e955af3

Branch: refs/heads/trunk
Commit: 1e955af39bb799d32808faa7ebe37455b8ad7e9e
Parents: ad68971
Author: Quentin Pradet <qu...@clustree.com>
Authored: Wed Oct 18 11:15:22 2017 +0400
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 11:19:41 2018 +0100

----------------------------------------------------------------------
 libcloud/storage/base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/libcloud/blob/1e955af3/libcloud/storage/base.py
----------------------------------------------------------------------
diff --git a/libcloud/storage/base.py b/libcloud/storage/base.py
index b3de63d..617cfed 100644
--- a/libcloud/storage/base.py
+++ b/libcloud/storage/base.py
@@ -643,9 +643,9 @@ class StorageDriver(BaseDriver):
     def _hash_buffered_stream(self, stream, hasher, blocksize=65536):
         total_len = 0
         if hasattr(stream, '__next__'):
-            data = libcloud.utils.files.exhaust_iterator(iterator=stream)
-            hasher.update(b(data))
-            total_len = len(data)
+            for chunk in libcloud.utils.files.read_in_chunks(iterator=stream):
+                hasher.update(b(chunk))
+                total_len += len(chunk)
             return (hasher.hexdigest(), total_len)
         if not hasattr(stream, '__exit__'):
             for s in stream:


[3/4] libcloud git commit: Add a test case for #1135.

Posted by to...@apache.org.
Add a test case for #1135.


Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/0710d23d
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/0710d23d
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/0710d23d

Branch: refs/heads/trunk
Commit: 0710d23de52a1c2c4841937e4c7c17b81950b9ae
Parents: f053297
Author: Tomaz Muraus <to...@tomaz.me>
Authored: Mon Jan 8 11:58:10 2018 +0100
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 12:00:17 2018 +0100

----------------------------------------------------------------------
 libcloud/test/storage/test_base.py | 44 +++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/libcloud/blob/0710d23d/libcloud/test/storage/test_base.py
----------------------------------------------------------------------
diff --git a/libcloud/test/storage/test_base.py b/libcloud/test/storage/test_base.py
index 2f3b048..dabc0f5 100644
--- a/libcloud/test/storage/test_base.py
+++ b/libcloud/test/storage/test_base.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 
 import sys
+import hashlib
 
 from libcloud.utils.py3 import httplib
 from io import BytesIO
 
+import mock
 from mock import Mock
 
 from libcloud.utils.py3 import StringIO
@@ -28,6 +30,7 @@ from libcloud.storage.base import DEFAULT_CONTENT_TYPE
 
 from libcloud.test import unittest
 from libcloud.test import MockHttp
+from libcloud.test import BodyStream
 
 
 class BaseMockRawResponse(MockHttp):
@@ -131,6 +134,47 @@ class BaseStorageTests(unittest.TestCase):
                                 request_path='/',
                                 stream=iterator)
 
+    @mock.patch('libcloud.utils.files.exhaust_iterator')
+    @mock.patch('libcloud.utils.files.read_in_chunks')
+    def test_upload_object_hash_calculation_is_efficient(self, mock_read_in_chunks,
+                                                         mock_exhaust_iterator):
+        # Verify that we don't buffer whole file in memory when calculating
+        # object has when iterator has __next__ method, but instead read and calculate hash in chunks
+        size = 100
+
+        mock_read_in_chunks.return_value = 'a' * size
+
+        iterator = BodyStream('a' * size)
+
+        upload_func = Mock()
+        upload_func.return_value = True, '', size
+
+        # strict_mode is disabled, default content type should be used
+        self.driver1.connection = Mock()
+
+        self.assertEqual(mock_read_in_chunks.call_count, 0)
+        self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
+        result = self.driver1._upload_object(object_name='test',
+                                             content_type=None,
+                                             upload_func=upload_func,
+                                             upload_func_kwargs={},
+                                             request_path='/',
+                                             stream=iterator)
+
+        hasher = hashlib.md5()
+        hasher.update('a' * size)
+        expected_hash = hasher.hexdigest()
+
+        self.assertEqual(result['data_hash'], expected_hash)
+        self.assertEqual(result['bytes_transferred'], size)
+
+        headers = self.driver1.connection.request.call_args[-1]['headers']
+        self.assertEqual(headers['Content-Type'], DEFAULT_CONTENT_TYPE)
+
+        self.assertEqual(mock_read_in_chunks.call_count, 1)
+        self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
 
 if __name__ == '__main__':
     sys.exit(unittest.main())


[4/4] libcloud git commit: Update the code to use the same code path in case iterator object has next() method. next() and __next__() should be equivalent so we should follow the same code path in such scenario (no need to fall back to while loop at the

Posted by to...@apache.org.
Update the code to use the same code path in case iterator object has
next() method. next() and __next__() should be equivalent so we should
follow the same code path in such scenario (no need to fall back to
while loop at the end of method).

Also add a test case for it.


Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/484e02a6
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/484e02a6
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/484e02a6

Branch: refs/heads/trunk
Commit: 484e02a6395f3f05989a90b88bdb4c9b583ab4d7
Parents: 0710d23
Author: Tomaz Muraus <to...@tomaz.me>
Authored: Mon Jan 8 12:04:09 2018 +0100
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 12:04:09 2018 +0100

----------------------------------------------------------------------
 libcloud/storage/base.py           |  7 +++++-
 libcloud/test/storage/test_base.py | 39 ++++++++++++++++++++++++++++++---
 2 files changed, 42 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/libcloud/blob/484e02a6/libcloud/storage/base.py
----------------------------------------------------------------------
diff --git a/libcloud/storage/base.py b/libcloud/storage/base.py
index 617cfed..052725b 100644
--- a/libcloud/storage/base.py
+++ b/libcloud/storage/base.py
@@ -642,22 +642,27 @@ class StorageDriver(BaseDriver):
 
     def _hash_buffered_stream(self, stream, hasher, blocksize=65536):
         total_len = 0
-        if hasattr(stream, '__next__'):
+
+        if hasattr(stream, '__next__') or hasattr(stream, 'next'):
             for chunk in libcloud.utils.files.read_in_chunks(iterator=stream):
                 hasher.update(b(chunk))
                 total_len += len(chunk)
+
             return (hasher.hexdigest(), total_len)
+
         if not hasattr(stream, '__exit__'):
             for s in stream:
                 hasher.update(s)
                 total_len = total_len + len(s)
             return (hasher.hexdigest(), total_len)
+
         with stream:
             buf = stream.read(blocksize)
             while len(buf) > 0:
                 total_len = total_len + len(buf)
                 hasher.update(buf)
                 buf = stream.read(blocksize)
+
         return (hasher.hexdigest(), total_len)
 
     def _get_hash_function(self):

http://git-wip-us.apache.org/repos/asf/libcloud/blob/484e02a6/libcloud/test/storage/test_base.py
----------------------------------------------------------------------
diff --git a/libcloud/test/storage/test_base.py b/libcloud/test/storage/test_base.py
index dabc0f5..87a6954 100644
--- a/libcloud/test/storage/test_base.py
+++ b/libcloud/test/storage/test_base.py
@@ -142,16 +142,18 @@ class BaseStorageTests(unittest.TestCase):
         # object has when iterator has __next__ method, but instead read and calculate hash in chunks
         size = 100
 
+        self.driver1.connection = Mock()
+
+        # stream has __next__ method and next() method
         mock_read_in_chunks.return_value = 'a' * size
 
         iterator = BodyStream('a' * size)
+        self.assertTrue(hasattr(iterator, '__next__'))
+        self.assertTrue(hasattr(iterator, 'next'))
 
         upload_func = Mock()
         upload_func.return_value = True, '', size
 
-        # strict_mode is disabled, default content type should be used
-        self.driver1.connection = Mock()
-
         self.assertEqual(mock_read_in_chunks.call_count, 0)
         self.assertEqual(mock_exhaust_iterator.call_count, 0)
 
@@ -175,6 +177,37 @@ class BaseStorageTests(unittest.TestCase):
         self.assertEqual(mock_read_in_chunks.call_count, 1)
         self.assertEqual(mock_exhaust_iterator.call_count, 0)
 
+        # stream has only has next() method
+        mock_read_in_chunks.return_value = 'b' * size
+
+        iterator = iter([str(v) for v in ['b' * size]])
+
+        self.assertFalse(hasattr(iterator, '__next__'))
+        self.assertTrue(hasattr(iterator, 'next'))
+
+        self.assertEqual(mock_read_in_chunks.call_count, 1)
+        self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
+        result = self.driver1._upload_object(object_name='test',
+                                             content_type=None,
+                                             upload_func=upload_func,
+                                             upload_func_kwargs={},
+                                             request_path='/',
+                                             stream=iterator)
+
+        hasher = hashlib.md5()
+        hasher.update('b' * size)
+        expected_hash = hasher.hexdigest()
+
+        self.assertEqual(result['data_hash'], expected_hash)
+        self.assertEqual(result['bytes_transferred'], size)
+
+        headers = self.driver1.connection.request.call_args[-1]['headers']
+        self.assertEqual(headers['Content-Type'], DEFAULT_CONTENT_TYPE)
+
+        self.assertEqual(mock_read_in_chunks.call_count, 2)
+        self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
 
 if __name__ == '__main__':
     sys.exit(unittest.main())


[2/4] libcloud git commit: Add changelog entry for #1135.

Posted by to...@apache.org.
Add changelog entry for #1135.


Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/f0532973
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/f0532973
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/f0532973

Branch: refs/heads/trunk
Commit: f05329730f78b865330fc7cf04b63fd48bec6a54
Parents: 1e955af
Author: Tomaz Muraus <to...@tomaz.me>
Authored: Mon Jan 8 11:21:35 2018 +0100
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 11:21:35 2018 +0100

----------------------------------------------------------------------
 CHANGES.rst | 5 +++++
 1 file changed, 5 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/libcloud/blob/f0532973/CHANGES.rst
----------------------------------------------------------------------
diff --git a/CHANGES.rst b/CHANGES.rst
index f402a56..b24bb8b 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -135,6 +135,11 @@ Storage
   (GITHUB-1132)
   [Quentin Pradet]
 
+- Fix a regression with hash computation performance and memory usage on object
+  upload inadvertently introduced in 2.0.0 and make it more efficient.
+  (GITHUB-1135)
+  [Quentin Pradet]
+
 Changes in Apache Libcloud 2.2.1
 --------------------------------