You are viewing a plain text version of this content. The canonical link for it is here.

Posted to notifications@libcloud.apache.org by to...@apache.org on 2018/01/08 11:11:25 UTC

[1/4] libcloud git commit: storage: Fix hash computation performance on upload

Repository: libcloud
Updated Branches:
  refs/heads/trunk ad689715e -> 484e02a63


storage: Fix hash computation performance on upload

The storage base driver computes the hash of uploaded files: individual
drivers use it to ensure the reported hash is correct. Before libcloud
2.x, this was done efficiently: the file was only read once, and we were
using `hash.update()` to avoid keeping the whole file in memory. With
the switch to the requests module, both of these optimizations were
removed inadvertently. It turns out the important one is using
`hash.update()`: computing the hash on the whole file in memory is
orders of magnitude slower.

Closes #1135

Signed-off-by: Tomaz Muraus <to...@tomaz.me>


Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/1e955af3
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/1e955af3
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/1e955af3

Branch: refs/heads/trunk
Commit: 1e955af39bb799d32808faa7ebe37455b8ad7e9e
Parents: ad68971
Author: Quentin Pradet <qu...@clustree.com>
Authored: Wed Oct 18 11:15:22 2017 +0400
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 11:19:41 2018 +0100

----------------------------------------------------------------------
 libcloud/storage/base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/libcloud/blob/1e955af3/libcloud/storage/base.py
----------------------------------------------------------------------
diff --git a/libcloud/storage/base.py b/libcloud/storage/base.py
index b3de63d..617cfed 100644
--- a/libcloud/storage/base.py
+++ b/libcloud/storage/base.py
@@ -643,9 +643,9 @@ class StorageDriver(BaseDriver):
     def _hash_buffered_stream(self, stream, hasher, blocksize=65536):
         total_len = 0
         if hasattr(stream, '__next__'):
-            data = libcloud.utils.files.exhaust_iterator(iterator=stream)
-            hasher.update(b(data))
-            total_len = len(data)
+            for chunk in libcloud.utils.files.read_in_chunks(iterator=stream):
+                hasher.update(b(chunk))
+                total_len += len(chunk)
             return (hasher.hexdigest(), total_len)
         if not hasattr(stream, '__exit__'):
             for s in stream:

[3/4] libcloud git commit: Add a test case for #1135.

Posted by to...@apache.org.

Add a test case for #1135.


Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/0710d23d
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/0710d23d
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/0710d23d

Branch: refs/heads/trunk
Commit: 0710d23de52a1c2c4841937e4c7c17b81950b9ae
Parents: f053297
Author: Tomaz Muraus <to...@tomaz.me>
Authored: Mon Jan 8 11:58:10 2018 +0100
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 12:00:17 2018 +0100

----------------------------------------------------------------------
 libcloud/test/storage/test_base.py | 44 +++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/libcloud/blob/0710d23d/libcloud/test/storage/test_base.py
----------------------------------------------------------------------
diff --git a/libcloud/test/storage/test_base.py b/libcloud/test/storage/test_base.py
index 2f3b048..dabc0f5 100644
--- a/libcloud/test/storage/test_base.py
+++ b/libcloud/test/storage/test_base.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 
 import sys
+import hashlib
 
 from libcloud.utils.py3 import httplib
 from io import BytesIO
 
+import mock
 from mock import Mock
 
 from libcloud.utils.py3 import StringIO
@@ -28,6 +30,7 @@ from libcloud.storage.base import DEFAULT_CONTENT_TYPE
 
 from libcloud.test import unittest
 from libcloud.test import MockHttp
+from libcloud.test import BodyStream
 
 
 class BaseMockRawResponse(MockHttp):
@@ -131,6 +134,47 @@ class BaseStorageTests(unittest.TestCase):
                                 request_path='/',
                                 stream=iterator)
 
+    @mock.patch('libcloud.utils.files.exhaust_iterator')
+    @mock.patch('libcloud.utils.files.read_in_chunks')
+    def test_upload_object_hash_calculation_is_efficient(self, mock_read_in_chunks,
+                                                         mock_exhaust_iterator):
+        # Verify that we don't buffer whole file in memory when calculating
+        # object has when iterator has __next__ method, but instead read and calculate hash in chunks
+        size = 100
+
+        mock_read_in_chunks.return_value = 'a' * size
+
+        iterator = BodyStream('a' * size)
+
+        upload_func = Mock()
+        upload_func.return_value = True, '', size
+
+        # strict_mode is disabled, default content type should be used
+        self.driver1.connection = Mock()
+
+        self.assertEqual(mock_read_in_chunks.call_count, 0)
+        self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
+        result = self.driver1._upload_object(object_name='test',
+                                             content_type=None,
+                                             upload_func=upload_func,
+                                             upload_func_kwargs={},
+                                             request_path='/',
+                                             stream=iterator)
+
+        hasher = hashlib.md5()
+        hasher.update('a' * size)
+        expected_hash = hasher.hexdigest()
+
+        self.assertEqual(result['data_hash'], expected_hash)
+        self.assertEqual(result['bytes_transferred'], size)
+
+        headers = self.driver1.connection.request.call_args[-1]['headers']
+        self.assertEqual(headers['Content-Type'], DEFAULT_CONTENT_TYPE)
+
+        self.assertEqual(mock_read_in_chunks.call_count, 1)
+        self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
 
 if __name__ == '__main__':
     sys.exit(unittest.main())

[4/4] libcloud git commit: Update the code to use the same code path in case iterator object has next() method. next() and next() should be equivalent so we should follow the same code path in such scenario (no need to fall back to while loop at the

Posted by to...@apache.org.

Update the code to use the same code path in case iterator object has
next() method. next() and __next__() should be equivalent so we should
follow the same code path in such scenario (no need to fall back to
while loop at the end of method).

Also add a test case for it.


Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/484e02a6
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/484e02a6
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/484e02a6

Branch: refs/heads/trunk
Commit: 484e02a6395f3f05989a90b88bdb4c9b583ab4d7
Parents: 0710d23
Author: Tomaz Muraus <to...@tomaz.me>
Authored: Mon Jan 8 12:04:09 2018 +0100
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 12:04:09 2018 +0100

----------------------------------------------------------------------
 libcloud/storage/base.py           |  7 +++++-
 libcloud/test/storage/test_base.py | 39 ++++++++++++++++++++++++++++++---
 2 files changed, 42 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/libcloud/blob/484e02a6/libcloud/storage/base.py
----------------------------------------------------------------------
diff --git a/libcloud/storage/base.py b/libcloud/storage/base.py
index 617cfed..052725b 100644
--- a/libcloud/storage/base.py
+++ b/libcloud/storage/base.py
@@ -642,22 +642,27 @@ class StorageDriver(BaseDriver):
 
     def _hash_buffered_stream(self, stream, hasher, blocksize=65536):
         total_len = 0
-        if hasattr(stream, '__next__'):
+
+        if hasattr(stream, '__next__') or hasattr(stream, 'next'):
             for chunk in libcloud.utils.files.read_in_chunks(iterator=stream):
                 hasher.update(b(chunk))
                 total_len += len(chunk)
+
             return (hasher.hexdigest(), total_len)
+
         if not hasattr(stream, '__exit__'):
             for s in stream:
                 hasher.update(s)
                 total_len = total_len + len(s)
             return (hasher.hexdigest(), total_len)
+
         with stream:
             buf = stream.read(blocksize)
             while len(buf) > 0:
                 total_len = total_len + len(buf)
                 hasher.update(buf)
                 buf = stream.read(blocksize)
+
         return (hasher.hexdigest(), total_len)
 
     def _get_hash_function(self):

http://git-wip-us.apache.org/repos/asf/libcloud/blob/484e02a6/libcloud/test/storage/test_base.py
----------------------------------------------------------------------
diff --git a/libcloud/test/storage/test_base.py b/libcloud/test/storage/test_base.py
index dabc0f5..87a6954 100644
--- a/libcloud/test/storage/test_base.py
+++ b/libcloud/test/storage/test_base.py
@@ -142,16 +142,18 @@ class BaseStorageTests(unittest.TestCase):
         # object has when iterator has __next__ method, but instead read and calculate hash in chunks
         size = 100
 
+        self.driver1.connection = Mock()
+
+        # stream has __next__ method and next() method
         mock_read_in_chunks.return_value = 'a' * size
 
         iterator = BodyStream('a' * size)
+        self.assertTrue(hasattr(iterator, '__next__'))
+        self.assertTrue(hasattr(iterator, 'next'))
 
         upload_func = Mock()
         upload_func.return_value = True, '', size
 
-        # strict_mode is disabled, default content type should be used
-        self.driver1.connection = Mock()
-
         self.assertEqual(mock_read_in_chunks.call_count, 0)
         self.assertEqual(mock_exhaust_iterator.call_count, 0)
 
@@ -175,6 +177,37 @@ class BaseStorageTests(unittest.TestCase):
         self.assertEqual(mock_read_in_chunks.call_count, 1)
         self.assertEqual(mock_exhaust_iterator.call_count, 0)
 
+        # stream has only has next() method
+        mock_read_in_chunks.return_value = 'b' * size
+
+        iterator = iter([str(v) for v in ['b' * size]])
+
+        self.assertFalse(hasattr(iterator, '__next__'))
+        self.assertTrue(hasattr(iterator, 'next'))
+
+        self.assertEqual(mock_read_in_chunks.call_count, 1)
+        self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
+        result = self.driver1._upload_object(object_name='test',
+                                             content_type=None,
+                                             upload_func=upload_func,
+                                             upload_func_kwargs={},
+                                             request_path='/',
+                                             stream=iterator)
+
+        hasher = hashlib.md5()
+        hasher.update('b' * size)
+        expected_hash = hasher.hexdigest()
+
+        self.assertEqual(result['data_hash'], expected_hash)
+        self.assertEqual(result['bytes_transferred'], size)
+
+        headers = self.driver1.connection.request.call_args[-1]['headers']
+        self.assertEqual(headers['Content-Type'], DEFAULT_CONTENT_TYPE)
+
+        self.assertEqual(mock_read_in_chunks.call_count, 2)
+        self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
 
 if __name__ == '__main__':
     sys.exit(unittest.main())

[2/4] libcloud git commit: Add changelog entry for #1135.

Posted by to...@apache.org.

Add changelog entry for #1135.


Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/f0532973
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/f0532973
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/f0532973

Branch: refs/heads/trunk
Commit: f05329730f78b865330fc7cf04b63fd48bec6a54
Parents: 1e955af
Author: Tomaz Muraus <to...@tomaz.me>
Authored: Mon Jan 8 11:21:35 2018 +0100
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 11:21:35 2018 +0100

----------------------------------------------------------------------
 CHANGES.rst | 5 +++++
 1 file changed, 5 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/libcloud/blob/f0532973/CHANGES.rst
----------------------------------------------------------------------
diff --git a/CHANGES.rst b/CHANGES.rst
index f402a56..b24bb8b 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -135,6 +135,11 @@ Storage
   (GITHUB-1132)
   [Quentin Pradet]
 
+- Fix a regression with hash computation performance and memory usage on object
+  upload inadvertently introduced in 2.0.0 and make it more efficient.
+  (GITHUB-1135)
+  [Quentin Pradet]
+
 Changes in Apache Libcloud 2.2.1
 --------------------------------

[1/4] libcloud git commit: storage: Fix hash computation performance on upload

[3/4] libcloud git commit: Add a test case for #1135.

[4/4] libcloud git commit: Update the code to use the same code path in case iterator object has next() method. next() and __next__() should be equivalent so we should follow the same code path in such scenario (no need to fall back to while loop at the

[2/4] libcloud git commit: Add changelog entry for #1135.

[4/4] libcloud git commit: Update the code to use the same code path in case iterator object has next() method. next() and next() should be equivalent so we should follow the same code path in such scenario (no need to fall back to while loop at the