You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@libcloud.apache.org by to...@apache.org on 2018/01/08 11:11:25 UTC
[1/4] libcloud git commit: storage: Fix hash computation performance
on upload
Repository: libcloud
Updated Branches:
refs/heads/trunk ad689715e -> 484e02a63
storage: Fix hash computation performance on upload
The storage base driver computes the hash of uploaded files: individual
drivers use it to ensure the reported hash is correct. Before libcloud
2.x, this was done efficiently: the file was only read once, and we were
using `hash.update()` to avoid keeping the whole file in memory. With
the switch to the requests module, both of these optimizations were
removed inadvertently. It turns out the important one is using
`hash.update()`: computing the hash on the whole file in memory is
orders of magnitude slower.
Closes #1135
Signed-off-by: Tomaz Muraus <to...@tomaz.me>
Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/1e955af3
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/1e955af3
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/1e955af3
Branch: refs/heads/trunk
Commit: 1e955af39bb799d32808faa7ebe37455b8ad7e9e
Parents: ad68971
Author: Quentin Pradet <qu...@clustree.com>
Authored: Wed Oct 18 11:15:22 2017 +0400
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 11:19:41 2018 +0100
----------------------------------------------------------------------
libcloud/storage/base.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/libcloud/blob/1e955af3/libcloud/storage/base.py
----------------------------------------------------------------------
diff --git a/libcloud/storage/base.py b/libcloud/storage/base.py
index b3de63d..617cfed 100644
--- a/libcloud/storage/base.py
+++ b/libcloud/storage/base.py
@@ -643,9 +643,9 @@ class StorageDriver(BaseDriver):
def _hash_buffered_stream(self, stream, hasher, blocksize=65536):
total_len = 0
if hasattr(stream, '__next__'):
- data = libcloud.utils.files.exhaust_iterator(iterator=stream)
- hasher.update(b(data))
- total_len = len(data)
+ for chunk in libcloud.utils.files.read_in_chunks(iterator=stream):
+ hasher.update(b(chunk))
+ total_len += len(chunk)
return (hasher.hexdigest(), total_len)
if not hasattr(stream, '__exit__'):
for s in stream:
[3/4] libcloud git commit: Add a test case for #1135.
Posted by to...@apache.org.
Add a test case for #1135.
Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/0710d23d
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/0710d23d
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/0710d23d
Branch: refs/heads/trunk
Commit: 0710d23de52a1c2c4841937e4c7c17b81950b9ae
Parents: f053297
Author: Tomaz Muraus <to...@tomaz.me>
Authored: Mon Jan 8 11:58:10 2018 +0100
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 12:00:17 2018 +0100
----------------------------------------------------------------------
libcloud/test/storage/test_base.py | 44 +++++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/libcloud/blob/0710d23d/libcloud/test/storage/test_base.py
----------------------------------------------------------------------
diff --git a/libcloud/test/storage/test_base.py b/libcloud/test/storage/test_base.py
index 2f3b048..dabc0f5 100644
--- a/libcloud/test/storage/test_base.py
+++ b/libcloud/test/storage/test_base.py
@@ -14,10 +14,12 @@
# limitations under the License.
import sys
+import hashlib
from libcloud.utils.py3 import httplib
from io import BytesIO
+import mock
from mock import Mock
from libcloud.utils.py3 import StringIO
@@ -28,6 +30,7 @@ from libcloud.storage.base import DEFAULT_CONTENT_TYPE
from libcloud.test import unittest
from libcloud.test import MockHttp
+from libcloud.test import BodyStream
class BaseMockRawResponse(MockHttp):
@@ -131,6 +134,47 @@ class BaseStorageTests(unittest.TestCase):
request_path='/',
stream=iterator)
+ @mock.patch('libcloud.utils.files.exhaust_iterator')
+ @mock.patch('libcloud.utils.files.read_in_chunks')
+ def test_upload_object_hash_calculation_is_efficient(self, mock_read_in_chunks,
+ mock_exhaust_iterator):
+ # Verify that we don't buffer whole file in memory when calculating
+ # object has when iterator has __next__ method, but instead read and calculate hash in chunks
+ size = 100
+
+ mock_read_in_chunks.return_value = 'a' * size
+
+ iterator = BodyStream('a' * size)
+
+ upload_func = Mock()
+ upload_func.return_value = True, '', size
+
+ # strict_mode is disabled, default content type should be used
+ self.driver1.connection = Mock()
+
+ self.assertEqual(mock_read_in_chunks.call_count, 0)
+ self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
+ result = self.driver1._upload_object(object_name='test',
+ content_type=None,
+ upload_func=upload_func,
+ upload_func_kwargs={},
+ request_path='/',
+ stream=iterator)
+
+ hasher = hashlib.md5()
+ hasher.update('a' * size)
+ expected_hash = hasher.hexdigest()
+
+ self.assertEqual(result['data_hash'], expected_hash)
+ self.assertEqual(result['bytes_transferred'], size)
+
+ headers = self.driver1.connection.request.call_args[-1]['headers']
+ self.assertEqual(headers['Content-Type'], DEFAULT_CONTENT_TYPE)
+
+ self.assertEqual(mock_read_in_chunks.call_count, 1)
+ self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
if __name__ == '__main__':
sys.exit(unittest.main())
[4/4] libcloud git commit: Update the code to use the same code path
in case iterator object has next() method. next() and __next__() should be
equivalent so we should follow the same code path in such scenario (no need
to fall back to while loop at the
Posted by to...@apache.org.
Update the code to use the same code path in case iterator object has
next() method. next() and __next__() should be equivalent so we should
follow the same code path in such scenario (no need to fall back to
while loop at the end of method).
Also add a test case for it.
Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/484e02a6
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/484e02a6
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/484e02a6
Branch: refs/heads/trunk
Commit: 484e02a6395f3f05989a90b88bdb4c9b583ab4d7
Parents: 0710d23
Author: Tomaz Muraus <to...@tomaz.me>
Authored: Mon Jan 8 12:04:09 2018 +0100
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 12:04:09 2018 +0100
----------------------------------------------------------------------
libcloud/storage/base.py | 7 +++++-
libcloud/test/storage/test_base.py | 39 ++++++++++++++++++++++++++++++---
2 files changed, 42 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/libcloud/blob/484e02a6/libcloud/storage/base.py
----------------------------------------------------------------------
diff --git a/libcloud/storage/base.py b/libcloud/storage/base.py
index 617cfed..052725b 100644
--- a/libcloud/storage/base.py
+++ b/libcloud/storage/base.py
@@ -642,22 +642,27 @@ class StorageDriver(BaseDriver):
def _hash_buffered_stream(self, stream, hasher, blocksize=65536):
total_len = 0
- if hasattr(stream, '__next__'):
+
+ if hasattr(stream, '__next__') or hasattr(stream, 'next'):
for chunk in libcloud.utils.files.read_in_chunks(iterator=stream):
hasher.update(b(chunk))
total_len += len(chunk)
+
return (hasher.hexdigest(), total_len)
+
if not hasattr(stream, '__exit__'):
for s in stream:
hasher.update(s)
total_len = total_len + len(s)
return (hasher.hexdigest(), total_len)
+
with stream:
buf = stream.read(blocksize)
while len(buf) > 0:
total_len = total_len + len(buf)
hasher.update(buf)
buf = stream.read(blocksize)
+
return (hasher.hexdigest(), total_len)
def _get_hash_function(self):
http://git-wip-us.apache.org/repos/asf/libcloud/blob/484e02a6/libcloud/test/storage/test_base.py
----------------------------------------------------------------------
diff --git a/libcloud/test/storage/test_base.py b/libcloud/test/storage/test_base.py
index dabc0f5..87a6954 100644
--- a/libcloud/test/storage/test_base.py
+++ b/libcloud/test/storage/test_base.py
@@ -142,16 +142,18 @@ class BaseStorageTests(unittest.TestCase):
# object has when iterator has __next__ method, but instead read and calculate hash in chunks
size = 100
+ self.driver1.connection = Mock()
+
+ # stream has __next__ method and next() method
mock_read_in_chunks.return_value = 'a' * size
iterator = BodyStream('a' * size)
+ self.assertTrue(hasattr(iterator, '__next__'))
+ self.assertTrue(hasattr(iterator, 'next'))
upload_func = Mock()
upload_func.return_value = True, '', size
- # strict_mode is disabled, default content type should be used
- self.driver1.connection = Mock()
-
self.assertEqual(mock_read_in_chunks.call_count, 0)
self.assertEqual(mock_exhaust_iterator.call_count, 0)
@@ -175,6 +177,37 @@ class BaseStorageTests(unittest.TestCase):
self.assertEqual(mock_read_in_chunks.call_count, 1)
self.assertEqual(mock_exhaust_iterator.call_count, 0)
+ # stream has only has next() method
+ mock_read_in_chunks.return_value = 'b' * size
+
+ iterator = iter([str(v) for v in ['b' * size]])
+
+ self.assertFalse(hasattr(iterator, '__next__'))
+ self.assertTrue(hasattr(iterator, 'next'))
+
+ self.assertEqual(mock_read_in_chunks.call_count, 1)
+ self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
+ result = self.driver1._upload_object(object_name='test',
+ content_type=None,
+ upload_func=upload_func,
+ upload_func_kwargs={},
+ request_path='/',
+ stream=iterator)
+
+ hasher = hashlib.md5()
+ hasher.update('b' * size)
+ expected_hash = hasher.hexdigest()
+
+ self.assertEqual(result['data_hash'], expected_hash)
+ self.assertEqual(result['bytes_transferred'], size)
+
+ headers = self.driver1.connection.request.call_args[-1]['headers']
+ self.assertEqual(headers['Content-Type'], DEFAULT_CONTENT_TYPE)
+
+ self.assertEqual(mock_read_in_chunks.call_count, 2)
+ self.assertEqual(mock_exhaust_iterator.call_count, 0)
+
if __name__ == '__main__':
sys.exit(unittest.main())
[2/4] libcloud git commit: Add changelog entry for #1135.
Posted by to...@apache.org.
Add changelog entry for #1135.
Project: http://git-wip-us.apache.org/repos/asf/libcloud/repo
Commit: http://git-wip-us.apache.org/repos/asf/libcloud/commit/f0532973
Tree: http://git-wip-us.apache.org/repos/asf/libcloud/tree/f0532973
Diff: http://git-wip-us.apache.org/repos/asf/libcloud/diff/f0532973
Branch: refs/heads/trunk
Commit: f05329730f78b865330fc7cf04b63fd48bec6a54
Parents: 1e955af
Author: Tomaz Muraus <to...@tomaz.me>
Authored: Mon Jan 8 11:21:35 2018 +0100
Committer: Tomaz Muraus <to...@tomaz.me>
Committed: Mon Jan 8 11:21:35 2018 +0100
----------------------------------------------------------------------
CHANGES.rst | 5 +++++
1 file changed, 5 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/libcloud/blob/f0532973/CHANGES.rst
----------------------------------------------------------------------
diff --git a/CHANGES.rst b/CHANGES.rst
index f402a56..b24bb8b 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -135,6 +135,11 @@ Storage
(GITHUB-1132)
[Quentin Pradet]
+- Fix a regression with hash computation performance and memory usage on object
+ upload inadvertently introduced in 2.0.0 and make it more efficient.
+ (GITHUB-1135)
+ [Quentin Pradet]
+
Changes in Apache Libcloud 2.2.1
--------------------------------