|
@@ -1,741 +1,741 @@
|
|
|
-# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
|
-#
|
|
|
-# Licensed under the Apache License, Version 2.0 (the "License"). You
|
|
|
-# may not use this file except in compliance with the License. A copy of
|
|
|
-# the License is located at
|
|
|
-#
|
|
|
-# http://aws.amazon.com/apache2.0/
|
|
|
-#
|
|
|
-# or in the "license" file accompanying this file. This file is
|
|
|
-# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
|
|
|
-# ANY KIND, either express or implied. See the License for the specific
|
|
|
-# language governing permissions and limitations under the License.
|
|
|
-"""Abstractions over S3's upload/download operations.
|
|
|
-
|
|
|
-This module provides high level abstractions for efficient
|
|
|
-uploads/downloads. It handles several things for the user:
|
|
|
-
|
|
|
-* Automatically switching to multipart transfers when
|
|
|
- a file is over a specific size threshold
|
|
|
-* Uploading/downloading a file in parallel
|
|
|
-* Throttling based on max bandwidth
|
|
|
-* Progress callbacks to monitor transfers
|
|
|
-* Retries. While botocore handles retries for streaming uploads,
|
|
|
- it is not possible for it to handle retries for streaming
|
|
|
- downloads. This module handles retries for both cases so
|
|
|
- you don't need to implement any retry logic yourself.
|
|
|
-
|
|
|
-This module has a reasonable set of defaults. It also allows you
|
|
|
-to configure many aspects of the transfer process including:
|
|
|
-
|
|
|
-* Multipart threshold size
|
|
|
-* Max parallel downloads
|
|
|
-* Max bandwidth
|
|
|
-* Socket timeouts
|
|
|
-* Retry amounts
|
|
|
-
|
|
|
-There is no support for s3->s3 multipart copies at this
|
|
|
-time.
|
|
|
-
|
|
|
-
|
|
|
-.. _ref_s3transfer_usage:
|
|
|
-
|
|
|
-Usage
|
|
|
-=====
|
|
|
-
|
|
|
-The simplest way to use this module is:
|
|
|
-
|
|
|
-.. code-block:: python
|
|
|
-
|
|
|
- client = boto3.client('s3', 'us-west-2')
|
|
|
- transfer = S3Transfer(client)
|
|
|
- # Upload /tmp/myfile to s3://bucket/key
|
|
|
- transfer.upload_file('/tmp/myfile', 'bucket', 'key')
|
|
|
-
|
|
|
- # Download s3://bucket/key to /tmp/myfile
|
|
|
- transfer.download_file('bucket', 'key', '/tmp/myfile')
|
|
|
-
|
|
|
-The ``upload_file`` and ``download_file`` methods also accept an optional
|
|
|
-``extra_args`` dictionary, whose entries are forwarded to the corresponding
|
|
|
-client operation. Here are a few examples using ``upload_file``::
|
|
|
-
|
|
|
- # Making the object public
|
|
|
- transfer.upload_file('/tmp/myfile', 'bucket', 'key',
|
|
|
- extra_args={'ACL': 'public-read'})
|
|
|
-
|
|
|
- # Setting metadata
|
|
|
- transfer.upload_file('/tmp/myfile', 'bucket', 'key',
|
|
|
- extra_args={'Metadata': {'a': 'b', 'c': 'd'}})
|
|
|
-
|
|
|
- # Setting content type
|
|
|
- transfer.upload_file('/tmp/myfile.json', 'bucket', 'key',
|
|
|
- extra_args={'ContentType': "application/json"})
|
|
|
-
|
|
|
-
|
|
|
-The ``S3Transfer`` class also supports progress callbacks so you can
|
|
|
-provide transfer progress to users. Both the ``upload_file`` and
|
|
|
-``download_file`` methods take an optional ``callback`` parameter.
|
|
|
-Here's an example of how to print a simple progress percentage
|
|
|
-to the user:
|
|
|
-
|
|
|
-.. code-block:: python
|
|
|
-
|
|
|
- class ProgressPercentage(object):
|
|
|
- def __init__(self, filename):
|
|
|
- self._filename = filename
|
|
|
- self._size = float(os.path.getsize(filename))
|
|
|
- self._seen_so_far = 0
|
|
|
- self._lock = threading.Lock()
|
|
|
-
|
|
|
- def __call__(self, bytes_amount):
|
|
|
- # To simplify we'll assume this is hooked up
|
|
|
- # to a single filename.
|
|
|
- with self._lock:
|
|
|
- self._seen_so_far += bytes_amount
|
|
|
- percentage = (self._seen_so_far / self._size) * 100
|
|
|
- sys.stdout.write(
|
|
|
- "\r%s %s / %s (%.2f%%)" % (self._filename, self._seen_so_far,
|
|
|
- self._size, percentage))
|
|
|
- sys.stdout.flush()
|
|
|
-
|
|
|
-
|
|
|
- transfer = S3Transfer(boto3.client('s3', 'us-west-2'))
|
|
|
- # Upload /tmp/myfile to s3://bucket/key and print upload progress.
|
|
|
- transfer.upload_file('/tmp/myfile', 'bucket', 'key',
|
|
|
- callback=ProgressPercentage('/tmp/myfile'))
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-You can also provide a TransferConfig object to the S3Transfer
|
|
|
-object that gives you more fine-grained control over the
|
|
|
-transfer. For example:
|
|
|
-
|
|
|
-.. code-block:: python
|
|
|
-
|
|
|
- client = boto3.client('s3', 'us-west-2')
|
|
|
- config = TransferConfig(
|
|
|
- multipart_threshold=8 * 1024 * 1024,
|
|
|
- max_concurrency=10,
|
|
|
- num_download_attempts=10,
|
|
|
- )
|
|
|
- transfer = S3Transfer(client, config)
|
|
|
- transfer.upload_file('/tmp/foo', 'bucket', 'key')
|
|
|
-
|
|
|
-
|
|
|
-"""
|
|
|
-import os
|
|
|
-import math
|
|
|
-import functools
|
|
|
-import logging
|
|
|
-import socket
|
|
|
-import threading
|
|
|
-import random
|
|
|
-import string
|
|
|
-import concurrent.futures
|
|
|
-
|
|
|
-from botocore.compat import six
|
|
|
+# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
|
+#
|
|
|
+# Licensed under the Apache License, Version 2.0 (the "License"). You
|
|
|
+# may not use this file except in compliance with the License. A copy of
|
|
|
+# the License is located at
|
|
|
+#
|
|
|
+# http://aws.amazon.com/apache2.0/
|
|
|
+#
|
|
|
+# or in the "license" file accompanying this file. This file is
|
|
|
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
|
|
|
+# ANY KIND, either express or implied. See the License for the specific
|
|
|
+# language governing permissions and limitations under the License.
|
|
|
+"""Abstractions over S3's upload/download operations.
|
|
|
+
|
|
|
+This module provides high level abstractions for efficient
|
|
|
+uploads/downloads. It handles several things for the user:
|
|
|
+
|
|
|
+* Automatically switching to multipart transfers when
|
|
|
+ a file is over a specific size threshold
|
|
|
+* Uploading/downloading a file in parallel
|
|
|
+* Throttling based on max bandwidth
|
|
|
+* Progress callbacks to monitor transfers
|
|
|
+* Retries. While botocore handles retries for streaming uploads,
|
|
|
+ it is not possible for it to handle retries for streaming
|
|
|
+ downloads. This module handles retries for both cases so
|
|
|
+ you don't need to implement any retry logic yourself.
|
|
|
+
|
|
|
+This module has a reasonable set of defaults. It also allows you
|
|
|
+to configure many aspects of the transfer process including:
|
|
|
+
|
|
|
+* Multipart threshold size
|
|
|
+* Max parallel downloads
|
|
|
+* Max bandwidth
|
|
|
+* Socket timeouts
|
|
|
+* Retry amounts
|
|
|
+
|
|
|
+There is no support for s3->s3 multipart copies at this
|
|
|
+time.
|
|
|
+
|
|
|
+
|
|
|
+.. _ref_s3transfer_usage:
|
|
|
+
|
|
|
+Usage
|
|
|
+=====
|
|
|
+
|
|
|
+The simplest way to use this module is:
|
|
|
+
|
|
|
+.. code-block:: python
|
|
|
+
|
|
|
+ client = boto3.client('s3', 'us-west-2')
|
|
|
+ transfer = S3Transfer(client)
|
|
|
+ # Upload /tmp/myfile to s3://bucket/key
|
|
|
+ transfer.upload_file('/tmp/myfile', 'bucket', 'key')
|
|
|
+
|
|
|
+ # Download s3://bucket/key to /tmp/myfile
|
|
|
+ transfer.download_file('bucket', 'key', '/tmp/myfile')
|
|
|
+
|
|
|
+The ``upload_file`` and ``download_file`` methods also accept an optional
|
|
|
+``extra_args`` dictionary, whose entries are forwarded to the corresponding
|
|
|
+client operation. Here are a few examples using ``upload_file``::
|
|
|
+
|
|
|
+ # Making the object public
|
|
|
+ transfer.upload_file('/tmp/myfile', 'bucket', 'key',
|
|
|
+ extra_args={'ACL': 'public-read'})
|
|
|
+
|
|
|
+ # Setting metadata
|
|
|
+ transfer.upload_file('/tmp/myfile', 'bucket', 'key',
|
|
|
+ extra_args={'Metadata': {'a': 'b', 'c': 'd'}})
|
|
|
+
|
|
|
+ # Setting content type
|
|
|
+ transfer.upload_file('/tmp/myfile.json', 'bucket', 'key',
|
|
|
+ extra_args={'ContentType': "application/json"})
|
|
|
+
|
|
|
+
|
|
|
+The ``S3Transfer`` class also supports progress callbacks so you can
|
|
|
+provide transfer progress to users. Both the ``upload_file`` and
|
|
|
+``download_file`` methods take an optional ``callback`` parameter.
|
|
|
+Here's an example of how to print a simple progress percentage
|
|
|
+to the user:
|
|
|
+
|
|
|
+.. code-block:: python
|
|
|
+
|
|
|
+ class ProgressPercentage(object):
|
|
|
+ def __init__(self, filename):
|
|
|
+ self._filename = filename
|
|
|
+ self._size = float(os.path.getsize(filename))
|
|
|
+ self._seen_so_far = 0
|
|
|
+ self._lock = threading.Lock()
|
|
|
+
|
|
|
+ def __call__(self, bytes_amount):
|
|
|
+ # To simplify we'll assume this is hooked up
|
|
|
+ # to a single filename.
|
|
|
+ with self._lock:
|
|
|
+ self._seen_so_far += bytes_amount
|
|
|
+ percentage = (self._seen_so_far / self._size) * 100
|
|
|
+ sys.stdout.write(
|
|
|
+ "\r%s %s / %s (%.2f%%)" % (self._filename, self._seen_so_far,
|
|
|
+ self._size, percentage))
|
|
|
+ sys.stdout.flush()
|
|
|
+
|
|
|
+
|
|
|
+ transfer = S3Transfer(boto3.client('s3', 'us-west-2'))
|
|
|
+ # Upload /tmp/myfile to s3://bucket/key and print upload progress.
|
|
|
+ transfer.upload_file('/tmp/myfile', 'bucket', 'key',
|
|
|
+ callback=ProgressPercentage('/tmp/myfile'))
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+You can also provide a TransferConfig object to the S3Transfer
|
|
|
+object that gives you more fine-grained control over the
|
|
|
+transfer. For example:
|
|
|
+
|
|
|
+.. code-block:: python
|
|
|
+
|
|
|
+ client = boto3.client('s3', 'us-west-2')
|
|
|
+ config = TransferConfig(
|
|
|
+ multipart_threshold=8 * 1024 * 1024,
|
|
|
+ max_concurrency=10,
|
|
|
+ num_download_attempts=10,
|
|
|
+ )
|
|
|
+ transfer = S3Transfer(client, config)
|
|
|
+ transfer.upload_file('/tmp/foo', 'bucket', 'key')
|
|
|
+
|
|
|
+
|
|
|
+"""
|
|
|
+import os
|
|
|
+import math
|
|
|
+import functools
|
|
|
+import logging
|
|
|
+import socket
|
|
|
+import threading
|
|
|
+import random
|
|
|
+import string
|
|
|
+import concurrent.futures
|
|
|
+
|
|
|
+from botocore.compat import six
|
|
|
from botocore.vendored.requests.packages.urllib3.exceptions import \
|
|
|
- ReadTimeoutError
|
|
|
-from botocore.exceptions import IncompleteReadError
|
|
|
-
|
|
|
-import s3transfer.compat
|
|
|
-from s3transfer.exceptions import RetriesExceededError, S3UploadFailedError
|
|
|
-
|
|
|
-
|
|
|
-__author__ = 'Amazon Web Services'
|
|
|
+ ReadTimeoutError
|
|
|
+from botocore.exceptions import IncompleteReadError
|
|
|
+
|
|
|
+import s3transfer.compat
|
|
|
+from s3transfer.exceptions import RetriesExceededError, S3UploadFailedError
|
|
|
+
|
|
|
+
|
|
|
+__author__ = 'Amazon Web Services'
|
|
|
__version__ = '0.4.2'
|
|
|
-
|
|
|
-
|
|
|
-class NullHandler(logging.Handler):
|
|
|
- def emit(self, record):
|
|
|
- pass
|
|
|
-
|
|
|
-
|
|
|
-logger = logging.getLogger(__name__)
|
|
|
-logger.addHandler(NullHandler())
|
|
|
-
|
|
|
-queue = six.moves.queue
|
|
|
-
|
|
|
-MB = 1024 * 1024
|
|
|
-SHUTDOWN_SENTINEL = object()
|
|
|
-
|
|
|
-
|
|
|
-def random_file_extension(num_digits=8):
|
|
|
- return ''.join(random.choice(string.hexdigits) for _ in range(num_digits))
|
|
|
-
|
|
|
-
|
|
|
-def disable_upload_callbacks(request, operation_name, **kwargs):
|
|
|
- if operation_name in ['PutObject', 'UploadPart'] and \
|
|
|
- hasattr(request.body, 'disable_callback'):
|
|
|
- request.body.disable_callback()
|
|
|
-
|
|
|
-
|
|
|
-def enable_upload_callbacks(request, operation_name, **kwargs):
|
|
|
- if operation_name in ['PutObject', 'UploadPart'] and \
|
|
|
- hasattr(request.body, 'enable_callback'):
|
|
|
- request.body.enable_callback()
|
|
|
-
|
|
|
-
|
|
|
-class QueueShutdownError(Exception):
|
|
|
- pass
|
|
|
-
|
|
|
-
|
|
|
-class ReadFileChunk(object):
|
|
|
- def __init__(self, fileobj, start_byte, chunk_size, full_file_size,
|
|
|
- callback=None, enable_callback=True):
|
|
|
- """
|
|
|
-
|
|
|
- Given a file object shown below:
|
|
|
-
|
|
|
- |___________________________________________________|
|
|
|
- 0 | | full_file_size
|
|
|
- |----chunk_size---|
|
|
|
- start_byte
|
|
|
-
|
|
|
- :type fileobj: file
|
|
|
- :param fileobj: File like object
|
|
|
-
|
|
|
- :type start_byte: int
|
|
|
- :param start_byte: The first byte from which to start reading.
|
|
|
-
|
|
|
- :type chunk_size: int
|
|
|
- :param chunk_size: The max chunk size to read. Trying to read
|
|
|
- past the end of the chunk will behave as if you've
|
|
|
- reached the end of the file.
|
|
|
-
|
|
|
- :type full_file_size: int
|
|
|
- :param full_file_size: The entire content length associated
|
|
|
- with ``fileobj``.
|
|
|
-
|
|
|
- :type callback: function(amount_read)
|
|
|
- :param callback: Called whenever data is read from this object.
|
|
|
-
|
|
|
- """
|
|
|
- self._fileobj = fileobj
|
|
|
- self._start_byte = start_byte
|
|
|
- self._size = self._calculate_file_size(
|
|
|
- self._fileobj, requested_size=chunk_size,
|
|
|
- start_byte=start_byte, actual_file_size=full_file_size)
|
|
|
- self._fileobj.seek(self._start_byte)
|
|
|
- self._amount_read = 0
|
|
|
- self._callback = callback
|
|
|
- self._callback_enabled = enable_callback
|
|
|
-
|
|
|
- @classmethod
|
|
|
- def from_filename(cls, filename, start_byte, chunk_size, callback=None,
|
|
|
- enable_callback=True):
|
|
|
- """Convenience factory function to create from a filename.
|
|
|
-
|
|
|
- :type start_byte: int
|
|
|
- :param start_byte: The first byte from which to start reading.
|
|
|
-
|
|
|
- :type chunk_size: int
|
|
|
- :param chunk_size: The max chunk size to read. Trying to read
|
|
|
- past the end of the chunk will behave as if you've
|
|
|
- reached the end of the file.
|
|
|
-
|
|
|
- :type full_file_size: int
|
|
|
- :param full_file_size: The entire content length associated
|
|
|
- with ``fileobj``.
|
|
|
-
|
|
|
- :type callback: function(amount_read)
|
|
|
- :param callback: Called whenever data is read from this object.
|
|
|
-
|
|
|
- :type enable_callback: bool
|
|
|
- :param enable_callback: Indicate whether to invoke callback
|
|
|
- during read() calls.
|
|
|
-
|
|
|
- :rtype: ``ReadFileChunk``
|
|
|
- :return: A new instance of ``ReadFileChunk``
|
|
|
-
|
|
|
- """
|
|
|
- f = open(filename, 'rb')
|
|
|
- file_size = os.fstat(f.fileno()).st_size
|
|
|
- return cls(f, start_byte, chunk_size, file_size, callback,
|
|
|
- enable_callback)
|
|
|
-
|
|
|
- def _calculate_file_size(self, fileobj, requested_size, start_byte,
|
|
|
- actual_file_size):
|
|
|
- max_chunk_size = actual_file_size - start_byte
|
|
|
- return min(max_chunk_size, requested_size)
|
|
|
-
|
|
|
- def read(self, amount=None):
|
|
|
- if amount is None:
|
|
|
- amount_to_read = self._size - self._amount_read
|
|
|
- else:
|
|
|
- amount_to_read = min(self._size - self._amount_read, amount)
|
|
|
- data = self._fileobj.read(amount_to_read)
|
|
|
- self._amount_read += len(data)
|
|
|
- if self._callback is not None and self._callback_enabled:
|
|
|
- self._callback(len(data))
|
|
|
- return data
|
|
|
-
|
|
|
- def enable_callback(self):
|
|
|
- self._callback_enabled = True
|
|
|
-
|
|
|
- def disable_callback(self):
|
|
|
- self._callback_enabled = False
|
|
|
-
|
|
|
- def seek(self, where):
|
|
|
- self._fileobj.seek(self._start_byte + where)
|
|
|
- if self._callback is not None and self._callback_enabled:
|
|
|
- # To also rewind the callback() for an accurate progress report
|
|
|
- self._callback(where - self._amount_read)
|
|
|
- self._amount_read = where
|
|
|
-
|
|
|
- def close(self):
|
|
|
- self._fileobj.close()
|
|
|
-
|
|
|
- def tell(self):
|
|
|
- return self._amount_read
|
|
|
-
|
|
|
- def __len__(self):
|
|
|
- # __len__ is defined because requests will try to determine the length
|
|
|
- # of the stream to set a content length. In the normal case
|
|
|
- # of the file it will just stat the file, but we need to change that
|
|
|
- # behavior. By providing a __len__, requests will use that instead
|
|
|
- # of stat'ing the file.
|
|
|
- return self._size
|
|
|
-
|
|
|
- def __enter__(self):
|
|
|
- return self
|
|
|
-
|
|
|
- def __exit__(self, *args, **kwargs):
|
|
|
- self.close()
|
|
|
-
|
|
|
- def __iter__(self):
|
|
|
- # This is a workaround for http://bugs.python.org/issue17575
|
|
|
- # Basically httplib will try to iterate over the contents, even
|
|
|
- # if it's a file-like object. This wasn't noticed because we've
|
|
|
- # already exhausted the stream so iterating over the file immediately
|
|
|
- # stops, which is what we're simulating here.
|
|
|
- return iter([])
|
|
|
-
|
|
|
-
|
|
|
-class StreamReaderProgress(object):
|
|
|
- """Wrapper for a read only stream that adds progress callbacks."""
|
|
|
- def __init__(self, stream, callback=None):
|
|
|
- self._stream = stream
|
|
|
- self._callback = callback
|
|
|
-
|
|
|
- def read(self, *args, **kwargs):
|
|
|
- value = self._stream.read(*args, **kwargs)
|
|
|
- if self._callback is not None:
|
|
|
- self._callback(len(value))
|
|
|
- return value
|
|
|
-
|
|
|
-
|
|
|
-class OSUtils(object):
|
|
|
- def get_file_size(self, filename):
|
|
|
- return os.path.getsize(filename)
|
|
|
-
|
|
|
- def open_file_chunk_reader(self, filename, start_byte, size, callback):
|
|
|
- return ReadFileChunk.from_filename(filename, start_byte,
|
|
|
- size, callback,
|
|
|
- enable_callback=False)
|
|
|
-
|
|
|
- def open(self, filename, mode):
|
|
|
- return open(filename, mode)
|
|
|
-
|
|
|
- def remove_file(self, filename):
|
|
|
- """Remove a file, noop if file does not exist."""
|
|
|
- # Unlike os.remove, if the file does not exist,
|
|
|
- # then this method does nothing.
|
|
|
- try:
|
|
|
- os.remove(filename)
|
|
|
- except OSError:
|
|
|
- pass
|
|
|
-
|
|
|
- def rename_file(self, current_filename, new_filename):
|
|
|
- s3transfer.compat.rename_file(current_filename, new_filename)
|
|
|
-
|
|
|
-
|
|
|
-class MultipartUploader(object):
|
|
|
- # These are the extra_args that need to be forwarded onto
|
|
|
- # subsequent upload_parts.
|
|
|
- UPLOAD_PART_ARGS = [
|
|
|
- 'SSECustomerKey',
|
|
|
- 'SSECustomerAlgorithm',
|
|
|
- 'SSECustomerKeyMD5',
|
|
|
- 'RequestPayer',
|
|
|
- ]
|
|
|
-
|
|
|
- def __init__(self, client, config, osutil,
|
|
|
- executor_cls=concurrent.futures.ThreadPoolExecutor):
|
|
|
- self._client = client
|
|
|
- self._config = config
|
|
|
- self._os = osutil
|
|
|
- self._executor_cls = executor_cls
|
|
|
-
|
|
|
- def _extra_upload_part_args(self, extra_args):
|
|
|
- # Only the args in UPLOAD_PART_ARGS actually need to be passed
|
|
|
- # onto the upload_part calls.
|
|
|
- upload_parts_args = {}
|
|
|
- for key, value in extra_args.items():
|
|
|
- if key in self.UPLOAD_PART_ARGS:
|
|
|
- upload_parts_args[key] = value
|
|
|
- return upload_parts_args
|
|
|
-
|
|
|
- def upload_file(self, filename, bucket, key, callback, extra_args):
|
|
|
- response = self._client.create_multipart_upload(Bucket=bucket,
|
|
|
- Key=key, **extra_args)
|
|
|
- upload_id = response['UploadId']
|
|
|
- try:
|
|
|
- parts = self._upload_parts(upload_id, filename, bucket, key,
|
|
|
- callback, extra_args)
|
|
|
- except Exception as e:
|
|
|
- logger.debug("Exception raised while uploading parts, "
|
|
|
- "aborting multipart upload.", exc_info=True)
|
|
|
- self._client.abort_multipart_upload(
|
|
|
- Bucket=bucket, Key=key, UploadId=upload_id)
|
|
|
- raise S3UploadFailedError(
|
|
|
- "Failed to upload %s to %s: %s" % (
|
|
|
- filename, '/'.join([bucket, key]), e))
|
|
|
- self._client.complete_multipart_upload(
|
|
|
- Bucket=bucket, Key=key, UploadId=upload_id,
|
|
|
- MultipartUpload={'Parts': parts})
|
|
|
-
|
|
|
- def _upload_parts(self, upload_id, filename, bucket, key, callback,
|
|
|
- extra_args):
|
|
|
- upload_parts_extra_args = self._extra_upload_part_args(extra_args)
|
|
|
- parts = []
|
|
|
- part_size = self._config.multipart_chunksize
|
|
|
- num_parts = int(
|
|
|
- math.ceil(self._os.get_file_size(filename) / float(part_size)))
|
|
|
- max_workers = self._config.max_concurrency
|
|
|
- with self._executor_cls(max_workers=max_workers) as executor:
|
|
|
- upload_partial = functools.partial(
|
|
|
- self._upload_one_part, filename, bucket, key, upload_id,
|
|
|
- part_size, upload_parts_extra_args, callback)
|
|
|
- for part in executor.map(upload_partial, range(1, num_parts + 1)):
|
|
|
- parts.append(part)
|
|
|
- return parts
|
|
|
-
|
|
|
- def _upload_one_part(self, filename, bucket, key,
|
|
|
- upload_id, part_size, extra_args,
|
|
|
- callback, part_number):
|
|
|
- open_chunk_reader = self._os.open_file_chunk_reader
|
|
|
- with open_chunk_reader(filename, part_size * (part_number - 1),
|
|
|
- part_size, callback) as body:
|
|
|
- response = self._client.upload_part(
|
|
|
- Bucket=bucket, Key=key,
|
|
|
- UploadId=upload_id, PartNumber=part_number, Body=body,
|
|
|
- **extra_args)
|
|
|
- etag = response['ETag']
|
|
|
- return {'ETag': etag, 'PartNumber': part_number}
|
|
|
-
|
|
|
-
|
|
|
-class ShutdownQueue(queue.Queue):
|
|
|
- """A queue implementation that can be shutdown.
|
|
|
-
|
|
|
- Shutting down a queue means that this class adds a
|
|
|
- trigger_shutdown method that will trigger all subsequent
|
|
|
- calls to put() to fail with a ``QueueShutdownError``.
|
|
|
-
|
|
|
- It purposefully deviates from queue.Queue, and is *not* meant
|
|
|
- to be a drop-in replacement for ``queue.Queue``.
|
|
|
-
|
|
|
- """
|
|
|
- def _init(self, maxsize):
|
|
|
- self._shutdown = False
|
|
|
- self._shutdown_lock = threading.Lock()
|
|
|
- # queue.Queue is an old style class so we don't use super().
|
|
|
- return queue.Queue._init(self, maxsize)
|
|
|
-
|
|
|
- def trigger_shutdown(self):
|
|
|
- with self._shutdown_lock:
|
|
|
- self._shutdown = True
|
|
|
- logger.debug("The IO queue is now shutdown.")
|
|
|
-
|
|
|
- def put(self, item):
|
|
|
- # Note: this is not sufficient, it's still possible to deadlock!
|
|
|
- # Need to hook into the condition vars used by this class.
|
|
|
- with self._shutdown_lock:
|
|
|
- if self._shutdown:
|
|
|
- raise QueueShutdownError("Cannot put item to queue when "
|
|
|
- "queue has been shutdown.")
|
|
|
- return queue.Queue.put(self, item)
|
|
|
-
|
|
|
-
|
|
|
-class MultipartDownloader(object):
|
|
|
- def __init__(self, client, config, osutil,
|
|
|
- executor_cls=concurrent.futures.ThreadPoolExecutor):
|
|
|
- self._client = client
|
|
|
- self._config = config
|
|
|
- self._os = osutil
|
|
|
- self._executor_cls = executor_cls
|
|
|
- self._ioqueue = ShutdownQueue(self._config.max_io_queue)
|
|
|
-
|
|
|
- def download_file(self, bucket, key, filename, object_size,
|
|
|
- extra_args, callback=None):
|
|
|
- with self._executor_cls(max_workers=2) as controller:
|
|
|
- # 1 thread for the future that manages the downloading of parts
|
|
|
- # 1 thread for the future that manages IO writes.
|
|
|
- download_parts_handler = functools.partial(
|
|
|
- self._download_file_as_future,
|
|
|
- bucket, key, filename, object_size, callback)
|
|
|
- parts_future = controller.submit(download_parts_handler)
|
|
|
-
|
|
|
- io_writes_handler = functools.partial(
|
|
|
- self._perform_io_writes, filename)
|
|
|
- io_future = controller.submit(io_writes_handler)
|
|
|
- results = concurrent.futures.wait(
|
|
|
- [parts_future, io_future],
|
|
|
- return_when=concurrent.futures.FIRST_EXCEPTION)
|
|
|
- self._process_future_results(results)
|
|
|
-
|
|
|
- def _process_future_results(self, futures):
|
|
|
- finished, unfinished = futures
|
|
|
- for future in finished:
|
|
|
- future.result()
|
|
|
-
|
|
|
- def _download_file_as_future(self, bucket, key, filename, object_size,
|
|
|
- callback):
|
|
|
- part_size = self._config.multipart_chunksize
|
|
|
- num_parts = int(math.ceil(object_size / float(part_size)))
|
|
|
- max_workers = self._config.max_concurrency
|
|
|
- download_partial = functools.partial(
|
|
|
- self._download_range, bucket, key, filename,
|
|
|
- part_size, num_parts, callback)
|
|
|
- try:
|
|
|
- with self._executor_cls(max_workers=max_workers) as executor:
|
|
|
- list(executor.map(download_partial, range(num_parts)))
|
|
|
- finally:
|
|
|
- self._ioqueue.put(SHUTDOWN_SENTINEL)
|
|
|
-
|
|
|
- def _calculate_range_param(self, part_size, part_index, num_parts):
|
|
|
- start_range = part_index * part_size
|
|
|
- if part_index == num_parts - 1:
|
|
|
- end_range = ''
|
|
|
- else:
|
|
|
- end_range = start_range + part_size - 1
|
|
|
- range_param = 'bytes=%s-%s' % (start_range, end_range)
|
|
|
- return range_param
|
|
|
-
|
|
|
- def _download_range(self, bucket, key, filename,
|
|
|
- part_size, num_parts, callback, part_index):
|
|
|
- try:
|
|
|
- range_param = self._calculate_range_param(
|
|
|
- part_size, part_index, num_parts)
|
|
|
-
|
|
|
- max_attempts = self._config.num_download_attempts
|
|
|
- last_exception = None
|
|
|
- for i in range(max_attempts):
|
|
|
- try:
|
|
|
- logger.debug("Making get_object call.")
|
|
|
- response = self._client.get_object(
|
|
|
- Bucket=bucket, Key=key, Range=range_param)
|
|
|
- streaming_body = StreamReaderProgress(
|
|
|
- response['Body'], callback)
|
|
|
- buffer_size = 1024 * 16
|
|
|
- current_index = part_size * part_index
|
|
|
- for chunk in iter(lambda: streaming_body.read(buffer_size),
|
|
|
- b''):
|
|
|
- self._ioqueue.put((current_index, chunk))
|
|
|
- current_index += len(chunk)
|
|
|
- return
|
|
|
- except (socket.timeout, socket.error,
|
|
|
- ReadTimeoutError, IncompleteReadError) as e:
|
|
|
- logger.debug("Retrying exception caught (%s), "
|
|
|
- "retrying request, (attempt %s / %s)", e, i,
|
|
|
- max_attempts, exc_info=True)
|
|
|
- last_exception = e
|
|
|
- continue
|
|
|
- raise RetriesExceededError(last_exception)
|
|
|
- finally:
|
|
|
- logger.debug("EXITING _download_range for part: %s", part_index)
|
|
|
-
|
|
|
- def _perform_io_writes(self, filename):
|
|
|
- with self._os.open(filename, 'wb') as f:
|
|
|
- while True:
|
|
|
- task = self._ioqueue.get()
|
|
|
- if task is SHUTDOWN_SENTINEL:
|
|
|
- logger.debug("Shutdown sentinel received in IO handler, "
|
|
|
- "shutting down IO handler.")
|
|
|
- return
|
|
|
- else:
|
|
|
- try:
|
|
|
- offset, data = task
|
|
|
- f.seek(offset)
|
|
|
- f.write(data)
|
|
|
- except Exception as e:
|
|
|
- logger.debug("Caught exception in IO thread: %s",
|
|
|
- e, exc_info=True)
|
|
|
- self._ioqueue.trigger_shutdown()
|
|
|
- raise
|
|
|
-
|
|
|
-
|
|
|
-class TransferConfig(object):
|
|
|
- def __init__(self,
|
|
|
- multipart_threshold=8 * MB,
|
|
|
- max_concurrency=10,
|
|
|
- multipart_chunksize=8 * MB,
|
|
|
- num_download_attempts=5,
|
|
|
- max_io_queue=100):
|
|
|
- self.multipart_threshold = multipart_threshold
|
|
|
- self.max_concurrency = max_concurrency
|
|
|
- self.multipart_chunksize = multipart_chunksize
|
|
|
- self.num_download_attempts = num_download_attempts
|
|
|
- self.max_io_queue = max_io_queue
|
|
|
-
|
|
|
-
|
|
|
-class S3Transfer(object):
|
|
|
-
|
|
|
- ALLOWED_DOWNLOAD_ARGS = [
|
|
|
- 'VersionId',
|
|
|
- 'SSECustomerAlgorithm',
|
|
|
- 'SSECustomerKey',
|
|
|
- 'SSECustomerKeyMD5',
|
|
|
- 'RequestPayer',
|
|
|
- ]
|
|
|
-
|
|
|
- ALLOWED_UPLOAD_ARGS = [
|
|
|
- 'ACL',
|
|
|
- 'CacheControl',
|
|
|
- 'ContentDisposition',
|
|
|
- 'ContentEncoding',
|
|
|
- 'ContentLanguage',
|
|
|
- 'ContentType',
|
|
|
- 'Expires',
|
|
|
- 'GrantFullControl',
|
|
|
- 'GrantRead',
|
|
|
- 'GrantReadACP',
|
|
|
- 'GrantWriteACP',
|
|
|
- 'Metadata',
|
|
|
- 'RequestPayer',
|
|
|
- 'ServerSideEncryption',
|
|
|
- 'StorageClass',
|
|
|
- 'SSECustomerAlgorithm',
|
|
|
- 'SSECustomerKey',
|
|
|
- 'SSECustomerKeyMD5',
|
|
|
- 'SSEKMSKeyId',
|
|
|
+
|
|
|
+
|
|
|
+class NullHandler(logging.Handler):
|
|
|
+ def emit(self, record):
|
|
|
+ pass
|
|
|
+
|
|
|
+
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+logger.addHandler(NullHandler())
|
|
|
+
|
|
|
+queue = six.moves.queue
|
|
|
+
|
|
|
+MB = 1024 * 1024
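+# Sentinel placed on the IO queue to tell the writer thread that all
+# downloaded chunks have been queued and it can shut down.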
|
|
|
+SHUTDOWN_SENTINEL = object()
|
|
|
+
|
|
|
+
|
|
|
+def random_file_extension(num_digits=8):
|
|
|
+ return ''.join(random.choice(string.hexdigits) for _ in range(num_digits))
|
|
|
+
|
|
|
+
|
|
|
+def disable_upload_callbacks(request, operation_name, **kwargs):
|
|
|
+ if operation_name in ['PutObject', 'UploadPart'] and \
|
|
|
+ hasattr(request.body, 'disable_callback'):
|
|
|
+ request.body.disable_callback()
|
|
|
+
|
|
|
+
|
|
|
+def enable_upload_callbacks(request, operation_name, **kwargs):
|
|
|
+ if operation_name in ['PutObject', 'UploadPart'] and \
|
|
|
+ hasattr(request.body, 'enable_callback'):
|
|
|
+ request.body.enable_callback()
|
|
|
+
|
|
|
+
|
|
|
+class QueueShutdownError(Exception):
|
|
|
+ pass
|
|
|
+
|
|
|
+
|
|
|
+class ReadFileChunk(object):
|
|
|
+ def __init__(self, fileobj, start_byte, chunk_size, full_file_size,
|
|
|
+ callback=None, enable_callback=True):
|
|
|
+ """
|
|
|
+
|
|
|
+ Given a file object shown below:
|
|
|
+
|
|
|
+ |___________________________________________________|
|
|
|
+ 0 | | full_file_size
|
|
|
+ |----chunk_size---|
|
|
|
+ start_byte
|
|
|
+
|
|
|
+ :type fileobj: file
|
|
|
+ :param fileobj: File like object
|
|
|
+
|
|
|
+ :type start_byte: int
|
|
|
+ :param start_byte: The first byte from which to start reading.
|
|
|
+
|
|
|
+ :type chunk_size: int
|
|
|
+ :param chunk_size: The max chunk size to read. Trying to read
|
|
|
+ past the end of the chunk will behave as if you've
|
|
|
+ reached the end of the file.
|
|
|
+
|
|
|
+ :type full_file_size: int
|
|
|
+ :param full_file_size: The entire content length associated
|
|
|
+ with ``fileobj``.
|
|
|
+
|
|
|
+ :type callback: function(amount_read)
|
|
|
+ :param callback: Called whenever data is read from this object.
|
|
|
+
|
|
|
+ """
|
|
|
+ self._fileobj = fileobj
|
|
|
+ self._start_byte = start_byte
|
|
|
+ self._size = self._calculate_file_size(
|
|
|
+ self._fileobj, requested_size=chunk_size,
|
|
|
+ start_byte=start_byte, actual_file_size=full_file_size)
|
|
|
+ self._fileobj.seek(self._start_byte)
|
|
|
+ self._amount_read = 0
|
|
|
+ self._callback = callback
|
|
|
+ self._callback_enabled = enable_callback
|
|
|
+
|
|
|
+ @classmethod
|
|
|
+ def from_filename(cls, filename, start_byte, chunk_size, callback=None,
|
|
|
+ enable_callback=True):
|
|
|
+ """Convenience factory function to create from a filename.
|
|
|
+
|
|
|
+ :type start_byte: int
|
|
|
+ :param start_byte: The first byte from which to start reading.
|
|
|
+
|
|
|
+ :type chunk_size: int
|
|
|
+ :param chunk_size: The max chunk size to read. Trying to read
|
|
|
+ past the end of the chunk will behave as if you've
|
|
|
+ reached the end of the file.
|
|
|
+
|
|
|
+ :type full_file_size: int
|
|
|
+ :param full_file_size: The entire content length associated
|
|
|
+ with ``fileobj``.
|
|
|
+
|
|
|
+ :type callback: function(amount_read)
|
|
|
+ :param callback: Called whenever data is read from this object.
|
|
|
+
|
|
|
+ :type enable_callback: bool
|
|
|
+ :param enable_callback: Indicate whether to invoke callback
|
|
|
+ during read() calls.
|
|
|
+
|
|
|
+ :rtype: ``ReadFileChunk``
|
|
|
+ :return: A new instance of ``ReadFileChunk``
|
|
|
+
|
|
|
+ """
|
|
|
+ f = open(filename, 'rb')
|
|
|
+ file_size = os.fstat(f.fileno()).st_size
|
|
|
+ return cls(f, start_byte, chunk_size, file_size, callback,
|
|
|
+ enable_callback)
|
|
|
+
|
|
|
+ def _calculate_file_size(self, fileobj, requested_size, start_byte,
|
|
|
+ actual_file_size):
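+ # Clamp the requested chunk size so the chunk never extends past
+ # the end of the underlying file.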
|
|
|
+ max_chunk_size = actual_file_size - start_byte
|
|
|
+ return min(max_chunk_size, requested_size)
|
|
|
+
|
|
|
+ def read(self, amount=None):
|
|
|
+ if amount is None:
|
|
|
+ amount_to_read = self._size - self._amount_read
|
|
|
+ else:
|
|
|
+ amount_to_read = min(self._size - self._amount_read, amount)
|
|
|
+ data = self._fileobj.read(amount_to_read)
|
|
|
+ self._amount_read += len(data)
|
|
|
+ if self._callback is not None and self._callback_enabled:
|
|
|
+ self._callback(len(data))
|
|
|
+ return data
|
|
|
+
|
|
|
+ def enable_callback(self):
|
|
|
+ self._callback_enabled = True
|
|
|
+
|
|
|
+ def disable_callback(self):
|
|
|
+ self._callback_enabled = False
|
|
|
+
|
|
|
+ def seek(self, where):
|
|
|
+ self._fileobj.seek(self._start_byte + where)
|
|
|
+ if self._callback is not None and self._callback_enabled:
|
|
|
+ # To also rewind the callback() for an accurate progress report
|
|
|
+ self._callback(where - self._amount_read)
|
|
|
+ self._amount_read = where
|
|
|
+
|
|
|
+ def close(self):
|
|
|
+ self._fileobj.close()
|
|
|
+
|
|
|
+ def tell(self):
|
|
|
+ return self._amount_read
|
|
|
+
|
|
|
+ def __len__(self):
|
|
|
+ # __len__ is defined because requests will try to determine the length
|
|
|
+ # of the stream to set a content length. In the normal case
|
|
|
+ # of the file it will just stat the file, but we need to change that
|
|
|
+ # behavior. By providing a __len__, requests will use that instead
|
|
|
+ # of stat'ing the file.
|
|
|
+ return self._size
|
|
|
+
|
|
|
+ def __enter__(self):
|
|
|
+ return self
|
|
|
+
|
|
|
+ def __exit__(self, *args, **kwargs):
|
|
|
+ self.close()
|
|
|
+
|
|
|
+ def __iter__(self):
|
|
|
+ # This is a workaround for http://bugs.python.org/issue17575
|
|
|
+ # Basically httplib will try to iterate over the contents, even
|
|
|
+ # if it's a file-like object. This wasn't noticed because we've
|
|
|
+ # already exhausted the stream so iterating over the file immediately
|
|
|
+ # stops, which is what we're simulating here.
|
|
|
+ return iter([])
|
|
|
+
|
|
|
+
|
|
|
+class StreamReaderProgress(object):
|
|
|
+ """Wrapper for a read only stream that adds progress callbacks."""
|
|
|
+ def __init__(self, stream, callback=None):
|
|
|
+ self._stream = stream
|
|
|
+ self._callback = callback
|
|
|
+
|
|
|
+ def read(self, *args, **kwargs):
|
|
|
+ value = self._stream.read(*args, **kwargs)
|
|
|
+ if self._callback is not None:
|
|
|
+ self._callback(len(value))
|
|
|
+ return value
|
|
|
+
|
|
|
+
|
|
|
+class OSUtils(object):
|
|
|
+ def get_file_size(self, filename):
|
|
|
+ return os.path.getsize(filename)
|
|
|
+
|
|
|
+ def open_file_chunk_reader(self, filename, start_byte, size, callback):
|
|
|
+ return ReadFileChunk.from_filename(filename, start_byte,
|
|
|
+ size, callback,
|
|
|
+ enable_callback=False)
|
|
|
+
|
|
|
+ def open(self, filename, mode):
|
|
|
+ return open(filename, mode)
|
|
|
+
|
|
|
+ def remove_file(self, filename):
|
|
|
+ """Remove a file, noop if file does not exist."""
|
|
|
+ # Unlike os.remove, if the file does not exist,
|
|
|
+ # then this method does nothing.
|
|
|
+ try:
|
|
|
+ os.remove(filename)
|
|
|
+ except OSError:
|
|
|
+ pass
|
|
|
+
|
|
|
+ def rename_file(self, current_filename, new_filename):
|
|
|
+ s3transfer.compat.rename_file(current_filename, new_filename)
|
|
|
+
|
|
|
+
|
|
|
+class MultipartUploader(object):
|
|
|
+ # These are the extra_args that need to be forwarded onto
|
|
|
+ # subsequent upload_parts.
|
|
|
+ UPLOAD_PART_ARGS = [
|
|
|
+ 'SSECustomerKey',
|
|
|
+ 'SSECustomerAlgorithm',
|
|
|
+ 'SSECustomerKeyMD5',
|
|
|
+ 'RequestPayer',
|
|
|
+ ]
|
|
|
+
|
|
|
+ def __init__(self, client, config, osutil,
|
|
|
+ executor_cls=concurrent.futures.ThreadPoolExecutor):
|
|
|
+ self._client = client
|
|
|
+ self._config = config
|
|
|
+ self._os = osutil
|
|
|
+ self._executor_cls = executor_cls
|
|
|
+
|
|
|
+ def _extra_upload_part_args(self, extra_args):
|
|
|
+ # Only the args in UPLOAD_PART_ARGS actually need to be passed
|
|
|
+ # onto the upload_part calls.
|
|
|
+ upload_parts_args = {}
|
|
|
+ for key, value in extra_args.items():
|
|
|
+ if key in self.UPLOAD_PART_ARGS:
|
|
|
+ upload_parts_args[key] = value
|
|
|
+ return upload_parts_args
|
|
|
+
|
|
|
+ def upload_file(self, filename, bucket, key, callback, extra_args):
|
|
|
+ response = self._client.create_multipart_upload(Bucket=bucket,
|
|
|
+ Key=key, **extra_args)
|
|
|
+ upload_id = response['UploadId']
|
|
|
+ try:
|
|
|
+ parts = self._upload_parts(upload_id, filename, bucket, key,
|
|
|
+ callback, extra_args)
|
|
|
+ except Exception as e:
|
|
|
+ logger.debug("Exception raised while uploading parts, "
|
|
|
+ "aborting multipart upload.", exc_info=True)
|
|
|
+ self._client.abort_multipart_upload(
|
|
|
+ Bucket=bucket, Key=key, UploadId=upload_id)
|
|
|
+ raise S3UploadFailedError(
|
|
|
+ "Failed to upload %s to %s: %s" % (
|
|
|
+ filename, '/'.join([bucket, key]), e))
|
|
|
+ self._client.complete_multipart_upload(
|
|
|
+ Bucket=bucket, Key=key, UploadId=upload_id,
|
|
|
+ MultipartUpload={'Parts': parts})
|
|
|
+
|
|
|
+ def _upload_parts(self, upload_id, filename, bucket, key, callback,
|
|
|
+ extra_args):
|
|
|
+ upload_parts_extra_args = self._extra_upload_part_args(extra_args)
|
|
|
+ parts = []
|
|
|
+ part_size = self._config.multipart_chunksize
|
|
|
+ num_parts = int(
|
|
|
+ math.ceil(self._os.get_file_size(filename) / float(part_size)))
|
|
|
+ max_workers = self._config.max_concurrency
|
|
|
+ with self._executor_cls(max_workers=max_workers) as executor:
|
|
|
+ upload_partial = functools.partial(
|
|
|
+ self._upload_one_part, filename, bucket, key, upload_id,
|
|
|
+ part_size, upload_parts_extra_args, callback)
|
|
|
+ for part in executor.map(upload_partial, range(1, num_parts + 1)):
|
|
|
+ parts.append(part)
|
|
|
+ return parts
|
|
|
+
|
|
|
+ def _upload_one_part(self, filename, bucket, key,
|
|
|
+ upload_id, part_size, extra_args,
|
|
|
+ callback, part_number):
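+ # Part numbers are 1-based, so this part starts at byte
+ # part_size * (part_number - 1) of the file.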
|
|
|
+ open_chunk_reader = self._os.open_file_chunk_reader
|
|
|
+ with open_chunk_reader(filename, part_size * (part_number - 1),
|
|
|
+ part_size, callback) as body:
|
|
|
+ response = self._client.upload_part(
|
|
|
+ Bucket=bucket, Key=key,
|
|
|
+ UploadId=upload_id, PartNumber=part_number, Body=body,
|
|
|
+ **extra_args)
|
|
|
+ etag = response['ETag']
|
|
|
+ return {'ETag': etag, 'PartNumber': part_number}
|
|
|
+
|
|
|
+
|
|
|
+class ShutdownQueue(queue.Queue):
|
|
|
+ """A queue implementation that can be shutdown.
|
|
|
+
|
|
|
+ Shutting down a queue means that this class adds a
|
|
|
+ trigger_shutdown method that will trigger all subsequent
|
|
|
+ calls to put() to fail with a ``QueueShutdownError``.
|
|
|
+
|
|
|
+ It purposefully deviates from queue.Queue, and is *not* meant
|
|
|
+ to be a drop-in replacement for ``queue.Queue``.
|
|
|
+
|
|
|
+ """
|
|
|
+ def _init(self, maxsize):
|
|
|
+ self._shutdown = False
|
|
|
+ self._shutdown_lock = threading.Lock()
|
|
|
+ # queue.Queue is an old style class so we don't use super().
|
|
|
+ return queue.Queue._init(self, maxsize)
|
|
|
+
|
|
|
+ def trigger_shutdown(self):
|
|
|
+ with self._shutdown_lock:
|
|
|
+ self._shutdown = True
|
|
|
+ logger.debug("The IO queue is now shutdown.")
|
|
|
+
|
|
|
+ def put(self, item):
|
|
|
+ # Note: this is not sufficient, it's still possible to deadlock!
|
|
|
+ # Need to hook into the condition vars used by this class.
|
|
|
+ with self._shutdown_lock:
|
|
|
+ if self._shutdown:
|
|
|
+ raise QueueShutdownError("Cannot put item to queue when "
|
|
|
+ "queue has been shutdown.")
|
|
|
+ return queue.Queue.put(self, item)
|
|
|
+
|
|
|
+
|
|
|
+class MultipartDownloader(object):
|
|
|
+ def __init__(self, client, config, osutil,
|
|
|
+ executor_cls=concurrent.futures.ThreadPoolExecutor):
|
|
|
+ self._client = client
|
|
|
+ self._config = config
|
|
|
+ self._os = osutil
|
|
|
+ self._executor_cls = executor_cls
|
|
|
+ self._ioqueue = ShutdownQueue(self._config.max_io_queue)
|
|
|
+
|
|
|
+ def download_file(self, bucket, key, filename, object_size,
|
|
|
+ extra_args, callback=None):
|
|
|
+ with self._executor_cls(max_workers=2) as controller:
|
|
|
+ # 1 thread for the future that manages the downloading of parts
|
|
|
+ # 1 thread for the future that manages IO writes.
|
|
|
+ download_parts_handler = functools.partial(
|
|
|
+ self._download_file_as_future,
|
|
|
+ bucket, key, filename, object_size, callback)
|
|
|
+ parts_future = controller.submit(download_parts_handler)
|
|
|
+
|
|
|
+ io_writes_handler = functools.partial(
|
|
|
+ self._perform_io_writes, filename)
|
|
|
+ io_future = controller.submit(io_writes_handler)
|
|
|
+ results = concurrent.futures.wait(
|
|
|
+ [parts_future, io_future],
|
|
|
+ return_when=concurrent.futures.FIRST_EXCEPTION)
|
|
|
+ self._process_future_results(results)
|
|
|
+
|
|
|
+ def _process_future_results(self, futures):
|
|
|
+ finished, unfinished = futures
|
|
|
+ for future in finished:
|
|
|
+ future.result()
|
|
|
+
|
|
|
+ def _download_file_as_future(self, bucket, key, filename, object_size,
|
|
|
+ callback):
|
|
|
+ part_size = self._config.multipart_chunksize
|
|
|
+ num_parts = int(math.ceil(object_size / float(part_size)))
|
|
|
+ max_workers = self._config.max_concurrency
|
|
|
+ download_partial = functools.partial(
|
|
|
+ self._download_range, bucket, key, filename,
|
|
|
+ part_size, num_parts, callback)
|
|
|
+ try:
|
|
|
+ with self._executor_cls(max_workers=max_workers) as executor:
|
|
|
+ list(executor.map(download_partial, range(num_parts)))
|
|
|
+ finally:
|
|
|
+ self._ioqueue.put(SHUTDOWN_SENTINEL)
|
|
|
+
|
|
|
+ def _calculate_range_param(self, part_size, part_index, num_parts):
|
|
|
+ start_range = part_index * part_size
|
|
|
+ if part_index == num_parts - 1:
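+ # The last part may be shorter than part_size; leaving the end
+ # empty produces an open-ended range ('bytes=<start>-') that
+ # reads through the final byte of the object.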
|
|
|
+ end_range = ''
|
|
|
+ else:
|
|
|
+ end_range = start_range + part_size - 1
|
|
|
+ range_param = 'bytes=%s-%s' % (start_range, end_range)
|
|
|
+ return range_param
|
|
|
+
|
|
|
+ def _download_range(self, bucket, key, filename,
|
|
|
+ part_size, num_parts, callback, part_index):
|
|
|
+ try:
|
|
|
+ range_param = self._calculate_range_param(
|
|
|
+ part_size, part_index, num_parts)
|
|
|
+
|
|
|
+ max_attempts = self._config.num_download_attempts
|
|
|
+ last_exception = None
|
|
|
+ for i in range(max_attempts):
|
|
|
+ try:
|
|
|
+ logger.debug("Making get_object call.")
|
|
|
+ response = self._client.get_object(
|
|
|
+ Bucket=bucket, Key=key, Range=range_param)
|
|
|
+ streaming_body = StreamReaderProgress(
|
|
|
+ response['Body'], callback)
|
|
|
+ buffer_size = 1024 * 16
|
|
|
+ current_index = part_size * part_index
|
|
|
+ for chunk in iter(lambda: streaming_body.read(buffer_size),
|
|
|
+ b''):
|
|
|
+ self._ioqueue.put((current_index, chunk))
|
|
|
+ current_index += len(chunk)
|
|
|
+ return
|
|
|
+ except (socket.timeout, socket.error,
|
|
|
+ ReadTimeoutError, IncompleteReadError) as e:
|
|
|
+ logger.debug("Retrying exception caught (%s), "
|
|
|
+ "retrying request, (attempt %s / %s)", e, i,
|
|
|
+ max_attempts, exc_info=True)
|
|
|
+ last_exception = e
|
|
|
+ continue
|
|
|
+ raise RetriesExceededError(last_exception)
|
|
|
+ finally:
|
|
|
+ logger.debug("EXITING _download_range for part: %s", part_index)
|
|
|
+
|
|
|
+ def _perform_io_writes(self, filename):
|
|
|
+ with self._os.open(filename, 'wb') as f:
|
|
|
+ while True:
|
|
|
+ task = self._ioqueue.get()
|
|
|
+ if task is SHUTDOWN_SENTINEL:
|
|
|
+ logger.debug("Shutdown sentinel received in IO handler, "
|
|
|
+ "shutting down IO handler.")
|
|
|
+ return
|
|
|
+ else:
|
|
|
+ try:
|
|
|
+ offset, data = task
|
|
|
+ f.seek(offset)
|
|
|
+ f.write(data)
|
|
|
+ except Exception as e:
|
|
|
+ logger.debug("Caught exception in IO thread: %s",
|
|
|
+ e, exc_info=True)
|
|
|
+ self._ioqueue.trigger_shutdown()
|
|
|
+ raise
|
|
|
+
|
|
|
+
|
|
|
+class TransferConfig(object):
|
|
|
+ def __init__(self,
|
|
|
+ multipart_threshold=8 * MB,
|
|
|
+ max_concurrency=10,
|
|
|
+ multipart_chunksize=8 * MB,
|
|
|
+ num_download_attempts=5,
|
|
|
+ max_io_queue=100):
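+ # Thresholds and chunk sizes are in bytes; max_io_queue bounds how
+ # many downloaded chunks may be buffered in memory while waiting
+ # to be written to disk.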
|
|
|
+ self.multipart_threshold = multipart_threshold
|
|
|
+ self.max_concurrency = max_concurrency
|
|
|
+ self.multipart_chunksize = multipart_chunksize
|
|
|
+ self.num_download_attempts = num_download_attempts
|
|
|
+ self.max_io_queue = max_io_queue
|
|
|
+
|
|
|
+
|
|
|
+class S3Transfer(object):
|
|
|
+
|
|
|
+ ALLOWED_DOWNLOAD_ARGS = [
|
|
|
+ 'VersionId',
|
|
|
+ 'SSECustomerAlgorithm',
|
|
|
+ 'SSECustomerKey',
|
|
|
+ 'SSECustomerKeyMD5',
|
|
|
+ 'RequestPayer',
|
|
|
+ ]
|
|
|
+
|
|
|
+ ALLOWED_UPLOAD_ARGS = [
|
|
|
+ 'ACL',
|
|
|
+ 'CacheControl',
|
|
|
+ 'ContentDisposition',
|
|
|
+ 'ContentEncoding',
|
|
|
+ 'ContentLanguage',
|
|
|
+ 'ContentType',
|
|
|
+ 'Expires',
|
|
|
+ 'GrantFullControl',
|
|
|
+ 'GrantRead',
|
|
|
+ 'GrantReadACP',
|
|
|
+ 'GrantWriteACP',
|
|
|
+ 'Metadata',
|
|
|
+ 'RequestPayer',
|
|
|
+ 'ServerSideEncryption',
|
|
|
+ 'StorageClass',
|
|
|
+ 'SSECustomerAlgorithm',
|
|
|
+ 'SSECustomerKey',
|
|
|
+ 'SSECustomerKeyMD5',
|
|
|
+ 'SSEKMSKeyId',
|
|
|
'SSEKMSEncryptionContext',
|
|
|
'Tagging',
|
|
|
- ]
|
|
|
-
|
|
|
- def __init__(self, client, config=None, osutil=None):
|
|
|
- self._client = client
|
|
|
- if config is None:
|
|
|
- config = TransferConfig()
|
|
|
- self._config = config
|
|
|
- if osutil is None:
|
|
|
- osutil = OSUtils()
|
|
|
- self._osutil = osutil
|
|
|
-
|
|
|
- def upload_file(self, filename, bucket, key,
|
|
|
- callback=None, extra_args=None):
|
|
|
- """Upload a file to an S3 object.
|
|
|
-
|
|
|
- Variants have also been injected into S3 client, Bucket and Object.
|
|
|
- You don't have to use S3Transfer.upload_file() directly.
|
|
|
- """
|
|
|
- if extra_args is None:
|
|
|
- extra_args = {}
|
|
|
- self._validate_all_known_args(extra_args, self.ALLOWED_UPLOAD_ARGS)
|
|
|
- events = self._client.meta.events
|
|
|
- events.register_first('request-created.s3',
|
|
|
- disable_upload_callbacks,
|
|
|
- unique_id='s3upload-callback-disable')
|
|
|
- events.register_last('request-created.s3',
|
|
|
- enable_upload_callbacks,
|
|
|
- unique_id='s3upload-callback-enable')
|
|
|
- if self._osutil.get_file_size(filename) >= \
|
|
|
- self._config.multipart_threshold:
|
|
|
- self._multipart_upload(filename, bucket, key, callback, extra_args)
|
|
|
- else:
|
|
|
- self._put_object(filename, bucket, key, callback, extra_args)
|
|
|
-
|
|
|
- def _put_object(self, filename, bucket, key, callback, extra_args):
|
|
|
- # We're using open_file_chunk_reader so we can take advantage of the
|
|
|
- # progress callback functionality.
|
|
|
- open_chunk_reader = self._osutil.open_file_chunk_reader
|
|
|
- with open_chunk_reader(filename, 0,
|
|
|
- self._osutil.get_file_size(filename),
|
|
|
- callback=callback) as body:
|
|
|
- self._client.put_object(Bucket=bucket, Key=key, Body=body,
|
|
|
- **extra_args)
|
|
|
-
|
|
|
- def download_file(self, bucket, key, filename, extra_args=None,
|
|
|
- callback=None):
|
|
|
- """Download an S3 object to a file.
|
|
|
-
|
|
|
- Variants have also been injected into S3 client, Bucket and Object.
|
|
|
- You don't have to use S3Transfer.download_file() directly.
|
|
|
- """
|
|
|
- # This method will issue a ``head_object`` request to determine
|
|
|
- # the size of the S3 object. This is used to determine if the
|
|
|
- # object is downloaded in parallel.
|
|
|
- if extra_args is None:
|
|
|
- extra_args = {}
|
|
|
- self._validate_all_known_args(extra_args, self.ALLOWED_DOWNLOAD_ARGS)
|
|
|
- object_size = self._object_size(bucket, key, extra_args)
|
|
|
- temp_filename = filename + os.extsep + random_file_extension()
|
|
|
- try:
|
|
|
- self._download_file(bucket, key, temp_filename, object_size,
|
|
|
- extra_args, callback)
|
|
|
- except Exception:
|
|
|
- logger.debug("Exception caught in download_file, removing partial "
|
|
|
- "file: %s", temp_filename, exc_info=True)
|
|
|
- self._osutil.remove_file(temp_filename)
|
|
|
- raise
|
|
|
- else:
|
|
|
- self._osutil.rename_file(temp_filename, filename)
|
|
|
-
|
|
|
- def _download_file(self, bucket, key, filename, object_size,
|
|
|
- extra_args, callback):
|
|
|
- if object_size >= self._config.multipart_threshold:
|
|
|
- self._ranged_download(bucket, key, filename, object_size,
|
|
|
- extra_args, callback)
|
|
|
- else:
|
|
|
- self._get_object(bucket, key, filename, extra_args, callback)
|
|
|
-
|
|
|
- def _validate_all_known_args(self, actual, allowed):
|
|
|
- for kwarg in actual:
|
|
|
- if kwarg not in allowed:
|
|
|
- raise ValueError(
|
|
|
- "Invalid extra_args key '%s', "
|
|
|
- "must be one of: %s" % (
|
|
|
- kwarg, ', '.join(allowed)))
|
|
|
-
|
|
|
- def _ranged_download(self, bucket, key, filename, object_size,
|
|
|
- extra_args, callback):
|
|
|
- downloader = MultipartDownloader(self._client, self._config,
|
|
|
- self._osutil)
|
|
|
- downloader.download_file(bucket, key, filename, object_size,
|
|
|
- extra_args, callback)
|
|
|
-
|
|
|
- def _get_object(self, bucket, key, filename, extra_args, callback):
|
|
|
- # precondition: num_download_attempts > 0
|
|
|
- max_attempts = self._config.num_download_attempts
|
|
|
- last_exception = None
|
|
|
- for i in range(max_attempts):
|
|
|
- try:
|
|
|
- return self._do_get_object(bucket, key, filename,
|
|
|
- extra_args, callback)
|
|
|
- except (socket.timeout, socket.error,
|
|
|
- ReadTimeoutError, IncompleteReadError) as e:
|
|
|
- # TODO: we need a way to reset the callback if the
|
|
|
- # download failed.
|
|
|
- logger.debug("Retrying exception caught (%s), "
|
|
|
- "retrying request, (attempt %s / %s)", e, i,
|
|
|
- max_attempts, exc_info=True)
|
|
|
- last_exception = e
|
|
|
- continue
|
|
|
- raise RetriesExceededError(last_exception)
|
|
|
-
|
|
|
- def _do_get_object(self, bucket, key, filename, extra_args, callback):
|
|
|
- response = self._client.get_object(Bucket=bucket, Key=key,
|
|
|
- **extra_args)
|
|
|
- streaming_body = StreamReaderProgress(
|
|
|
- response['Body'], callback)
|
|
|
- with self._osutil.open(filename, 'wb') as f:
|
|
|
- for chunk in iter(lambda: streaming_body.read(8192), b''):
|
|
|
- f.write(chunk)
|
|
|
-
|
|
|
- def _object_size(self, bucket, key, extra_args):
|
|
|
- return self._client.head_object(
|
|
|
- Bucket=bucket, Key=key, **extra_args)['ContentLength']
|
|
|
-
|
|
|
- def _multipart_upload(self, filename, bucket, key, callback, extra_args):
|
|
|
- uploader = MultipartUploader(self._client, self._config, self._osutil)
|
|
|
- uploader.upload_file(filename, bucket, key, callback, extra_args)
|
|
|
+ ]
|
|
|
+
|
|
|
+ def __init__(self, client, config=None, osutil=None):
|
|
|
+ self._client = client
|
|
|
+ if config is None:
|
|
|
+ config = TransferConfig()
|
|
|
+ self._config = config
|
|
|
+ if osutil is None:
|
|
|
+ osutil = OSUtils()
|
|
|
+ self._osutil = osutil
|
|
|
+
|
|
|
+ def upload_file(self, filename, bucket, key,
|
|
|
+ callback=None, extra_args=None):
|
|
|
+ """Upload a file to an S3 object.
|
|
|
+
|
|
|
+ Variants have also been injected into S3 client, Bucket and Object.
|
|
|
+ You don't have to use S3Transfer.upload_file() directly.
|
|
|
+ """
|
|
|
+ if extra_args is None:
|
|
|
+ extra_args = {}
|
|
|
+ self._validate_all_known_args(extra_args, self.ALLOWED_UPLOAD_ARGS)
|
|
|
+ events = self._client.meta.events
|
|
|
+ events.register_first('request-created.s3',
|
|
|
+ disable_upload_callbacks,
|
|
|
+ unique_id='s3upload-callback-disable')
|
|
|
+ events.register_last('request-created.s3',
|
|
|
+ enable_upload_callbacks,
|
|
|
+ unique_id='s3upload-callback-enable')
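+ # The two handlers above wrap request creation so that, presumably,
+ # anything that reads the body while a request is being prepared
+ # does not fire progress callbacks prematurely.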
|
|
|
+ if self._osutil.get_file_size(filename) >= \
|
|
|
+ self._config.multipart_threshold:
|
|
|
+ self._multipart_upload(filename, bucket, key, callback, extra_args)
|
|
|
+ else:
|
|
|
+ self._put_object(filename, bucket, key, callback, extra_args)
|
|
|
+
|
|
|
+ def _put_object(self, filename, bucket, key, callback, extra_args):
|
|
|
+ # We're using open_file_chunk_reader so we can take advantage of the
|
|
|
+ # progress callback functionality.
|
|
|
+ open_chunk_reader = self._osutil.open_file_chunk_reader
|
|
|
+ with open_chunk_reader(filename, 0,
|
|
|
+ self._osutil.get_file_size(filename),
|
|
|
+ callback=callback) as body:
|
|
|
+ self._client.put_object(Bucket=bucket, Key=key, Body=body,
|
|
|
+ **extra_args)
|
|
|
+
|
|
|
+ def download_file(self, bucket, key, filename, extra_args=None,
|
|
|
+ callback=None):
|
|
|
+ """Download an S3 object to a file.
|
|
|
+
|
|
|
+ Variants have also been injected into S3 client, Bucket and Object.
|
|
|
+ You don't have to use S3Transfer.download_file() directly.
|
|
|
+ """
|
|
|
+ # This method will issue a ``head_object`` request to determine
|
|
|
+ # the size of the S3 object. This is used to determine if the
|
|
|
+ # object is downloaded in parallel.
|
|
|
+ if extra_args is None:
|
|
|
+ extra_args = {}
|
|
|
+ self._validate_all_known_args(extra_args, self.ALLOWED_DOWNLOAD_ARGS)
|
|
|
+ object_size = self._object_size(bucket, key, extra_args)
|
|
|
+ temp_filename = filename + os.extsep + random_file_extension()
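+ # Download to a temporary file first; it is renamed over the target
+ # only on success and removed if the download fails.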
|
|
|
+ try:
|
|
|
+ self._download_file(bucket, key, temp_filename, object_size,
|
|
|
+ extra_args, callback)
|
|
|
+ except Exception:
|
|
|
+ logger.debug("Exception caught in download_file, removing partial "
|
|
|
+ "file: %s", temp_filename, exc_info=True)
|
|
|
+ self._osutil.remove_file(temp_filename)
|
|
|
+ raise
|
|
|
+ else:
|
|
|
+ self._osutil.rename_file(temp_filename, filename)
|
|
|
+
|
|
|
+ def _download_file(self, bucket, key, filename, object_size,
|
|
|
+ extra_args, callback):
|
|
|
+ if object_size >= self._config.multipart_threshold:
|
|
|
+ self._ranged_download(bucket, key, filename, object_size,
|
|
|
+ extra_args, callback)
|
|
|
+ else:
|
|
|
+ self._get_object(bucket, key, filename, extra_args, callback)
|
|
|
+
|
|
|
+ def _validate_all_known_args(self, actual, allowed):
|
|
|
+ for kwarg in actual:
|
|
|
+ if kwarg not in allowed:
|
|
|
+ raise ValueError(
|
|
|
+ "Invalid extra_args key '%s', "
|
|
|
+ "must be one of: %s" % (
|
|
|
+ kwarg, ', '.join(allowed)))
|
|
|
+
|
|
|
+ def _ranged_download(self, bucket, key, filename, object_size,
|
|
|
+ extra_args, callback):
|
|
|
+ downloader = MultipartDownloader(self._client, self._config,
|
|
|
+ self._osutil)
|
|
|
+ downloader.download_file(bucket, key, filename, object_size,
|
|
|
+ extra_args, callback)
|
|
|
+
|
|
|
+ def _get_object(self, bucket, key, filename, extra_args, callback):
|
|
|
+ # precondition: num_download_attempts > 0
|
|
|
+ max_attempts = self._config.num_download_attempts
|
|
|
+ last_exception = None
|
|
|
+ for i in range(max_attempts):
|
|
|
+ try:
|
|
|
+ return self._do_get_object(bucket, key, filename,
|
|
|
+ extra_args, callback)
|
|
|
+ except (socket.timeout, socket.error,
|
|
|
+ ReadTimeoutError, IncompleteReadError) as e:
|
|
|
+ # TODO: we need a way to reset the callback if the
|
|
|
+ # download failed.
|
|
|
+ logger.debug("Retrying exception caught (%s), "
|
|
|
+ "retrying request, (attempt %s / %s)", e, i,
|
|
|
+ max_attempts, exc_info=True)
|
|
|
+ last_exception = e
|
|
|
+ continue
|
|
|
+ raise RetriesExceededError(last_exception)
|
|
|
+
|
|
|
+ def _do_get_object(self, bucket, key, filename, extra_args, callback):
|
|
|
+ response = self._client.get_object(Bucket=bucket, Key=key,
|
|
|
+ **extra_args)
|
|
|
+ streaming_body = StreamReaderProgress(
|
|
|
+ response['Body'], callback)
|
|
|
+ with self._osutil.open(filename, 'wb') as f:
|
|
|
+ for chunk in iter(lambda: streaming_body.read(8192), b''):
|
|
|
+ f.write(chunk)
|
|
|
+
|
|
|
+ def _object_size(self, bucket, key, extra_args):
|
|
|
+ return self._client.head_object(
|
|
|
+ Bucket=bucket, Key=key, **extra_args)['ContentLength']
|
|
|
+
|
|
|
+ def _multipart_upload(self, filename, bucket, key, callback, extra_args):
|
|
|
+ uploader = MultipartUploader(self._client, self._config, self._osutil)
|
|
|
+ uploader.upload_file(filename, bucket, key, callback, extra_args)
|