Save nateware/4735384 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# Sync check: compare local files against their copies on S3 and upload
# any that are new or changed, then invalidate CloudFront for those paths.
import fnmatch
import hashlib
import os
import pprint
import re

import boto
from boto.s3.key import Key

# Local checkout of the source tree to be mirrored.
SOURCE_DIR = '/local/path/to/source/code'
# Target S3 bucket name.
BUCKET_NAME = 'my-s3-bucket-name'

# Open the S3 connection and resolve the bucket up front; credentials come
# from the usual boto configuration/environment.
conn = boto.connect_s3()
bucket = conn.get_bucket(BUCKET_NAME)
# Shortcut to MD5 | |
def get_md5(filename):
    """Return the hex MD5 digest of *filename*.

    Reads the file in fixed-size binary chunks so arbitrarily large files
    can be hashed without loading them into memory.

    :param filename: path to the file to hash
    :return: 32-character lowercase hex digest string
    """
    digest = hashlib.md5()
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(filename, 'rb') as fh:
        # iter(callable, sentinel) stops cleanly at EOF (empty bytes).
        for chunk in iter(lambda: fh.read(10240), b''):
            digest.update(chunk)
    return digest.hexdigest()
def to_uri(filename):
    """Map a local path under SOURCE_DIR to its S3 key.

    Bug fix: the original body referenced the script-level loop variable
    ``f`` instead of its own ``filename`` parameter (it only worked by
    accident of the caller's naming), and passed the directory path to
    ``re.sub`` as a regex pattern, which breaks if the path contains
    regex metacharacters.  Strip the prefix with plain string ops instead.

    :param filename: absolute local path
    :return: the path with the SOURCE_DIR prefix removed
    """
    if filename.startswith(SOURCE_DIR):
        return filename[len(SOURCE_DIR):]
    return filename
# Recursively collect every file path beneath SOURCE_DIR.
files = [
    os.path.join(dirpath, name)
    for dirpath, _subdirs, names in os.walk(SOURCE_DIR)
    for name in names
]
# Work out which files need uploading: keys missing on S3, or keys whose
# stored checksum no longer matches the local file's MD5.
files_to_upload = []
for local_path in files:
    remote_key = bucket.get_key(to_uri(local_path))
    if remote_key is None:
        # Not on S3 yet -- schedule for upload.
        files_to_upload.append(local_path)
        continue
    # NOTE(review): for multipart uploads the ETag is not a plain MD5
    # (it contains a '-'), so such objects will always compare unequal
    # and be re-uploaded -- confirm whether that matters here.
    local_md5 = get_md5(local_path)
    remote_etag = remote_key.etag.strip('"').strip("'")
    if remote_etag != local_md5:
        print(local_path + ": " + local_md5 + " != " + remote_etag)
        files_to_upload.append(local_path)
# Push each new/changed file to S3 under its computed key.
for local_path in files_to_upload:
    s3_key = Key(bucket)
    s3_key.key = to_uri(local_path)
    s3_key.set_contents_from_filename(local_path)
    # CloudFront invalidation code goes here
Use the s3 file header, something like:
head=self.s3.head_object(Bucket=bucket, Key=path)
#HEAD: {u'AcceptRanges': 'bytes', u'ContentType': 'binary/octet-stream', 'ResponseMetadata': {'HTTPStatusCode': 200, 'HostId': 'j0KrohvVeb8gcs3TIjw2j4d/39JU8h3CS4sUe9IndL6RYHDy8Ruc1DUvMto+4/rxVP9f/wxMGHU=', 'RequestId': 'C2819F65ACE2FD7F'}, u'LastModified': datetime.datetime(2017, 3, 15, 12, 54, 5 , tzinfo=tzutc()), u'ContentLength': 441296, u'ETag': '"3c1952332ae963529293922eebde44dc"', u'Metadata': {}}
lastmodified=head['LastModified']
Thanks, it works. But why does the bash command `openssl md5 -binary PATH/TO/FILE | base64`
give a different result? In my case each file is smaller than 5 GB. The command is from the AWS docs ( https://aws.amazon.com/premiumsupport/knowledge-center/data-integrity-s3/ )
Hello,
I am using this Python script to upload files that have changed or were newly created from a local folder to an S3 folder. The script does not work: it fails at getting the bucket. I am using boto with Python 2.7. Any help much appreciated.
Many thanks.
Here is the error
Traceback (most recent call last): File "s3update.py", line 20, in <module> bucket = conn.get_bucket(BUCKET_NAME) File "/usr/lib/python2.7/site-packages/boto/s3/connection.py", line 506, in get_bucket return self.head_bucket(bucket_name, headers=headers) File "/usr/lib/python2.7/site-packages/boto/s3/connection.py", line 539, in head_bucket raise err boto.exception.S3ResponseError: S3ResponseError: 403 Forbidden
@viatcheslavmogilevsky if you're on macOS, Apple rolls their own brand of OpenSSL as of this post, and it may be slightly different from what would be used on an actual Linux VM/container — if you're using macOS, that could explain the different output.
Did you consider using an MD5Sum tag on S3 objects? That seems to me a cleaner solution if you have control over the S3 object upload.
I believe s3 etags do not always equal the file's md5 hash: http://stackoverflow.com/questions/12186993/what-is-the-algorithm-to-compute-the-amazon-s3-etag-for-a-file-larger-than-5gb