Skip to content

Commit

Permalink
Fix checksum migration and checksum function (wooey#237)
Browse files Browse the repository at this point in the history
* Fix checksum migration to resolve path

* Add a handler for bytes in checksum and send in raw bytes from migration

* Stream checksum path instead of getting storage object

* Add explicit path vs buffer toggle to checksum function

* Fix get_storage_object to optionally return an open handle
  • Loading branch information
Chris7 authored Apr 15, 2018
1 parent 5fbf580 commit aea3014
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 36 deletions.
69 changes: 42 additions & 27 deletions wooey/backend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import six
import traceback
from collections import OrderedDict, defaultdict
from contextlib import contextmanager
# Python2.7 encoding= support
from io import open
from itertools import chain
Expand Down Expand Up @@ -214,12 +215,15 @@ def get_current_scripts():
return scripts


@contextmanager
def get_storage_object(path, local=False, close=True):
    """Open ``path`` from storage and yield the resulting file object.

    The yielded object is annotated with two extra attributes, ``url`` and
    ``path``, taken from ``storage.url(path)`` and ``storage.path(path)``.

    Args:
        path: Name of the file, passed to the storage backend.
        local: Forwarded to ``get_storage`` to select which storage is used.
        close: When true (the default) the file object is closed as soon as
            the ``with`` block exits, whether normally or via an exception.
            Pass ``close=False`` to keep the handle open after the block; the
            caller is then responsible for closing it.

    Yields:
        The object returned by ``storage.open(path)``.
    """
    storage = get_storage(local=local)
    obj = storage.open(path)
    try:
        obj.url = storage.url(path)
        obj.path = storage.path(path)
    except Exception:
        # The caller never receives the handle if annotation fails, so it
        # must be released here regardless of ``close``.
        obj.close()
        raise
    try:
        yield obj
    finally:
        # try/finally guarantees the handle is released even when the body of
        # the caller's ``with`` statement raises.
        if close:
            obj.close()


def add_wooey_script(script_version=None, script_path=None, group=None, script_name=None):
Expand All @@ -234,7 +238,8 @@ def add_wooey_script(script_version=None, script_path=None, group=None, script_n
# check if the script exists
script_path = script_path or script_version.script_path.name
script_name = script_name or (script_version.script.script_name if script_version else os.path.basename(os.path.splitext(script_path)[0]))
checksum = get_checksum(get_storage_object(script_path).path)
with get_storage_object(script_path) as so:
checksum = get_checksum(buff=so.read())
existing_version = None
try:
existing_version = ScriptVersion.objects.get(checksum=checksum, script__script_name=script_name)
Expand Down Expand Up @@ -289,7 +294,8 @@ def add_wooey_script(script_version=None, script_path=None, group=None, script_n
if not current_file.closed:
current_file.close()

script = get_storage_object(new_path, local=True).path
with get_storage_object(new_path, local=True) as so:
script = so.path
with local_storage.open(new_path) as local_handle:
local_file = local_handle.name
else:
Expand All @@ -302,7 +308,8 @@ def add_wooey_script(script_version=None, script_path=None, group=None, script_n
else:
with local_storage.open(script_path) as local_handle:
local_file = local_handle.name
script = get_storage_object(local_file, local=True).path
with get_storage_object(local_file, local=True) as so:
script = so.path
if isinstance(group, ScriptGroup):
group = group.group_name
if group is None:
Expand Down Expand Up @@ -513,7 +520,7 @@ def mkdirs(path):
def get_upload_path(filepath, checksum=None):
filename = os.path.split(filepath)[1]
if checksum is None:
checksum = get_checksum(filepath)
checksum = get_checksum(path=filepath)
return os.path.join(wooey_settings.WOOEY_FILE_DIR, checksum[:2], checksum[-2:], checksum, filename)


Expand Down Expand Up @@ -637,7 +644,7 @@ def create_job_fileinfo(job):
d = {'parameter': field, 'file': value}
if field.parameter.is_output:
full_path = os.path.join(job.save_path, os.path.split(local_storage.path(value))[1])
checksum = get_checksum(value, extra=[job.pk, full_path, 'output'])
checksum = get_checksum(path=value, extra=[job.pk, full_path, 'output'])
d['checksum'] = checksum
files.append(d)
except ValueError:
Expand All @@ -660,13 +667,18 @@ def create_job_fileinfo(job):
full_path = os.path.join(job.save_path, filename)
# this is to make the job output have a unique checksum. If this file is then re-uploaded, it will create
# a new file to reference in the uploads directory and not link back to the job output.
checksum = get_checksum(filepath, extra=[job.pk, full_path, 'output'])
checksum = get_checksum(path=filepath, extra=[job.pk, full_path, 'output'])
try:
storage_file = get_storage_object(full_path)
with get_storage_object(full_path) as storage_file:
d = {
'name': filename,
'file': storage_file,
'size_bytes': storage_file.size,
'checksum': checksum
}
except:
sys.stderr.write('Error in accessing stored file {}:\n{}'.format(full_path, traceback.format_exc()))
continue
d = {'name': filename, 'file': storage_file, 'size_bytes': storage_file.size, 'checksum': checksum}
if filename.endswith('.tar.gz') or filename.endswith('.zip'):
file_groups['archives'].append(d)
else:
Expand Down Expand Up @@ -706,7 +718,7 @@ def create_job_fileinfo(job):
parameter = group_file.get('parameter')

# get the checksum of the file to see if we need to save it
checksum = group_file.get('checksum', get_checksum(filepath))
checksum = group_file.get('checksum', get_checksum(path=filepath))
try:
wooey_file = WooeyFile.objects.get(checksum=checksum)
file_created = False
Expand Down Expand Up @@ -738,7 +750,7 @@ def create_job_fileinfo(job):
continue


def get_checksum(path, extra=None):
def get_checksum(path=None, buff=None, extra=None):
import hashlib
BLOCKSIZE = 65536
hasher = hashlib.sha1()
Expand All @@ -748,20 +760,23 @@ def get_checksum(path, extra=None):
hasher.update(six.u(str(i)).encode('utf-8'))
elif isinstance(extra, six.string_types):
hasher.update(extra)
if isinstance(path, six.string_types):
with open(path, 'rb') as afile:
buf = afile.read(BLOCKSIZE)
while len(buf) > 0:
hasher.update(buf)
if buff is not None:
hasher.update(buff)
elif path is not None:
if isinstance(path, six.string_types):
with open(path, 'rb') as afile:
buf = afile.read(BLOCKSIZE)
else:
start = path.tell()
path.seek(0)
buf = path.read(BLOCKSIZE)
while len(buf) > 0:
hasher.update(buf)
while len(buf) > 0:
hasher.update(buf)
buf = afile.read(BLOCKSIZE)
else:
start = path.tell()
path.seek(0)
buf = path.read(BLOCKSIZE)
path.seek(start)
while len(buf) > 0:
hasher.update(buf)
buf = path.read(BLOCKSIZE)
path.seek(start)
return hasher.hexdigest()


Expand Down
16 changes: 14 additions & 2 deletions wooey/forms/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,21 @@ def get_field(param, initial=None):
initial = [os.path.split(i.name)[1] for i in initial]
elif initial is not None and list(filter(None, initial)): # for python3, we need to evaluate the filter object
if isinstance(initial, (list, tuple)):
initial = [utils.get_storage_object(value) if not hasattr(value, 'path') else value for value in initial if value is not None]
_initial = []
for value in initial:
if not hasattr(value, 'path'):
with utils.get_storage_object(value, close=False) as so:
_initial.append(so)
else:
_initial.append(value)
_initial.append()
initial = _initial
else:
initial = utils.get_storage_object(initial) if not hasattr(initial, 'path') else initial
if not hasattr(initial, 'path'):
with utils.get_storage_object(initial, close=False) as so:
initial = so
else:
initial = initial
field_kwargs['widget'] = forms.ClearableFileInput()
if not multiple_choices and isinstance(initial, list):
initial = initial[0]
Expand Down
2 changes: 1 addition & 1 deletion wooey/migrations/0017_wooeyfile_generate_checksums.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def gen_checksums(apps, schema_editor):
from ..backend.utils import get_checksum
for obj in WooeyFile.objects.all():
try:
obj.checksum = get_checksum(obj.filepath.path)
obj.checksum = get_checksum(path=obj.filepath.path)
obj.save()
except IOError:
print(obj.filepath, 'not found')
Expand Down
2 changes: 1 addition & 1 deletion wooey/migrations/0034_update-checksums.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def update_checksums(apps, schema_editor):
ScriptVersion = apps.get_model('wooey', 'ScriptVersion')
from wooey.backend import utils
for obj in ScriptVersion.objects.all():
checksum = utils.get_checksum(utils.get_storage_object(obj.script_path).path)
checksum = utils.get_checksum(buff=obj.script_path.read())
obj.checksum = checksum
obj.save()

Expand Down
8 changes: 4 additions & 4 deletions wooey/models/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,8 +473,8 @@ def value(self):
field = self.parameter.form_field
if field == self.FILE:
try:
file_obj = utils.get_storage_object(value)
value = file_obj
with utils.get_storage_object(value, close=False) as value:
pass
except IOError:
# this can occur when the storage object is not yet made for output
if self.parameter.is_output:
Expand Down Expand Up @@ -513,7 +513,7 @@ def value(self, value):
if value:
local_storage = utils.get_storage(local=True)
current_path = local_storage.path(value)
checksum = utils.get_checksum(value)
checksum = utils.get_checksum(path=value)
path = utils.get_upload_path(current_path, checksum=checksum)
if hasattr(value, 'size'):
filesize = value.size
Expand Down Expand Up @@ -542,7 +542,7 @@ def value(self, value):
# save ourselves first; we have to do this because we are referenced in WooeyFile
self.save()
if checksum is None:
checksum = utils.get_checksum(local_path)
checksum = utils.get_checksum(path=local_path)
wooey_file, file_created = WooeyFile.objects.get_or_create(checksum=checksum)
if file_created:
wooey_file.filetype = fileinfo.get('type')
Expand Down
2 changes: 1 addition & 1 deletion wooey/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def script_version_presave(instance, **kwargs):
if 'script_path' in instance.changed_fields and not skip_script(instance):
# If the script checksum is not changed, do not run the script addition code (but update the
# path)
checksum = utils.get_checksum(instance.script_path.path)
checksum = utils.get_checksum(path=instance.script_path.path)
if checksum != instance.checksum and not ScriptVersion.objects.filter(checksum=checksum, script_id=instance.script_id).exists():
instance.checksum = checksum
instance.script_iteration += 1
Expand Down

0 comments on commit aea3014

Please sign in to comment.