Skip to content

Commit

Permalink
Merge pull request EUDAT-B2SHARE#1543 from dinosk/checksums
Browse files Browse the repository at this point in the history
files: checksum verification
  • Loading branch information
nharraud authored Oct 6, 2017
2 parents 58c6e56 + cd19c20 commit 54f2eda
Show file tree
Hide file tree
Showing 9 changed files with 361 additions and 16 deletions.
28 changes: 28 additions & 0 deletions b2share/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,34 @@
'schedule': timedelta(minutes=15),
'args': [['file-download-agg']]
},
# Check file checksums
'file-checks': {
'task': 'invenio_files_rest.tasks.schedule_checksum_verification',
'schedule': timedelta(hours=1),
'kwargs': {
# Manually check and calculate checksums of files biannually
'frequency': {'days': 180},
'batch_interval': {'hours': 1},
# Split batches based on max number of files
'max_count': 0,
# Split batches based on total files size
'max_size': 0,
},
},
# Check file checksums which have previously failed the scan
'file-checks-failed': {
'task': 'b2share.modules.files.tasks.schedule_failed_checksum_files',
'schedule': timedelta(hours=1),
'kwargs': {
# Re-check files which previously failed the scan, on a weekly basis
'frequency': {'days': 7},
'batch_interval': {'hours': 1},
# Split batches based on max number of files
'max_count': 0,
# Split batches based on total files size
'max_size': 0,
},
},
}


Expand Down
135 changes: 135 additions & 0 deletions b2share/modules/files/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-
#
# This file is part of EUDAT B2Share.
# Copyright (C) 2017 CERN.
#
# B2Share is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# B2Share is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with B2Share; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Celery background tasks."""
import json
from celery import shared_task
from flask import current_app, url_for
from flask_babelex import lazy_gettext as _
from invenio_db import db
from invenio_records_files.models import RecordsBuckets
from invenio_files_rest.models import FileInstance, ObjectVersion
from invenio_files_rest.tasks import schedule_checksum_verification
from invenio_mail.tasks import send_email
from sqlalchemy import or_


def failed_checksum_files_query():
    """Build the query selecting files whose last checksum check went wrong.

    A file is selected when its ``last_check`` column is either NULL or
    False.

    :returns: a query over ``FileInstance``.
    """
    # ``==`` (not ``is``) is required so SQLAlchemy emits the SQL comparison.
    errored = FileInstance.last_check == None  # noqa: E711
    mismatched = FileInstance.last_check == False  # noqa: E712
    return FileInstance.query.filter(or_(errored, mismatched))


def _format_file_info(file_info):
"""Format file information."""
return ('Record Id: {0}<br/> Bucket Id: {1}<br/> '
'File key: {2}<br/> Path: {3}<br/> Previous'
' Checksum: {4}').format(
str(file_info[0]), str(file_info[1]),
str(file_info[2]), str(file_info[3]),
str(file_info[4])
)

def notify_admin():
    """Send email to admin with the info about checksum verification errors.

    Two groups of files are reported:

    * files whose ``last_check`` is False (checksum mismatch), all listed;
    * files whose ``last_check`` is None (error while verifying): the total
      count plus a partial list of at most 100 of them.

    Nothing is sent when both groups are empty. The report is sent from and
    to the address found in the ``SUPPORT_EMAIL`` configuration. When that
    setting is missing or empty a warning is logged and no mail is sent,
    instead of addressing the mail to the literal string ``'None'``.
    """
    def _file_info_query(last_check):
        # One row per (record, bucket, key, uri, checksum) for the files
        # whose ``last_check`` equals the given value (None -> IS NULL).
        return db.session.query(
            RecordsBuckets.record_id,
            ObjectVersion.bucket_id,
            ObjectVersion.key,
            FileInstance.uri,
            FileInstance.checksum,
        ).filter(
            RecordsBuckets.bucket_id == ObjectVersion.bucket_id,
            ObjectVersion.file_id == FileInstance.id,
            FileInstance.last_check == last_check,
        )

    # Get all files that didn't match their checksums
    results = _file_info_query(False).all()
    # Get all files for which an error occurred while verifying their checksum
    error_count = FileInstance.query.filter_by(last_check=None).count()
    error_files = []
    if error_count != 0:
        # Only a partial list is fetched to keep the report bounded.
        error_files = _file_info_query(None).limit(100).all()

    msg_content = ''
    if results:
        msg_content = '{0} "{1}" :<br/><br/> {2}<br/><br/>'.format(
            _('List of files with modified checksums on the B2SHARE server '),
            # We reuse JSONSCHEMAS_HOST because SERVER_NAME is not set
            current_app.config['JSONSCHEMAS_HOST'],
            '<br/><br/>'.join([_format_file_info(info) for info in results])
        )
    if error_count != 0:
        msg_content += ('{0}: {1}<br/><br/>{2}:<br/><br/>{3}<br/><br/>'.format(
            _('Number of files for which an error occurred during the '
              'checksum verification (other than a wrong checksum)'),
            str(error_count),
            _('Partial list of files with errors'),
            '<br/><br/>'.join([
                _format_file_info(info) for info in error_files
            ])
        ))
    if not msg_content:
        # Nothing to report.
        return

    support = current_app.config.get('SUPPORT_EMAIL')
    if not support:
        # Without a configured address the mail would be sent from/to 'None'.
        current_app.logger.warning(
            'SUPPORT_EMAIL is not configured: the checksum verification '
            'report could not be sent.')
        return
    support = str(support)

    msg_content += '-- <br>{0} "{1}"'.format(
        _('B2SHARE automatic task for server'),
        current_app.config['JSONSCHEMAS_HOST']
    )
    send_email(dict(
        subject=_('B2SHARE Checksum Verification Report'),
        sender=support,
        recipients=[support],
        html=msg_content,
        body=msg_content
    ))


@shared_task(ignore_result=True)
def schedule_failed_checksum_files(**kwargs):
    """Re-schedule the checksum check of files which failed their last check.

    A file ends up here when a previous scan, scheduled either by
    ``schedule_checksum_verification`` or by this task, did not succeed.
    A scan can fail with an exception, for example when the filesystem is
    not available; this does not mean the file is corrupted, but the check
    has to be run again.

    This task is meant to run more often than
    ``schedule_checksum_verification``: few files should fail their scan,
    and those which do should be rechecked rapidly.

    Once the scheduling call has been applied, ``notify_admin`` is called so
    that any errors are reported by email to the address configured in
    ``SUPPORT_EMAIL``.

    :param dict kwargs: parameters forwarded to
        ``schedule_checksum_verification``. ``files_query`` must not be
        given, it is set by this task.
    """
    assert 'files_query' not in kwargs
    # Restrict the scan to the files selected by failed_checksum_files_query.
    signature = schedule_checksum_verification.s(
        files_query=failed_checksum_files_query, **kwargs)
    # ``apply`` executes the scheduling task locally, before the report.
    signature.apply()
    notify_admin()
2 changes: 1 addition & 1 deletion b2share/modules/records/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def update_expired_embargoes():
datetime.now(timezone.utc).isoformat()
),
allow_leading_wildcard=False
).fields([])
).fields([])
record_ids = [hit.meta.id for hit in s.scan()]
if record_ids:
logger.info('Changing access of {} embargoed publications'
Expand Down
3 changes: 2 additions & 1 deletion b2share/modules/upgrade/upgrades/upgrade_2_0_0_to_2_0_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ def alembic_upgrade_to_2_0_2(alembic, verbose):
# Upgrade alembic recipes for B2SHARE 2.0.2.
for revision in [
'456bf6bcb1e6', # b2share-upgrade
'e12419831262' # invenio-accounts
'e12419831262', # invenio-accounts
'f741aa746a7d' # invenio-files-rest
]:
alembic_upgrade(revision)
db.session.commit()
Expand Down
15 changes: 8 additions & 7 deletions requirements.in
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
celery<4.0,>=3.1
celery>=3.1.24
datacite>=0.3.0
dcxml>=0.1.0
dojson>=1.2.1
doschema>=1.0.0a1
easywebdav>=1.2.0
elasticsearch<3.0.0,>=2.0.0
elasticsearch-dsl<3.0.0,>=2.0.0
Flask-Login<0.4,>=0.3.0
Flask-Login<0.4,>=0.3.2
httplib2>=0.9.2
invenio-access<1.1.0,>=1.0.0a11
invenio-accounts<1.1.0,>=1.0.0b5
invenio-accounts-rest<1.1.0,>=1.0.0a3
invenio-accounts-rest<1.1.0,>=1.0.0a4
invenio-base<1.1.0,>=1.0.0a14
invenio-celery<1.1.0,>=1.0.0b1
invenio-config<1.1.0,>=1.0.0b2
invenio-db<1.1.0,>=1.0.0b5
invenio-db[postgresql,versioning]<1.1.0,>=1.0.0b7
invenio-deposit<1.1.0,>=1.0.0a8
invenio-files-rest<1.1.0,>=1.0.0a17
invenio-files-rest<1.1.0,>=1.0.0a21
invenio-indexer>=1.0.0a9
invenio-logging>=1.0.0a3
invenio-mail<1.1.0,>=1.0.0b1
Expand All @@ -28,8 +28,9 @@ invenio-pidstore<1.1.0,>=v1.0.0b1
invenio-query-parser<1.1.0,>=0.6.0
invenio-records<1.1.0,>=1.0.0b1
invenio-records-files<1.1.0,>=1.0.0a9
invenio-records-rest<1.1.0,>=1.0.0a16
invenio-records-rest<1.1.0,>=1.0.0a17
invenio-rest[cors]<1.1.0,>=1.0.0a10
invenio-search<1.1.0,>=1.0.0a9
invenio-search<1.1.0,>=1.0.0a10
invenio-stats>=1.0.0a7
jsonresolver[jsonschema]>=0.2.1
psycopg2>=2.6.1
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ invenio-assets==1.0.0b6 # via invenio-deposit, invenio-search-ui
invenio-base==1.0.0a14
invenio-celery==1.0.0b3
invenio-config==1.0.0b3
invenio-db[versioning]==1.0.0b8
invenio-db[postgresql,versioning]==1.0.0b8
invenio-deposit==1.0.0a8
invenio-files-rest==1.0.0a17
invenio-files-rest==1.0.0a21
invenio-i18n==1.0.0b4 # via invenio-accounts
invenio-indexer==1.0.0a9
invenio-jsonschemas==1.0.0a5 # via invenio-deposit, invenio-marc21
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
'invenio-config>=1.0.0b2,<1.1.0',
'invenio-db[postgresql,versioning]>=1.0.0b7,<1.1.0',
'invenio-deposit>=1.0.0a8,<1.1.0',
'invenio-files-rest>=1.0.0a17,<1.1.0',
'invenio-files-rest>=1.0.0a21,<1.1.0',
'invenio-mail>=1.0.0b1,<1.1.0',
'invenio-marc21>=1.0.0a3',
'invenio-oaiserver>=1.0.0a9,<1.1.0',
Expand Down Expand Up @@ -229,6 +229,7 @@ def run_tests(self):
],
'invenio_celery.tasks': [
'b2share_records = b2share.modules.records.tasks',
'b2share_files = b2share.modules.files.tasks',
],
'invenio_access.actions': [
'create_deposit_need = '
Expand Down
Loading

0 comments on commit 54f2eda

Please sign in to comment.