Skip to content

Commit

Permalink
Allow Dep of Ed source to be harvested with its custom schema
Browse files Browse the repository at this point in the history
  • Loading branch information
avdata99 committed Dec 10, 2019
1 parent e2e614d commit 7b1ce01
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 3 deletions.
5 changes: 4 additions & 1 deletion ckanext/datajson/harvester_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
def clean_tags(tags):
ret = []
pattern = re.compile('[^A-Za-z0-9\s_\-!?]+')

for tag in tags:
tag = pattern.sub('', tag).strip()
if len(tag) > MAX_TAG_LENGTH:
Expand All @@ -43,7 +44,7 @@ def clean_tags(tags):
log.error('tag is short: {}'.format(tag))
tag += '_' * (MIN_TAG_LENGTH - len(tag))
if tag != '':
ret.append(tag.capitalize())
ret.append(tag.lower().replace(' ', '-')) # copyin CKAN behaviour
return ret


Expand Down Expand Up @@ -723,6 +724,7 @@ def import_stage(self, harvest_object):

log.warn('updating package %s (%s) from %s' % (pkg["name"], pkg["id"], harvest_object.source.url))
pkg = get_action('package_update')(self.context(), pkg)
log.info('Package updated {}'.format(pkg))
else:
# It doesn't exist yet. Create a new one.
pkg['name'] = self.make_package_name(dataset_processed["title"], harvest_object.guid)
Expand All @@ -739,6 +741,7 @@ def import_stage(self, harvest_object):
except:
log.error('failed to create package %s from %s' % (pkg["name"], harvest_object.source.url))
raise
log.info('Package created {}'.format(pkg))

# Flag the other HarvestObjects linking to this package as not current anymore
for ob in model.Session.query(HarvestObject).filter_by(package_id=pkg["id"]):
Expand Down
2 changes: 2 additions & 0 deletions ckanext/datajson/harvester_datajson.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ckanext.datajson.harvester_base import DatasetHarvesterBase
from parse_datajson import parse_datajson_entry
from parse_dep_of_ed import parse_datajson_entry_for_dep_of_ed_schema
import logging
log = logging.getLogger(__name__)

Expand Down Expand Up @@ -85,6 +86,7 @@ def load_remote_catalog(self, harvest_job):

# Fill `pkg` from one data.json `dataset` entry.
# Runs the generic data.json parser first and then the Dep of Ed schema pass
# on the same `pkg`; both are called for their effect on `pkg` and nothing is
# returned. The call order matters: the second pass sees what the first wrote.
def set_dataset_info(self, pkg, dataset, dataset_defaults, schema_version):
# generic data.json -> CKAN package mapping
parse_datajson_entry(dataset, pkg, dataset_defaults, schema_version)
# Dep of Ed specific adjustments layered on top of the generic result
parse_datajson_entry_for_dep_of_ed_schema(dataset, pkg, dataset_defaults, schema_version)

# helper function to remove BOM
def lstrip_bom(str_):
Expand Down
10 changes: 8 additions & 2 deletions ckanext/datajson/parse_datajson.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from ckan.lib.munge import munge_title_to_name

import logging
import re


log = logging.getLogger(__name__)


def parse_datajson_entry(datajson, package, defaults, schema_version):
# four fields need extra handling, which are
# 1.tag, 2.license, 3.maintainer_email, 4.publisher_hierarchy,
# 5.resources

log.info('Parsing datajson entry: {}'.format(package))
# 1. package["tags"]
package["tags"] = [ { "name": munge_title_to_name(t) } for t in
package.get("tags", "") if t.strip() != ""]
Expand Down Expand Up @@ -142,6 +146,8 @@ def parse_datajson_entry(datajson, package, defaults, schema_version):
r['accessURL'] = accessurl_value

package["resources"].append(r)

log.info('Finished Parsing datajson entry: {}'.format(package))

def extra(package, key, value):
if not value: return
Expand Down
32 changes: 32 additions & 0 deletions ckanext/datajson/parse_dep_of_ed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Temporary fixes to fit the Dep of Ed schema
"""
import logging
import ckan.model as model
log = logging.getLogger(__name__)


def parse_datajson_entry_for_dep_of_ed_schema(datajson, package, defaults, schema_version):
    """Fill in the package fields required by the Dep of Ed schema.

    Temporary fix. ``package`` is updated in place: ``private``, ``author``,
    ``author_email`` and ``spatial`` are always set on return. ``private``
    and ``spatial`` keep any value already present and otherwise receive a
    default.

    Args:
        datajson: the source data.json entry (currently unused).
        package: package dict to update in place.
        defaults: dataset defaults (currently unused).
        schema_version: data.json schema version. ``'1.1'`` takes the author
            from ``contact_name`` / ``contact_email``; ``'1.0'`` takes it from
            ``maintainer`` / ``maintainer_email``. Any other value keeps the
            package's existing ``author`` / ``author_email``.

    Returns:
        None.
    """
    # temporary FIX
    log.info('Parsing datajson entry for dep of ed : %s', package)

    package.setdefault('private', False)

    # Which package keys carry the author for each known schema version.
    author_keys_by_version = {
        '1.1': ('contact_name', 'contact_email'),
        '1.0': ('maintainer', 'maintainer_email'),
    }
    # An unrecognised version used to leave `author` / `author_email` unbound
    # and crash with UnboundLocalError; fall back to the package's own author
    # fields so the defaults below still apply.
    name_key, email_key = author_keys_by_version.get(
        schema_version, ('author', 'author_email'))

    package['author'] = package.get(name_key, 'Unknown author')
    # NOTE(review): the default address below does not look like a real
    # e-mail address - confirm the intended value.
    package['author_email'] = package.get(email_key, '[email protected]')

    # Requires the vocabularies to have been created beforehand:
    #   paster --plugin=ckanext-ed ed create_ed_vocabularies
    package.setdefault('spatial', 'United States')

    log.info('Finished Parsing datajson entry for dep of ed : %s', package)

0 comments on commit 7b1ce01

Please sign in to comment.