diff options
Diffstat (limited to 'python/fatcat_tools/importers/crossref.py')
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 41 |
1 files changed, 22 insertions, 19 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 22abd08d..cbb6deb5 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -6,7 +6,7 @@ import datetime import itertools import subprocess import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean # The docs/guide should be the cannonical home for these mappings; update there @@ -169,7 +169,7 @@ class CrossrefImporter(EntityImporter): raw_affiliation = am.get('affiliation')[0]['name'] if len(am.get('affiliation')) > 1: # note: affiliation => affiliations - extra['affiliations'] = [a['name'] for a in am.get('affiliation')[1:]] + extra['affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] if am.get('sequence') and am.get('sequence') != "additional": extra['sequence'] = am.get('sequence') if not extra: @@ -178,8 +178,8 @@ class CrossrefImporter(EntityImporter): contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, - raw_name=raw_name, - raw_affiliation=raw_affiliation, + raw_name=clean(raw_name), + raw_affiliation=clean(raw_affiliation), role=ctype, extra=extra)) return contribs @@ -199,9 +199,9 @@ class CrossrefImporter(EntityImporter): and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, - publisher=publisher, + publisher=clean(publisher), container_type=self.map_container_type(release_type), - name=obj['container-title'][0]) + name=clean(obj['container-title'][0], force_xml=True)) ce_edit = self.create_container(ce) container_id = ce_edit.ident @@ -257,10 +257,10 @@ class CrossrefImporter(EntityImporter): # doing lookups would be a second import pass target_release_id=None, key=key, - year=year, - container_name=container_name, - title=ref_title, - locator=ref_locator, + year=clean(year), + container_name=clean(container_name), + title=clean(ref_title), + locator=clean(ref_locator), # TODO: just dump JSON somewhere here? extra=extra)) @@ -269,7 +269,7 @@ class CrossrefImporter(EntityImporter): if obj.get('abstract') != None: abstracts.append(fatcat_client.ReleaseEntityAbstracts( mimetype="application/xml+jats", - content=obj.get('abstract'))) + content=clean(obj.get('abstract')))) # extra fields extra = dict() @@ -279,13 +279,16 @@ class CrossrefImporter(EntityImporter): # TODO: unpack "container-title" array val = obj.get(key) if val: - extra[key] = val + if type(val) == str: + extra[key] = clean(val) + else: + extra[key] = val if 'license' in extra and extra['license']: for i in range(len(extra['license'])): if 'start' in extra['license'][i]: extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] if len(obj['title']) > 1: - extra['other-titles'] = obj['title'][1:] + extra['other-titles'] = [clean(t) for t in obj['title'][1:]] # TODO: this should be top-level extra['is_kept'] = len(obj.get('archive', [])) > 0 @@ -329,13 +332,13 @@ class CrossrefImporter(EntityImporter): re = fatcat_client.ReleaseEntity( work_id=None, container_id=container_id, - title=obj.get('title', [None])[0], - original_title=obj.get('original-title', [None])[0], + title=clean(obj.get('title', [None])[0], force_xml=True), + original_title=clean(obj.get('original-title', [None])[0]), release_type=release_type, release_status=release_status, release_date=release_date, release_year=release_year, - publisher=publisher, + publisher=clean(publisher), doi=obj['DOI'].lower(), pmid=extids['pmid'], pmcid=extids['pmcid'], @@ -344,9 +347,9 @@ class CrossrefImporter(EntityImporter): core_id=extids['core_id'], arxiv_id=extids['arxiv_id'], jstor_id=extids['jstor_id'], - volume=obj.get('volume'), - issue=obj.get('issue'), - pages=obj.get('page'), + volume=clean(obj.get('volume')), + issue=clean(obj.get('issue')), + pages=clean(obj.get('page')), language=None, # crossref doesn't supply language info license_slug=license_slug, extra=dict(crossref=extra), |