diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-08-15 21:47:00 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-08-15 21:47:00 -0700 |
commit | 4c11f65f202ef8f71bfd640232ed30ccd6f4c3a4 (patch) | |
tree | cfa76395e59cbcf0c05f5eb79af2e2013f231080 /python/fatcat | |
parent | 18821fcbfa9eb38ee0fb0b065d6642b461fed021 (diff) | |
download | fatcat-4c11f65f202ef8f71bfd640232ed30ccd6f4c3a4.tar.gz fatcat-4c11f65f202ef8f71bfd640232ed30ccd6f4c3a4.zip |
improve handling of invalid identifiers
Diffstat (limited to 'python/fatcat')
-rw-r--r-- | python/fatcat/importer_common.py | 17 | ||||
-rw-r--r-- | python/fatcat/orcid_importer.py | 7 |
2 files changed, 19 insertions, 5 deletions
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
index 9d495aa7..e084d8c4 100644
--- a/python/fatcat/importer_common.py
+++ b/python/fatcat/importer_common.py
@@ -1,4 +1,5 @@
 
+import re
 import sys
 import csv
 import json
@@ -22,6 +23,7 @@ class FatcatImporter:
         self._orcid_id_map = dict()
         self._doi_id_map = dict()
         self._issn_issnl_map = None
+        self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{4}$")
         if issn_map_file:
             self.read_issn_map_file(issn_map_file)
 
@@ -54,9 +56,11 @@ class FatcatImporter:
         reader = csv.DictReader(source, delimiter=delimiter)
         self.process_batch(reader, size)
 
+    def is_issnl(self, issnl):
+        return len(issnl) == 9 and issnl[4] == '-'
+
     def lookup_issnl(self, issnl):
         """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
-        assert len(issnl) == 9 and issnl[4] == '-'
         if issnl in self._issnl_id_map:
             return self._issnl_id_map[issnl]
         container_id = None
@@ -69,9 +73,13 @@ class FatcatImporter:
         self._issnl_id_map[issnl] = container_id # might be None
         return container_id
 
+    def is_orcid(self, orcid):
+        return self._orcid_regex.match(orcid) != None
+
     def lookup_orcid(self, orcid):
         """Caches calls to the Orcid lookup API endpoint in a local dict"""
-        assert len(orcid) == 19 and orcid[4] == '-'
+        if not self.is_orcid(orcid):
+            return None
         if orcid in self._orcid_id_map:
             return self._orcid_id_map[orcid]
         creator_id = None
@@ -84,9 +92,12 @@ class FatcatImporter:
         self._orcid_id_map[orcid] = creator_id # might be None
         return creator_id
 
+    def is_doi(self, doi):
+        return doi.startswith("10.") and doi.count("/") >= 1
+
     def lookup_doi(self, doi):
         """Caches calls to the doi lookup API endpoint in a local dict"""
-        assert doi.startswith('10.')
+        assert self.is_doi(doi)
         doi = doi.lower()
         if doi in self._doi_id_map:
             return self._doi_id_map[doi]
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index 69b184d5..e57703d5 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -5,7 +5,6 @@ import itertools
 import fatcat_client
 from fatcat.importer_common import FatcatImporter
 
-
 def value_or_none(e):
     if type(e) == dict:
         e = e.get('value')
@@ -46,8 +45,12 @@ class FatcatOrcidImporter(FatcatImporter):
             else:
                 # must have *some* name
                 return None
+        orcid = obj['orcid-identifier']['path']
+        if not self.is_orcid(orcid):
+            sys.stderr.write("Bad ORCID: {}\n".format(orcid))
+            return None
         ce = fatcat_client.CreatorEntity(
-            orcid=obj['orcid-identifier']['path'],
+            orcid=orcid,
             given_name=given,
             surname=sur,
             display_name=display,