diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-13 17:08:28 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-13 17:08:28 -0700 |
commit | c67fb5d850ec6bd6659ada8ce8162a8859dafe15 (patch) | |
tree | 0fad6d0d486854ecda4604127b458568cca15af5 /python/fatcat_tools | |
parent | c9e067e4b7c23b4d871ac091c7a9ec5a6650e909 (diff) | |
download | fatcat-c67fb5d850ec6bd6659ada8ce8162a8859dafe15.tar.gz fatcat-c67fb5d850ec6bd6659ada8ce8162a8859dafe15.zip |
importer code updates
Diffstat (limited to 'python/fatcat_tools')
-rwxr-xr-x | python/fatcat_tools/importers/cdl_dash_dat.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 16 | ||||
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 2 | ||||
-rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py | 1 |
4 files changed, 18 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index e1252b6d..c1ea075d 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -70,7 +70,6 @@ def cdl_dash_release(meta, extra=None): if extid['value'].startswith('ark:'): ark_id = extid['value'] assert ark_id - extra['ark_id'] = ark_id license_slug = lookup_license_slug(meta['rights']['uri']) @@ -98,6 +97,7 @@ def cdl_dash_release(meta, extra=None): r = ReleaseEntity( ext_ids=ReleaseEntityExtIds( doi=doi, + ark=ark_id, ), title=clean(meta['title'], force_xml=True), publisher=clean(meta['publisher']), diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 999ce13f..c875010c 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -218,6 +218,8 @@ class CrossrefImporter(EntityImporter): creator_id=creator_id, index=index, raw_name=raw_name, + given_name=clean(am.get('given')), + surname=clean(am.get('family')), raw_affiliation=clean(raw_affiliation), role=ctype, extra=extra)) @@ -318,13 +320,15 @@ class CrossrefImporter(EntityImporter): if not container_id: if obj.get('container-title'): extra['container_name'] = clean(obj['container-title'][0]) - for key in ('group-title', 'subtitle'): + for key in ('group-title'): val = obj.get(key) if val: if type(val) == list: val = val[0] if type(val) == str: - extra[key] = clean(val) + val = clean(val) + if val: + extra[key] = clean(val) else: extra[key] = val # crossref-nested extra keys @@ -397,6 +401,13 @@ class CrossrefImporter(EntityImporter): # title can't be just a single character return None + subtitle = None + if obj.get('subtitle'): + subtitle = clean(obj.get('subtitle')[0], force_xml=True) + if not subtitle or len(subtitle) <= 1: + # subtitle can't be just a single character + return None + if extra_crossref: extra['crossref'] = extra_crossref if not extra: @@ -406,6 +417,7 @@ class CrossrefImporter(EntityImporter): work_id=None, container_id=container_id, title=title, + subtitle=subtitle, original_title=original_title, release_type=release_type, release_stage=release_stage, diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index ba91d183..9e99bc0a 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -94,6 +94,8 @@ class GrobidMetadataImporter(EntityImporter): contribs.append(fatcat_client.ReleaseContrib( index=i, raw_name=clean(a['name']), + given_name=clean(a.get('given_name')), + surname=clean(a.get('surname')), role="author", extra=None)) diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index 114920f7..c4f4f21e 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -120,6 +120,7 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): resp.raise_for_status() assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() + webcapture_cdx.size_bytes = len(resp.content) return webcapture_cdx else: return None |