From c67fb5d850ec6bd6659ada8ce8162a8859dafe15 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 13 May 2019 17:08:28 -0700 Subject: importer code updates --- python/fatcat_tools/importers/cdl_dash_dat.py | 2 +- python/fatcat_tools/importers/crossref.py | 16 ++++++++++++++-- python/fatcat_tools/importers/grobid_metadata.py | 2 ++ python/fatcat_tools/importers/wayback_static.py | 1 + 4 files changed, 18 insertions(+), 3 deletions(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index e1252b6d..c1ea075d 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -70,7 +70,6 @@ def cdl_dash_release(meta, extra=None): if extid['value'].startswith('ark:'): ark_id = extid['value'] assert ark_id - extra['ark_id'] = ark_id license_slug = lookup_license_slug(meta['rights']['uri']) @@ -98,6 +97,7 @@ def cdl_dash_release(meta, extra=None): r = ReleaseEntity( ext_ids=ReleaseEntityExtIds( doi=doi, + ark=ark_id, ), title=clean(meta['title'], force_xml=True), publisher=clean(meta['publisher']), diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 999ce13f..c875010c 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -218,6 +218,8 @@ class CrossrefImporter(EntityImporter): creator_id=creator_id, index=index, raw_name=raw_name, + given_name=clean(am.get('given')), + surname=clean(am.get('family')), raw_affiliation=clean(raw_affiliation), role=ctype, extra=extra)) @@ -318,13 +320,15 @@ class CrossrefImporter(EntityImporter): if not container_id: if obj.get('container-title'): extra['container_name'] = clean(obj['container-title'][0]) - for key in ('group-title', 'subtitle'): + for key in ('group-title'): val = obj.get(key) if val: if type(val) == list: val = val[0] if type(val) == str: - extra[key] = clean(val) + val = clean(val) + if val: + extra[key] = clean(val) else: extra[key] = val # crossref-nested extra keys @@ -397,6 +401,13 @@ class CrossrefImporter(EntityImporter): # title can't be just a single character return None + subtitle = None + if obj.get('subtitle'): + subtitle = clean(obj.get('subtitle')[0], force_xml=True) + if not subtitle or len(subtitle) <= 1: + # subtitle can't be just a single character + return None + if extra_crossref: extra['crossref'] = extra_crossref if not extra: @@ -406,6 +417,7 @@ class CrossrefImporter(EntityImporter): work_id=None, container_id=container_id, title=title, + subtitle=subtitle, original_title=original_title, release_type=release_type, release_stage=release_stage, diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index ba91d183..9e99bc0a 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -94,6 +94,8 @@ class GrobidMetadataImporter(EntityImporter): contribs.append(fatcat_client.ReleaseContrib( index=i, raw_name=clean(a['name']), + given_name=clean(a.get('given_name')), + surname=clean(a.get('surname')), role="author", extra=None)) diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index 114920f7..c4f4f21e 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -120,6 +120,7 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): resp.raise_for_status() assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() + webcapture_cdx.size_bytes = len(resp.content) return webcapture_cdx else: return None -- cgit v1.2.3