summary refs log tree commit diff stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-05-13 17:08:28 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-05-13 17:08:28 -0700
commit c67fb5d850ec6bd6659ada8ce8162a8859dafe15 (patch)
tree 0fad6d0d486854ecda4604127b458568cca15af5 /python/fatcat_tools
parent c9e067e4b7c23b4d871ac091c7a9ec5a6650e909 (diff)
download fatcat-c67fb5d850ec6bd6659ada8ce8162a8859dafe15.tar.gz
fatcat-c67fb5d850ec6bd6659ada8ce8162a8859dafe15.zip
importer code updates
Diffstat (limited to 'python/fatcat_tools')
-rwxr-xr-x python/fatcat_tools/importers/cdl_dash_dat.py 2
-rw-r--r-- python/fatcat_tools/importers/crossref.py 16
-rw-r--r-- python/fatcat_tools/importers/grobid_metadata.py 2
-rwxr-xr-x python/fatcat_tools/importers/wayback_static.py 1
4 files changed, 18 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
index e1252b6d..c1ea075d 100755
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -70,7 +70,6 @@ def cdl_dash_release(meta, extra=None):
if extid['value'].startswith('ark:'):
ark_id = extid['value']
assert ark_id
- extra['ark_id'] = ark_id
license_slug = lookup_license_slug(meta['rights']['uri'])
@@ -98,6 +97,7 @@ def cdl_dash_release(meta, extra=None):
r = ReleaseEntity(
ext_ids=ReleaseEntityExtIds(
doi=doi,
+ ark=ark_id,
),
title=clean(meta['title'], force_xml=True),
publisher=clean(meta['publisher']),
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 999ce13f..c875010c 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -218,6 +218,8 @@ class CrossrefImporter(EntityImporter):
creator_id=creator_id,
index=index,
raw_name=raw_name,
+ given_name=clean(am.get('given')),
+ surname=clean(am.get('family')),
raw_affiliation=clean(raw_affiliation),
role=ctype,
extra=extra))
@@ -318,13 +320,15 @@ class CrossrefImporter(EntityImporter):
if not container_id:
if obj.get('container-title'):
extra['container_name'] = clean(obj['container-title'][0])
- for key in ('group-title', 'subtitle'):
+ for key in ('group-title'):
val = obj.get(key)
if val:
if type(val) == list:
val = val[0]
if type(val) == str:
- extra[key] = clean(val)
+ val = clean(val)
+ if val:
+ extra[key] = clean(val)
else:
extra[key] = val
# crossref-nested extra keys
@@ -397,6 +401,13 @@ class CrossrefImporter(EntityImporter):
# title can't be just a single character
return None
+ subtitle = None
+ if obj.get('subtitle'):
+ subtitle = clean(obj.get('subtitle')[0], force_xml=True)
+ if not subtitle or len(subtitle) <= 1:
+ # subtitle can't be just a single character
+ return None
+
if extra_crossref:
extra['crossref'] = extra_crossref
if not extra:
@@ -406,6 +417,7 @@ class CrossrefImporter(EntityImporter):
work_id=None,
container_id=container_id,
title=title,
+ subtitle=subtitle,
original_title=original_title,
release_type=release_type,
release_stage=release_stage,
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index ba91d183..9e99bc0a 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -94,6 +94,8 @@ class GrobidMetadataImporter(EntityImporter):
contribs.append(fatcat_client.ReleaseContrib(
index=i,
raw_name=clean(a['name']),
+ given_name=clean(a.get('given_name')),
+ surname=clean(a.get('surname')),
role="author",
extra=None))
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 114920f7..c4f4f21e 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -120,6 +120,7 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
resp.raise_for_status()
assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
+ webcapture_cdx.size_bytes = len(resp.content)
return webcapture_cdx
else:
return None