about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2019-12-26 17:43:00 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2019-12-28 23:07:32 +0100
commit: 13430af9e8c2e39ba90a7db2135496503fb020b2 (patch)
tree: 68e6619ab4ed3e0633a767270f1bf79e733b97a9
parent: 1f7bbc5a582db45fcd6034800959e158d35a2297 (diff)
download: fatcat-13430af9e8c2e39ba90a7db2135496503fb020b2.tar.gz
download: fatcat-13430af9e8c2e39ba90a7db2135496503fb020b2.zip
datacite: use clean on field values
-rw-r--r-- python/fatcat_tools/importers/datacite.py | 30
1 file changed, 28 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a4a3ef8b..16431928 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -4,7 +4,7 @@ Prototype Importer for datacite.org data.
Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
"""
-from .common import EntityImporter
+from .common import EntityImporter, clean
import dateparser
import datetime
import fatcat_openapi_client
@@ -292,7 +292,20 @@ class DataciteImporter(EntityImporter):
if len(affiliations) == 0:
raw_affiliation = None
else:
- raw_affiliation = affiliations[0]
+ raw_affiliation = clean(affiliations[0])
+
+ name = c.get('name')
+ given_name = c.get('givenName')
+ surname = c.get('familyName')
+
+ if name:
+ name = clean(name)
+
+ if given_name:
+ given_name = clean(given_name)
+
+ if surname:
+ surname = clean(surname)
contribs.append(
fatcat_openapi_client.ReleaseContrib(
@@ -325,9 +338,13 @@ class DataciteImporter(EntityImporter):
if not title:
print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
return False
+ else:
+ title = clean(title)
if not subtitle:
subtitle = None
+ else:
+ subtitle = clean(subtitle)
# Dates. A few internal dates (registered, created, updated) and
# published (0..2554). We try to work with typed date list, in
@@ -352,6 +369,9 @@ class DataciteImporter(EntityImporter):
# werden"
publisher = None
+ if publisher:
+ publisher = clean(publisher)
+
# Container. For the moment, only ISSN as container.
container_id = None
@@ -388,6 +408,12 @@ class DataciteImporter(EntityImporter):
volume = container.get('volume')
issue = container.get('issue')
+ if volume:
+ volume = clean(volume)
+
+ if issue:
+ issue = clean(issue)
+
# Pages.
pages = None