diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2019-12-26 17:43:00 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-28 23:07:32 +0100 |
commit | 13430af9e8c2e39ba90a7db2135496503fb020b2 (patch) | |
tree | 68e6619ab4ed3e0633a767270f1bf79e733b97a9 | |
parent | 1f7bbc5a582db45fcd6034800959e158d35a2297 (diff) | |
download | fatcat-13430af9e8c2e39ba90a7db2135496503fb020b2.tar.gz fatcat-13430af9e8c2e39ba90a7db2135496503fb020b2.zip |
datacite: use clean on field values
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 30 |
1 files changed, 28 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a4a3ef8b..16431928 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -4,7 +4,7 @@ Prototype Importer for datacite.org data.
 Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
 """
 
-from .common import EntityImporter
+from .common import EntityImporter, clean
 import dateparser
 import datetime
 import fatcat_openapi_client
@@ -292,7 +292,20 @@ class DataciteImporter(EntityImporter):
                 if len(affiliations) == 0:
                     raw_affiliation = None
                 else:
-                    raw_affiliation = affiliations[0]
+                    raw_affiliation = clean(affiliations[0])
+
+                name = c.get('name')
+                given_name = c.get('givenName')
+                surname = c.get('familyName')
+
+                if name:
+                    name = clean(name)
+
+                if given_name:
+                    given_name = clean(given_name)
+
+                if surname:
+                    surname = clean(surname)
 
                 contribs.append(
                     fatcat_openapi_client.ReleaseContrib(
@@ -325,9 +338,13 @@ class DataciteImporter(EntityImporter):
         if not title:
             print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
             return False
+        else:
+            title = clean(title)
 
         if not subtitle:
             subtitle = None
+        else:
+            subtitle = clean(subtitle)
 
         # Dates. A few internal dates (registered, created, updated) and
         # published (0..2554). We try to work with typed date list, in
@@ -352,6 +369,9 @@ class DataciteImporter(EntityImporter):
             # werden"
             publisher = None
 
+        if publisher:
+            publisher = clean(publisher)
+
         # Container. For the moment, only ISSN as container.
         container_id = None
 
@@ -388,6 +408,12 @@ class DataciteImporter(EntityImporter):
         volume = container.get('volume')
         issue = container.get('issue')
 
+        if volume:
+            volume = clean(volume)
+
+        if issue:
+            issue = clean(issue)
+
         # Pages.
         pages = None
 