From 90eb8a70796230b29ec19142482f2503bae55252 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 31 Dec 2019 23:42:30 +0100 Subject: datacite: address 'Unpublished' publisher --- python/fatcat_tools/importers/datacite.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index f8080c10..854085b8 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -378,12 +378,21 @@ class DataciteImporter(EntityImporter): release_date, release_year = parse_datacite_dates( attributes.get('dates', [])) + # Start with clear stages, e.g. published. TODO(martin): we could + # probably infer a bit more from the relations, e.g. + # "IsPreviousVersionOf" or "IsNewVersionOf". + release_stage = None + if attributes.get( + 'state') == 'findable' or attributes.get('isActive') is True: + release_stage = 'published' + # Publisher. A few NA values. A few bogus values. publisher = attributes.get('publisher') if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', - '(:none)'): + '(:none)', 'Unpublished'): publisher = None + release_stage = None if publisher is not None and len(publisher) > 80: # Arbitrary magic value max length. TODO(martin): better heuristic, # but factored out; first we have to log misses. Example: @@ -561,14 +570,6 @@ class DataciteImporter(EntityImporter): )) ref_index += 1 - # Start with clear stages, e.g. published. TODO(martin): we could - # probably infer a bit more from the relations, e.g. - # "IsPreviousVersionOf" or "IsNewVersionOf". - release_stage = None - if attributes.get( - 'state') == 'findable' or attributes.get('isActive') is True: - release_stage = 'published' - # Extra information. extra_datacite = dict() -- cgit v1.2.3