diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2019-12-31 23:42:30 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-31 23:42:30 +0100 |
commit | 90eb8a70796230b29ec19142482f2503bae55252 (patch) | |
tree | 895560ac5891894b32ed051ea4d1754bb036c833 | |
parent | 02f5298be0da677e52621e7e6be682e07b9fce7e (diff) | |
download | fatcat-90eb8a70796230b29ec19142482f2503bae55252.tar.gz fatcat-90eb8a70796230b29ec19142482f2503bae55252.zip |
datacite: address 'Unpublished' publisher
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 19 |
1 files changed, 10 insertions, 9 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index f8080c10..854085b8 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -378,12 +378,21 @@ class DataciteImporter(EntityImporter):
         release_date, release_year = parse_datacite_dates(
             attributes.get('dates', []))
 
+        # Start with clear stages, e.g. published. TODO(martin): we could
+        # probably infer a bit more from the relations, e.g.
+        # "IsPreviousVersionOf" or "IsNewVersionOf".
+        release_stage = None
+        if attributes.get(
+                'state') == 'findable' or attributes.get('isActive') is True:
+            release_stage = 'published'
+
         # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
 
         if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)',
-                         '(:none)'):
+                         '(:none)', 'Unpublished'):
             publisher = None
+            release_stage = None
         if publisher is not None and len(publisher) > 80:
             # Arbitrary magic value max length. TODO(martin): better heuristic,
             # but factored out; first we have to log misses. Example:
@@ -561,14 +570,6 @@ class DataciteImporter(EntityImporter):
                 ))
             ref_index += 1
 
-        # Start with clear stages, e.g. published. TODO(martin): we could
-        # probably infer a bit more from the relations, e.g.
-        # "IsPreviousVersionOf" or "IsNewVersionOf".
-        release_stage = None
-        if attributes.get(
-                'state') == 'findable' or attributes.get('isActive') is True:
-            release_stage = 'published'
-
         # Extra information.
         extra_datacite = dict()
 