about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2019-12-31 23:42:30 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2019-12-31 23:42:30 +0100
commit 90eb8a70796230b29ec19142482f2503bae55252 (patch)
tree 895560ac5891894b32ed051ea4d1754bb036c833
parent 02f5298be0da677e52621e7e6be682e07b9fce7e (diff)
download fatcat-90eb8a70796230b29ec19142482f2503bae55252.tar.gz
fatcat-90eb8a70796230b29ec19142482f2503bae55252.zip
datacite: address 'Unpublished' publisher
-rw-r--r-- python/fatcat_tools/importers/datacite.py 19
1 files changed, 10 insertions, 9 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index f8080c10..854085b8 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -378,12 +378,21 @@ class DataciteImporter(EntityImporter):
release_date, release_year = parse_datacite_dates(
attributes.get('dates', []))
+ # Start with clear stages, e.g. published. TODO(martin): we could
+ # probably infer a bit more from the relations, e.g.
+ # "IsPreviousVersionOf" or "IsNewVersionOf".
+ release_stage = None
+ if attributes.get(
+ 'state') == 'findable' or attributes.get('isActive') is True:
+ release_stage = 'published'
+
# Publisher. A few NA values. A few bogus values.
publisher = attributes.get('publisher')
if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)',
- '(:none)'):
+ '(:none)', 'Unpublished'):
publisher = None
+ release_stage = None
if publisher is not None and len(publisher) > 80:
# Arbitrary magic value max length. TODO(martin): better heuristic,
# but factored out; first we have to log misses. Example:
@@ -561,14 +570,6 @@ class DataciteImporter(EntityImporter):
))
ref_index += 1
- # Start with clear stages, e.g. published. TODO(martin): we could
- # probably infer a bit more from the relations, e.g.
- # "IsPreviousVersionOf" or "IsNewVersionOf".
- release_stage = None
- if attributes.get(
- 'state') == 'findable' or attributes.get('isActive') is True:
- release_stage = 'published'
-
# Extra information.
extra_datacite = dict()