diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-01-06 21:47:13 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-01-06 21:50:13 +0100 | 
| commit | 3590cf0e06b6c4f1b1c9621a94c9567e398bca04 (patch) | |
| tree | 47da43b9f9e6670316b17b32531b743c5f98d3f5 /python/fatcat_tools/importers | |
| parent | 582e18d3b9b4599604cddacd526f9b81c1d117d4 (diff) | |
| download | fatcat-3590cf0e06b6c4f1b1c9621a94c9567e398bca04.tar.gz fatcat-3590cf0e06b6c4f1b1c9621a94c9567e398bca04.zip | |
datacite: clean abstracts, use unknown value tokens
Datacite defines placeholders for unknown values:
* https://support.datacite.org/docs/schema-values-unknown-information-v43
Clean abstracts.
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 30 | 
1 files changed, 26 insertions, 4 deletions
| diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index c3d6138e..f9d1b49a 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -125,6 +125,29 @@ DATACITE_TYPE_MAP = {      }  } +# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. +DATACITE_UNKNOWN_MARKERS = ( +    '(:unac)',  # temporarily inaccessible +    '(:unal)',  # unallowed, suppressed intentionally +    '(:unap)',  # not applicable, makes no sense +    '(:unas)',  # value unassigned (e.g., Untitled) +    '(:unav)',  # value unavailable, possibly unknown +    '(:unkn)',  # known to be unknown (e.g., Anonymous, Inconnue) +    '(:none)',  # never had a value, never will +    '(:null)',  # explicitly and meaningfully empty +    '(:tba)',  # to be assigned or announced later +    '(:etal)',  # too numerous to list (et alia) +) + +# UNKNOWN_MARKERS joins official datacite markers with generic tokens marking +# unknown values. +UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( +    'NA', +    'NN', +    'n.a.', +    '[s.n.]', +))) +  # TODO(martin): merge this with other maps, maybe.  LICENSE_SLUG_MAP = {      "//creativecommons.org/licenses/by/2.0/": "CC-BY", @@ -326,7 +349,7 @@ class DataciteImporter(EntityImporter):                  if raw_affiliation == '':                      continue -                if name in ('(:Unav)', 'NA', 'NN', '(:Null)'): +                if name.lower() in UNKNOWN_MARKERS:                      continue                  # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A Razis'. 
@@ -345,7 +368,7 @@ class DataciteImporter(EntityImporter):                      ))              elif nameType == 'Organizational':                  name = c.get('name', '') or '' -                if name == 'NN': +                if name in UNKNOWN_MARKERS:                      continue                  if len(name) < 3:                      continue @@ -394,8 +417,7 @@ class DataciteImporter(EntityImporter):          # Publisher. A few NA values. A few bogus values.          publisher = attributes.get('publisher') -        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', -                         '(:none)', 'Unpublished'): +        if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')):              publisher = None              release_stage = None          if publisher is not None and len(publisher) > 80: | 
