From 3590cf0e06b6c4f1b1c9621a94c9567e398bca04 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 6 Jan 2020 21:47:13 +0100 Subject: datacite: clean abstracts, use unknown value tokens Datacite defines placeholders for unknown values: * https://support.datacite.org/docs/schema-values-unknown-information-v43 Clean abstracts. --- python/fatcat_tools/importers/datacite.py | 30 +++++++++++++++++++--- .../tests/files/datacite/datacite_result_05.json | 2 +- .../tests/files/datacite/datacite_result_08.json | 2 +- .../tests/files/datacite/datacite_result_14.json | 2 +- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index c3d6138e..f9d1b49a 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -125,6 +125,29 @@ DATACITE_TYPE_MAP = { } } +# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. +DATACITE_UNKNOWN_MARKERS = ( + '(:unac)', # temporarily inaccessible + '(:unal)', # unallowed, suppressed intentionally + '(:unap)', # not applicable, makes no sense + '(:unas)', # value unassigned (e.g., Untitled) + '(:unav)', # value unavailable, possibly unknown + '(:unkn)', # known to be unknown (e.g., Anonymous, Inconnue) + '(:none)', # never had a value, never will + '(:null)', # explicitly and meaningfully empty + '(:tba)', # to be assigned or announced later + '(:etal)', # too numerous to list (et alia) +) + +# UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking +# unknown values. +UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( + 'NA', + 'NN', + 'n.a.', + '[s.n.]', +))) + # TODO(martin): merge this with other maps, maybe. LICENSE_SLUG_MAP = { "//creativecommons.org/licenses/by/2.0/": "CC-BY", @@ -326,7 +349,7 @@ class DataciteImporter(EntityImporter): if raw_affiliation == '': continue - if name in ('(:Unav)', 'NA', 'NN', '(:Null)'): + if name.lower() in UNKNOWN_MARKERS: continue # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. @@ -345,7 +368,7 @@ class DataciteImporter(EntityImporter): )) elif nameType == 'Organizational': name = c.get('name', '') or '' - if name == 'NN': + if name in UNKNOWN_MARKERS: continue if len(name) < 3: continue @@ -394,8 +417,7 @@ class DataciteImporter(EntityImporter): # Publisher. A few NA values. A few bogus values. publisher = attributes.get('publisher') - if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', - '(:none)', 'Unpublished'): + if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): publisher = None release_stage = None if publisher is not None and len(publisher) > 80: diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index ff998c0f..1840884e 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -523,7 +523,7 @@ "refs": [], "abstracts": [ { - "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.", + "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.", "mimetype": "text/plain", "lang": "en" } diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json index cc0e968b..46ef5b44 100644 --- a/python/tests/files/datacite/datacite_result_08.json +++ b/python/tests/files/datacite/datacite_result_08.json @@ -46,7 +46,7 @@ "refs": [], "abstracts": [ { - "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan\u2019s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.", + "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan's irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.", "mimetype": "text/plain", "lang": "en" } diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json index 4521f891..c3719aeb 100644 --- a/python/tests/files/datacite/datacite_result_14.json +++ b/python/tests/files/datacite/datacite_result_14.json @@ -103,7 +103,7 @@ "refs": [], "abstracts": [ { - "content": "An entry from the Cambridge Structural Database, the world\u2019s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.", + "content": "An entry from the Cambridge Structural Database, the world's repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.", "mimetype": "text/plain", "lang": "en" } -- cgit v1.2.3