From 6499e2911386f3f5e82a589c71da4003043bfc72 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 8 Jan 2020 03:01:27 +0100 Subject: datacite: over 3% of records have the same title: stub The GBIF (https://www.gbif.org/) deposits most records under the titles: * 599243 GBIF Occurrence Download * 41176 Occurrence Download Mark them as "stub" for the moment (https://guide.fatcat.wiki/entity_release.html#release_type-vocabulary). --- python/fatcat_tools/importers/datacite.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 4996fbed..52fede06 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -530,6 +530,13 @@ class DataciteImporter(EntityImporter): if release_type is None: print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr) + # release_type exception: Global Biodiversity Information Facility + # publishes highly interesting datasets, but titles are mostly the same + # ("GBIF Occurrence Download" or "Occurrence Download"); set + # release_type to "stub" (CSL/FC). + if publisher == 'The Global Biodiversity Information Facility': + release_type = 'stub' + # Language values are varied ("ger", "es", "English", "ENG", "en-us", # "other", ...). Try to crush it with langcodes: "It may sound to you # like langcodes solves a pretty boring problem. At one level, that's -- cgit v1.2.3