aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-01-08 03:01:27 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-01-08 03:01:27 +0100
commit 6499e2911386f3f5e82a589c71da4003043bfc72 (patch)
tree 4903deb4a5d9f1fae5fc730008435912daf3b78f
parent 06da78e2360f803b60fd9a0e28932d825c0a0019 (diff)
download fatcat-6499e2911386f3f5e82a589c71da4003043bfc72.tar.gz
fatcat-6499e2911386f3f5e82a589c71da4003043bfc72.zip
datacite: over 3% records have the same title: stub
The GBIF (https://www.gbif.org/) deposits most records under the titles:

* 599243 GBIF Occurrence Download
* 41176 Occurrence Download

Mark them as "stub" for the moment (https://guide.fatcat.wiki/entity_release.html#release_type-vocabulary).
-rw-r--r-- python/fatcat_tools/importers/datacite.py 7
1 files changed, 7 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 4996fbed..52fede06 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -530,6 +530,13 @@ class DataciteImporter(EntityImporter):
if release_type is None:
print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr)
+ # release_type exception: Global Biodiversity Information Facility
+ # publishes highly interesting datasets, but titles are mostly the same
+ # ("GBIF Occurrence Download" or "Occurrence Download"); set
+ # release_type to "stub" (CSL/FC).
+ if publisher == 'The Global Biodiversity Information Facility':
+ release_type = 'stub'
+
# Language values are varied ("ger", "es", "English", "ENG", "en-us",
# "other", ...). Try to crush it with langcodes: "It may sound to you
# like langcodes solves a pretty boring problem. At one level, that's