From 06da78e2360f803b60fd9a0e28932d825c0a0019 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 8 Jan 2020 02:31:46 +0100 Subject: datacite: fill a few more release_type gaps * citeproc: http://docs.citationstyles.org/en/stable/specification.html#appendix-iii-types * resourceTypeGeneral: https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 * resourceType: uncontrolled, over 170000 distinct values, frequent: null, Dataset, JournalArticle, PGRFA Material, Journal Article, Dataset/UNITE Species Hypothesis, ... General frequency: * "attributes.types": 18210075, * "attributes.types.ris": 18058890, * "attributes.types.bibtex": 18058888, * "attributes.types.citeproc": 18058890, * "attributes.types.schemaOrg": 18058929, * "attributes.types.resourceType": 12737988, * "attributes.types.resourceTypeGeneral": 16576139, --- python/fatcat_tools/importers/datacite.py | 35 ++++++++++++++++--------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index c2725aeb..4996fbed 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -41,28 +41,28 @@ CONTAINER_TYPE_MAP = { DATACITE_TYPE_MAP = { 'ris': { 'THES': 'thesis', - 'SOUND': None, + 'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report) 'CHAP': 'chapter', - 'FIGURE': None, + 'FIGURE': 'figure', 'RPRT': 'report', 'JOUR': 'article-journal', - 'MPCT': None, - 'GEN': None, + 'MPCT': 'motion_picture', + 'GEN': 'article-journal', # GEN consist of 99% article and report, post-weblog, misc - and one dataset 'BOOK': 'book', 'DATA': 'dataset', - 'COMP': None, + 'COMP': 'software', }, 'schemaOrg': { 'Dataset': 'dataset', 'Book': 'book', - 'ScholarlyArticle': 'article', + 'ScholarlyArticle': 'article-journal', 'ImageObject': 'graphic', 'Collection': None, 'MediaObject': None, 'Event': None, - 'SoftwareSourceCode': None, + 
'SoftwareSourceCode': 'software', 'Chapter': 'chapter', - 'CreativeWork': None, + 'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. 'PublicationIssue': 'article', 'AudioObject': None, 'Thesis': 'thesis', @@ -112,19 +112,19 @@ DATACITE_TYPE_MAP = { 'book': 'book', }, 'resourceTypeGeneral': { - 'Image': None, + 'Image': 'graphic', 'Dataset': 'dataset', 'PhysicalObject': None, 'Collection': None, - 'Text': None, + 'Text': None, # "Grey literature, lab notes, accompanying materials" 'Sound': None, 'InteractiveResource': None, 'Event': None, - 'Software': None, + 'Software': 'software', 'Other': None, 'Workflow': None, 'Audiovisual': None, - } + } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 } # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. @@ -516,11 +516,12 @@ class DataciteImporter(EntityImporter): license_extra.append(l) # Release type. Try to determine the release type from a variety of - # types supplied in datacite. The "attributes.types.resourceType" - # contains too many (176 in sample) things for now; citeproc may be the - # closest, but not always supplied. - for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', - 'bibtex', 'ris'): + # types supplied in datacite. The "attributes.types.resourceType" is + # uncontrolled (170000+ unique values, from "null", "Dataset" to + # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP + # flows in 2009"); citeproc may be the closest, but not always supplied. + # Order lookup roughly by completeness of mapping. + for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): value = attributes.get('types', {}).get(typeType) release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) if release_type is not None: -- cgit v1.2.3