summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/datacite.py
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-01-08 02:31:46 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-01-08 02:31:46 +0100
commit06da78e2360f803b60fd9a0e28932d825c0a0019 (patch)
tree114ef0e0a22d4503b7fa983969f2c45518fef3e9 /python/fatcat_tools/importers/datacite.py
parentf9c711f77bba992e6e9e1d75929d35e8da828f61 (diff)
downloadfatcat-06da78e2360f803b60fd9a0e28932d825c0a0019.tar.gz
fatcat-06da78e2360f803b60fd9a0e28932d825c0a0019.zip
datacite: fill a few more release_type gaps
* citeproc: http://docs.citationstyles.org/en/stable/specification.html#appendix-iii-types
* resourceTypeGeneral: https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
* resourceType: uncontrolled, over 170000 distinct values, frequent: null, Dataset, JournalArticle, PGRFA Material, Journal Article, Dataset/UNITE Species Hypothesis, ...

General frequency:

* "attributes.types": 18210075,
* "attributes.types.ris": 18058890,
* "attributes.types.bibtex": 18058888,
* "attributes.types.citeproc": 18058890,
* "attributes.types.schemaOrg": 18058929,
* "attributes.types.resourceType": 12737988,
* "attributes.types.resourceTypeGeneral": 16576139,
Diffstat (limited to 'python/fatcat_tools/importers/datacite.py')
-rw-r--r--python/fatcat_tools/importers/datacite.py35
1 files changed, 18 insertions, 17 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index c2725aeb..4996fbed 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -41,28 +41,28 @@ CONTAINER_TYPE_MAP = {
DATACITE_TYPE_MAP = {
'ris': {
'THES': 'thesis',
- 'SOUND': None,
+ 'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report)
'CHAP': 'chapter',
- 'FIGURE': None,
+ 'FIGURE': 'figure',
'RPRT': 'report',
'JOUR': 'article-journal',
- 'MPCT': None,
- 'GEN': None,
+ 'MPCT': 'motion_picture',
+ 'GEN': 'article-journal', # GEN consists of 99% article and report, post-weblog, misc - and one dataset
'BOOK': 'book',
'DATA': 'dataset',
- 'COMP': None,
+ 'COMP': 'software',
},
'schemaOrg': {
'Dataset': 'dataset',
'Book': 'book',
- 'ScholarlyArticle': 'article',
+ 'ScholarlyArticle': 'article-journal',
'ImageObject': 'graphic',
'Collection': None,
'MediaObject': None,
'Event': None,
- 'SoftwareSourceCode': None,
+ 'SoftwareSourceCode': 'software',
'Chapter': 'chapter',
- 'CreativeWork': None,
+ 'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
'PublicationIssue': 'article',
'AudioObject': None,
'Thesis': 'thesis',
@@ -112,19 +112,19 @@ DATACITE_TYPE_MAP = {
'book': 'book',
},
'resourceTypeGeneral': {
- 'Image': None,
+ 'Image': 'graphic',
'Dataset': 'dataset',
'PhysicalObject': None,
'Collection': None,
- 'Text': None,
+ 'Text': None, # "Grey literature, lab notes, accompanying materials"
'Sound': None,
'InteractiveResource': None,
'Event': None,
- 'Software': None,
+ 'Software': 'software',
'Other': None,
'Workflow': None,
'Audiovisual': None,
- }
+ } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
}
# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
@@ -516,11 +516,12 @@ class DataciteImporter(EntityImporter):
license_extra.append(l)
# Release type. Try to determine the release type from a variety of
- # types supplied in datacite. The "attributes.types.resourceType"
- # contains too many (176 in sample) things for now; citeproc may be the
- # closest, but not always supplied.
- for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg',
- 'bibtex', 'ris'):
+ # types supplied in datacite. The "attributes.types.resourceType" is
+ # uncontrolled (170000+ unique values, from "null", "Dataset" to
+ # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP
+ # flows in 2009"); citeproc may be the closest, but is not always supplied.
+ # Order lookup roughly by completeness of mapping.
+ for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
value = attributes.get('types', {}).get(typeType)
release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
if release_type is not None: