aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-08-10 17:34:50 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2020-08-11 15:32:28 -0700
commit 211ef075f5ac2960fa09134043a8246270d99baf (patch)
tree fe27016b4cb399cd27fa152eccadd99922a336d7 /python/fatcat_tools
parent 26646b5636767495881965d566e3889ad6d126e7 (diff)
download fatcat-211ef075f5ac2960fa09134043a8246270d99baf.tar.gz
fatcat-211ef075f5ac2960fa09134043a8246270d99baf.zip
datacite import: refactor release_type detection into static method
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/importers/datacite.py 65
1 files changed, 51 insertions, 14 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 00ce9ccd..0481337a 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -478,20 +478,7 @@ class DataciteImporter(EntityImporter):
license_slug = slug
license_extra.append(lic)
- # Release type. Try to determine the release type from a variety of
- # types supplied in datacite. The "attributes.types.resourceType" is
- # uncontrolled (170000+ unique values, from "null", "Dataset" to
- # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP
- # flows in 2009") citeproc may be the closest, but not always supplied.
- # Order lookup roughly by completeness of mapping.
- for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
- value = attributes.get('types', {}).get(typeType)
- release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
- if release_type is not None:
- break
-
- if release_type is None:
- print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr)
+ release_type = self.datacite_release_type(doi, attributes)
# Language values are varied ("ger", "es", "English", "ENG", "en-us",
# "other", ...). Try to crush it with langcodes: "It may sound to you
@@ -682,6 +669,38 @@ class DataciteImporter(EntityImporter):
return re
@staticmethod
+    def datacite_release_type(doi, attributes):
+        """
+        Map a datacite record to a release type; returns None if unmapped.
+
+        Datacite supplies several type vocabularies under
+        "attributes.types". The free-form "resourceType" is uncontrolled
+        (170000+ unique values, from "null" and "Dataset" to "Jupyter
+        Notebook", "Macroseismic Data Points" or "2 days of IP flows in
+        2009"), so the mapped vocabularies are consulted instead, ordered
+        roughly by completeness of mapping. citeproc may be the closest fit,
+        but it is not always supplied.
+        """
+        types = attributes.get('types')
+        if not types:
+            return None
+
+        vocabularies = ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral')
+        candidates = (
+            DATACITE_TYPE_MAP.get(vocab, {}).get(types.get(vocab))
+            for vocab in vocabularies
+        )
+        # lazily evaluated: the first vocabulary that yields a mapping wins
+        release_type = next((c for c in candidates if c is not None), None)
+
+        # special case: figshare "collections" which group other entities
+        from_figshare = doi.startswith(('10.6084/', '10.25384'))
+        if from_figshare and types.get('resourceType') == "Collection":
+            release_type = "stub"
+
+        if release_type is None:
+            print("[{}] no mapped type: {}".format(doi, types), file=sys.stderr)
+
+        return release_type
+
+ @staticmethod
def biblio_hacks(re):
"""
This function handles known special cases. For example,
@@ -707,6 +726,24 @@ class DataciteImporter(EntityImporter):
if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'):
re.release_type = 'component'
+        # figshare
+        # NOTE(review): the title check below is assumed to sit inside this
+        # figshare branch (indentation was lost in this rendering) -- confirm.
+        if re.ext_ids.doi.startswith(('10.6084/', '10.25384')):
+            # Set version if DOI ends with versioned suffix (e.g. "....v2").
+            # str.split()/rsplit() return a list, so the suffix is the last
+            # element; calling startswith() on the list itself would raise
+            # AttributeError for every figshare DOI.
+            doi_suffix = re.ext_ids.doi.rsplit('.', 1)[-1]
+            if doi_suffix.startswith('v') and doi_suffix[1:].isdigit():
+                re.version = doi_suffix
+            # "Figure 123 from " -> component
+            # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean"
+            if " from " in re.title and re.release_type not in ('stub', 'graphic'):
+                # figures and tables are both treated as components
+                if re.title.startswith(("Figure ", "Table ")):
+                    re.release_type = "component"
+
+
+ # figshare.com
+ if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.container_name is None:
+ re.container_name = "figshare.com"
+
return re
def try_update(self, re):