diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-10 17:33:15 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-11 15:32:28 -0700 |
commit | 26646b5636767495881965d566e3889ad6d126e7 (patch) | |
tree | 9e89ceeea3062a919eb31c69aa832511e429f198 /python/fatcat_tools/importers | |
parent | e9dd3c73f036d3fba2680eeaff8e62ecf2dbf9a1 (diff) | |
download | fatcat-26646b5636767495881965d566e3889ad6d126e7.tar.gz fatcat-26646b5636767495881965d566e3889ad6d126e7.zip |
datacite import: refactor publisher-specific hacks into static method
Also tweak title/publisher detection to use DOI prefixes
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 44 |
1 files changed, 29 insertions, 15 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index f93362d6..00ce9ccd 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -493,21 +493,6 @@ class DataciteImporter(EntityImporter): if release_type is None: print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr) - # release_type exception: Global Biodiversity Information Facility - # publishes highly interesting datasets, but titles are mostly the same - # ("GBIF Occurrence Download" or "Occurrence Download"); set - # release_type to "stub" (CSL/FC). - if publisher == 'The Global Biodiversity Information Facility': - release_type = 'stub' - - # release_type exception: lots of "Experimental Crystal Structure Determination" - if publisher == 'Cambridge Crystallographic Data Centre': - release_type = 'entry' - - # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." - if title.lower().startswith('additional file'): - release_type = 'stub' - # Language values are varied ("ger", "es", "English", "ENG", "en-us", # "other", ...). Try to crush it with langcodes: "It may sound to you # like langcodes solves a pretty boring problem. At one level, that's @@ -693,6 +678,35 @@ class DataciteImporter(EntityImporter): license_slug=license_slug, version=version, ) + re = self.biblio_hacks(re) + return re + + @staticmethod + def biblio_hacks(re): + """ + This function handles known special cases. For example, + publisher-specific or platform-specific workarounds. + """ + + # only runs on datacite entities with a DOI + assert re.ext_ids.doi + + # release_type exception: Global Biodiversity Information Facility + # publishes highly interesting datasets, but titles are mostly the same + # ("GBIF Occurrence Download" or "Occurrence Download"); set + # release_type to "stub" (CSL/FC). + if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'): + re.release_type = 'stub' + + # release_type exception: lots of "Experimental Crystal Structure Determination" + # publisher: "Cambridge Crystallographic Data Centre" + if re.ext_ids.doi.startswith('10.5517/'): + re.release_type = 'entry' + + # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." + if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'): + re.release_type = 'component' + return re def try_update(self, re): |