From ddc757bc1d5c610f42e9f5f10a4f060f517b66ca Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 10 Nov 2021 13:52:39 -0800 Subject: refactor importer metadata tables into separate file; move some helpers around - MAX_ABSTRACT_LENGTH set in a single place (importer common) - merge datacite license slug table in to common table, removing some TDM-specific licenses (which do not apply in the context of preserving the full work) --- python/fatcat_tools/importers/datacite.py | 155 ++---------------------------- 1 file changed, 10 insertions(+), 145 deletions(-) (limited to 'python/fatcat_tools/importers/datacite.py') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 441514b8..b310f8bc 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -21,113 +21,19 @@ import langdetect import pycountry from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi, clean_str +from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP +from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug from fatcat_tools.transforms import entity_to_dict -from .common import EntityImporter - -# Cutoff length for abstracts. -MAX_ABSTRACT_LENGTH = 2048 +from .common import MAX_ABSTRACT_LENGTH, EntityImporter # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary -CONTAINER_TYPE_MAP: Dict[str, str] = { +DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = { "Journal": "journal", "Series": "journal", "Book Series": "book-series", } -# The docs/guide should be the canonical home for these mappings; update there -# first. Map various datacite type types to CSL-ish types. None means TODO or -# remove. -DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = { - "ris": { - "THES": "thesis", - "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report) - "CHAP": "chapter", - "FIGURE": "figure", - "RPRT": "report", - "JOUR": "article-journal", - "MPCT": "motion_picture", - "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset - "BOOK": "book", - "DATA": "dataset", - "COMP": "software", - }, - "schemaOrg": { - "Dataset": "dataset", - "Book": "book", - "ScholarlyArticle": "article-journal", - "ImageObject": "graphic", - "Collection": None, - "MediaObject": None, - "Event": None, - "SoftwareSourceCode": "software", - "Chapter": "chapter", - "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. - "PublicationIssue": "article", - "AudioObject": None, - "Thesis": "thesis", - }, - "citeproc": { - "article": "article", - "article-journal": "article-journal", - "article-magazine": "article-magazine", - "article-newspaper": "article-newspaper", - "bill": "bill", - "book": "book", - "broadcast": "broadcast", - "chapter": "chapter", - "dataset": "dataset", - "entry-dictionary": "entry-dictionary", - "entry-encyclopedia": "entry-encyclopedia", - "entry": "entry", - "figure": "figure", - "graphic": "graphic", - "interview": "interview", - "legal_case": "legal_case", - "legislation": "legislation", - "manuscript": "manuscript", - "map": "map", - "motion_picture": "motion_picture", - "musical_score": "musical_score", - "pamphlet": "pamphlet", - "paper-conference": "paper-conference", - "patent": "patent", - "personal_communication": "personal_communication", - "post": "post", - "post-weblog": "post-weblog", - "report": "report", - "review-book": "review-book", - "review": "review", - "song": "song", - "speech": "speech", - "thesis": "thesis", - "treaty": "treaty", - "webpage": "webpage", - }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types - "bibtex": { - "phdthesis": "thesis", - "inbook": "chapter", - "misc": None, - "article": "article-journal", - "book": "book", - }, - "resourceTypeGeneral": { - "Image": "graphic", - "Dataset": "dataset", - "PhysicalObject": None, - "Collection": None, - "Text": None, # "Greyliterature, labnotes, accompanyingmaterials" - "Sound": None, - "InteractiveResource": None, - "Event": None, - "Software": "software", - "Other": None, - "Workflow": None, - "Audiovisual": None, - }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 -} - # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. DATACITE_UNKNOWN_MARKERS: List[str] = [ "(:unac)", # temporarily inaccessible @@ -180,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [ } ] -# TODO(martin): merge this with other maps and lookup functions, eventually. -LICENSE_SLUG_MAP: Dict[str, str] = { - "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK", - "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK", - "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", - "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1", - "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET", - "//onlinelibrary.wiley.com/termsandconditions/": "WILEY", - "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN", - "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY", - "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE", - "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC", - "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA", - "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", - "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", - "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0", - "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3", - "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2", - "//www.karger.com/Services/SiteLicenses/": "KARGER", - "//www.springer.com/tdm/": "SPRINGER-TDM", - "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0", - "//spdx.org/licenses/CC0-1.0.json": "CC-0", - "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC", - "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA", - "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA", - "//spdx.org/licenses/MIT.json": "MIT", - "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada", -} - class DataciteImporter(EntityImporter): """ @@ -406,8 +275,8 @@ class DataciteImporter(EntityImporter): container_name = None container = attributes.get("container", {}) or {} - if container.get("type") in CONTAINER_TYPE_MAP.keys(): - container_type = CONTAINER_TYPE_MAP.get(container["type"]) + if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys(): + container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"]) if container.get("identifier") and container.get("identifierType") == "ISSN": issn = container.get("identifier") if issn and len(issn) == 8: @@ -488,7 +357,7 @@ class DataciteImporter(EntityImporter): license_extra = [] for lic in attributes.get("rightsList", []): - slug = lookup_license_slug(lic.get("rightsUri")) + slug = datacite_lookup_license_slug(lic.get("rightsUri")) if slug: license_slug = slug license_extra.append(lic) @@ -968,7 +837,7 @@ def contributor_list_contains_contributor( return False -def lookup_license_slug(raw: Optional[str]) -> Optional[str]: +def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]: """ Resolve a variety of strings into a some pseudo-canonical form, e.g. CC-BY-ND, CC-0, MIT and so on. @@ -1063,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]: return None return "RS-{}".format(name.upper()) - # Fallback to mapped values. - raw = raw.lower() - raw = raw.strip().replace("http://", "//").replace("https://", "//") - if not raw.endswith("/"): - raw = raw + "/" - return LICENSE_SLUG_MAP.get(raw) + # Fallback to generic license lookup + return lookup_license_slug(raw) def find_original_language_title( -- cgit v1.2.3