summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/datacite.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-10 13:52:39 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-10 13:52:43 -0800
commitddc757bc1d5c610f42e9f5f10a4f060f517b66ca (patch)
tree86ccdef998bd3da3910cfe8fb9f2177b58a664a0 /python/fatcat_tools/importers/datacite.py
parent16e9979a6f347b49764c1141209e84083ea81057 (diff)
downloadfatcat-ddc757bc1d5c610f42e9f5f10a4f060f517b66ca.tar.gz
fatcat-ddc757bc1d5c610f42e9f5f10a4f060f517b66ca.zip
refactor importer metadata tables into separate file; move some helpers around
- MAX_ABSTRACT_LENGTH set in a single place (importer common) - merge datacite license slug table in to common table, removing some TDM-specific licenses (which do not apply in the context of preserving the full work)
Diffstat (limited to 'python/fatcat_tools/importers/datacite.py')
-rw-r--r--python/fatcat_tools/importers/datacite.py155
1 files changed, 10 insertions, 145 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 441514b8..b310f8bc 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -21,113 +21,19 @@ import langdetect
import pycountry
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
from fatcat_tools.transforms import entity_to_dict
-from .common import EntityImporter
-
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
-CONTAINER_TYPE_MAP: Dict[str, str] = {
+DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = {
"Journal": "journal",
"Series": "journal",
"Book Series": "book-series",
}
-# The docs/guide should be the canonical home for these mappings; update there
-# first. Map various datacite type types to CSL-ish types. None means TODO or
-# remove.
-DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
- "ris": {
- "THES": "thesis",
- "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report)
- "CHAP": "chapter",
- "FIGURE": "figure",
- "RPRT": "report",
- "JOUR": "article-journal",
- "MPCT": "motion_picture",
- "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset
- "BOOK": "book",
- "DATA": "dataset",
- "COMP": "software",
- },
- "schemaOrg": {
- "Dataset": "dataset",
- "Book": "book",
- "ScholarlyArticle": "article-journal",
- "ImageObject": "graphic",
- "Collection": None,
- "MediaObject": None,
- "Event": None,
- "SoftwareSourceCode": "software",
- "Chapter": "chapter",
- "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
- "PublicationIssue": "article",
- "AudioObject": None,
- "Thesis": "thesis",
- },
- "citeproc": {
- "article": "article",
- "article-journal": "article-journal",
- "article-magazine": "article-magazine",
- "article-newspaper": "article-newspaper",
- "bill": "bill",
- "book": "book",
- "broadcast": "broadcast",
- "chapter": "chapter",
- "dataset": "dataset",
- "entry-dictionary": "entry-dictionary",
- "entry-encyclopedia": "entry-encyclopedia",
- "entry": "entry",
- "figure": "figure",
- "graphic": "graphic",
- "interview": "interview",
- "legal_case": "legal_case",
- "legislation": "legislation",
- "manuscript": "manuscript",
- "map": "map",
- "motion_picture": "motion_picture",
- "musical_score": "musical_score",
- "pamphlet": "pamphlet",
- "paper-conference": "paper-conference",
- "patent": "patent",
- "personal_communication": "personal_communication",
- "post": "post",
- "post-weblog": "post-weblog",
- "report": "report",
- "review-book": "review-book",
- "review": "review",
- "song": "song",
- "speech": "speech",
- "thesis": "thesis",
- "treaty": "treaty",
- "webpage": "webpage",
- }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
- "bibtex": {
- "phdthesis": "thesis",
- "inbook": "chapter",
- "misc": None,
- "article": "article-journal",
- "book": "book",
- },
- "resourceTypeGeneral": {
- "Image": "graphic",
- "Dataset": "dataset",
- "PhysicalObject": None,
- "Collection": None,
- "Text": None, # "Greyliterature, labnotes, accompanyingmaterials"
- "Sound": None,
- "InteractiveResource": None,
- "Event": None,
- "Software": "software",
- "Other": None,
- "Workflow": None,
- "Audiovisual": None,
- }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
-}
-
# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
DATACITE_UNKNOWN_MARKERS: List[str] = [
"(:unac)", # temporarily inaccessible
@@ -180,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [
}
]
-# TODO(martin): merge this with other maps and lookup functions, eventually.
-LICENSE_SLUG_MAP: Dict[str, str] = {
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
- "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
- "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
- "//onlinelibrary.wiley.com/termsandconditions/": "WILEY",
- "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
- "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
- "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
- "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
- "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
- "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
- "//www.karger.com/Services/SiteLicenses/": "KARGER",
- "//www.springer.com/tdm/": "SPRINGER-TDM",
- "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//spdx.org/licenses/CC0-1.0.json": "CC-0",
- "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
- "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
- "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
- "//spdx.org/licenses/MIT.json": "MIT",
- "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-}
-
class DataciteImporter(EntityImporter):
"""
@@ -406,8 +275,8 @@ class DataciteImporter(EntityImporter):
container_name = None
container = attributes.get("container", {}) or {}
- if container.get("type") in CONTAINER_TYPE_MAP.keys():
- container_type = CONTAINER_TYPE_MAP.get(container["type"])
+ if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys():
+ container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"])
if container.get("identifier") and container.get("identifierType") == "ISSN":
issn = container.get("identifier")
if issn and len(issn) == 8:
@@ -488,7 +357,7 @@ class DataciteImporter(EntityImporter):
license_extra = []
for lic in attributes.get("rightsList", []):
- slug = lookup_license_slug(lic.get("rightsUri"))
+ slug = datacite_lookup_license_slug(lic.get("rightsUri"))
if slug:
license_slug = slug
license_extra.append(lic)
@@ -968,7 +837,7 @@ def contributor_list_contains_contributor(
return False
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]:
"""
Resolve a variety of strings into a some pseudo-canonical form, e.g.
CC-BY-ND, CC-0, MIT and so on.
@@ -1063,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
return None
return "RS-{}".format(name.upper())
- # Fallback to mapped values.
- raw = raw.lower()
- raw = raw.strip().replace("http://", "//").replace("https://", "//")
- if not raw.endswith("/"):
- raw = raw + "/"
- return LICENSE_SLUG_MAP.get(raw)
+ # Fallback to generic license lookup
+ return lookup_license_slug(raw)
def find_original_language_title(