From 1d1700678747ae711afbf105b962c5a1db3e7196 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 10 Nov 2021 14:12:59 -0800
Subject: improve lookup_license_slug helper and lookup table

lookup_license_slug() now lower-cases the raw URL, drops one trailing
slash and rewrites the http/https scheme to "//" before the lookup, so
the keys of LICENSE_SLUG_MAP are rewritten to the same normalized form.

Every key has to be fully lower-case to stay reachable after that
normalization; this includes the Karger "sitelicenses" entries.

---
 python/fatcat_tools/biblio_lookup_tables.py | 103 ++++++++++++++--------------
 python/fatcat_tools/normal.py               |  15 ++--
 2 files changed, 62 insertions(+), 56 deletions(-)

diff --git a/python/fatcat_tools/biblio_lookup_tables.py b/python/fatcat_tools/biblio_lookup_tables.py
index a9a097ae..edb1f5ef 100644
--- a/python/fatcat_tools/biblio_lookup_tables.py
+++ b/python/fatcat_tools/biblio_lookup_tables.py
@@ -467,69 +467,72 @@ CONTAINER_TYPE_MAP: Dict[str, str] = {
 # popular are here; many were variants of the CC URLs. Would be useful to
 # normalize CC licenses better.
 # The current norm is to only add license slugs that are at least partially OA.
+# NOTE: URL patterns should be lower-case, and have any trailing slash ("/")
+# removed. Slugs are usually upper-case acronyms
 LICENSE_SLUG_MAP: Dict[str, str] = {
     "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
     "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
     "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
     "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
-    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
-    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
-    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
-    "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
-    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
-    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
-    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
-    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
-    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
-    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
-    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
-    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
-    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/MIT.json": "MIT",
-    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.karger.com/Services/SiteLicenses": "KARGER",
-    "//www.karger.com/Services/SiteLicenses/": "KARGER",
-    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
-    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
-    "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
-    "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
+    "//creativecommons.org/share-your-work/public-domain/cc0": "CC-0",
+    "//creativecommons.org/licenses/by/2.0": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0": "CC-BY",
+    "//creativecommons.org/licenses/by-sa/3.0": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-nd/3.0": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nd/4.0": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nc/3.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc-sa/3.0": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nc-nd/3.0": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-NC-ND",
+    "//creativecommons.org/share-your-work/public-domain/cc0": "CC-0",
+    "//spdx.org/licenses/cc0-1.0.json": "CC-0",
+    "//spdx.org/licenses/cc-by-1.0.json": "CC-BY",
+    "//spdx.org/licenses/cc-by-4.0.json": "CC-BY",
+    "//spdx.org/licenses/cc-by-nc-4.0.json": "CC-BY-NC",
+    "//spdx.org/licenses/cc-by-sa-3.0.json": "CC-BY-SA",
+    "//spdx.org/licenses/cc-by-sa-4.0.json": "CC-BY-SA",
+    "//spdx.org/licenses/mit.json": "MIT",
+    "//spdx.org/licenses/ogl-canada-2.0.json": "OGL-Canada",
+    "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
+    "//www.elsevier.com/tdm/userlicense/1.0": "ELSEVIER-USER-1.0",
+    "//www.karger.com/services/sitelicenses": "KARGER",
+    "//www.karger.com/services/sitelicenses": "KARGER",
+    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml": "ADS-UK",
+    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml": "ADS-UK",
+    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess": "ADS-UK",
+    "//homepage.data-planet.com/terms-use": "SAGE-DATA-PLANET",
+    "//homepage.data-planet.com/terms-use": "SAGE-DATA-PLANET",
+    "//publikationen.bibliothek.kit.edu/kitopen-lizenz": "KIT-OPEN",
+    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
     "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
-    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
     "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
-    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
-    "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
-    "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
+    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
+    "//www.ametsoc.org/pubsreuselicenses": "AMETSOC",
+    "//www.ametsoc.org/pubsreuselicenses": "AMETSOC",
+    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
     "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
-    "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
     "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
-    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
-    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
-    "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
-    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
-    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
-    # //www.springer.com/tdm doesn't seem like a license
+    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
+    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
+    "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
+    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
+    "//arxiv.org/licenses/nonexclusive-distrib/1.0": "ARXIV-1.0",
+    # skip these non-OA licenses
     # //iopscience.iop.org/page/copyright is closed
     # //www.acm.org/publications/policies/copyright_policy#Background is closed
-    # //rsc.li/journals-terms-of-use is closed for vor (am open)
     # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
-    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
     # skip these TDM licenses; they don't apply to content
-    # "//www.springer.com/tdm/": "SPRINGER-TDM",
-    # "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
-    # "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
+    # "//www.springer.com/tdm": "SPRINGER-TDM",
+    # "//journals.sagepub.com/page/policies/text-and-data-mining-license": "SAGE-TDM",
+    # "//doi.wiley.com/10.1002/tdm_license_1.1": "WILEY-TDM-1.1",
+    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+    # //www.springer.com/tdm doesn't seem like a license
+    # //rsc.li/journals-terms-of-use is closed for vor (am open)
 }
 
 # Map various datacite type types to CSL-ish types. None means TODO or remove.
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index fc80411c..dd0a4f74 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -611,12 +611,15 @@ def test_parse_country_name() -> None:
 def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
     if not raw:
         return None
-    raw = raw.strip().replace("http://", "//").replace("https://", "//")
-    if "creativecommons.org" in raw.lower():
-        raw = raw.lower()
-        raw = raw.replace("/legalcode", "/").replace("/uk", "")
-        if not raw.endswith("/"):
-            raw = raw + "/"
+    # normalize to lower-case and not ending with a slash
+    raw = raw.strip().lower()
+    if raw.endswith("/"):
+        raw = raw[:-1]
+    # remove http/https prefix
+    raw = raw.replace("http://", "//").replace("https://", "//")
+    # special-case normalization of CC licenses
+    if "creativecommons.org" in raw:
+        raw = raw.replace("/legalcode", "").replace("/uk", "")
     return LICENSE_SLUG_MAP.get(raw)
 
 
-- 
cgit v1.2.3