From 2c8ed137181c0efc6be7a5dbdeefb783700f1d97 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 14 Feb 2019 19:57:17 -0800 Subject: better/additional crossref license lookups --- python/fatcat_tools/importers/crossref.py | 78 +++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 5715654b..722f93aa 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -38,28 +38,66 @@ CONTAINER_TYPE_MAP = { 'book': 'book-series', } -# TODO: more licenses; handle https and trailing slash better. Maybe -# "PUBLISHER-SPECIFIC", or "OTHER"? +# These are based, informally, on sorting the most popular licenses found in +# Crossref metadata. There were over 500 unique strings and only a few most +# popular are here; many were variants of the CC URLs. Would be useful to +# normalize CC licenses better. +# The current norm is to only add license slugs that are at least partially OA. LICENSE_SLUG_MAP = { - "http://creativecommons.org/licenses/by/3.0/": "CC-BY", - "http://creativecommons.org/licenses/by/4.0/": "CC-BY", - "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", - "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", - "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", - "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", - "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", - "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", - "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", - "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", - "https://creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA", - "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", - "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", - "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", - "https://www.karger.com/Services/SiteLicenses": "KARGER", - # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license - # http://www.springer.com/tdm doesn't seem like a license + "//creativecommons.org/publicdomain/zero/1.0/": "CC-0", + "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0", + "//creativecommons.org/licenses/by/2.0/": "CC-BY", + "//creativecommons.org/licenses/by/3.0/": "CC-BY", + "//creativecommons.org/licenses/by/4.0/": "CC-BY", + "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", + "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", + "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", + "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", + "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", + "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", + "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", + "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", + "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", + "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", + "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", + "//www.karger.com/Services/SiteLicenses": "KARGER", + "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE", + "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY", + "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", + "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER", + "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA", + "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC", + # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license + # //www.springer.com/tdm doesn't seem like a license + # //iopscience.iop.org/page/copyright is closed + # //www.acm.org/publications/policies/copyright_policy#Background is closed + # //rsc.li/journals-terms-of-use is closed for vor (am open) + # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!) } +def lookup_license_slug(raw): + if not raw: + return None + raw = raw.strip().replace('http://', '//').replace('https://', '//') + if 'creativecommons.org' in raw.lower(): + raw = raw.lower() + raw = raw.replace('/legalcode', '/').replace('/uk', '') + if not raw.endswith('/'): + raw = raw + '/' + return LICENSE_SLUG_MAP.get(raw) + +def test_lookup_license_slug(): + + assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" + assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY" + assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0" + assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" + assert lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA" + assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" + assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None + assert lookup_license_slug("") is None + assert lookup_license_slug(None) is None + class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. @@ -212,7 +250,7 @@ class CrossrefImporter(EntityImporter): for l in obj.get('license', []): if l['content-version'] not in ('vor', 'unspecified'): continue - slug = LICENSE_SLUG_MAP.get(l['URL']) + slug = lookup_license_slug(l['URL']) if slug: license_slug = slug if 'start' in l: -- cgit v1.2.3