diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/importers/crossref.py | 78 | 
1 file changed, 58 insertions, 20 deletions
| diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 5715654b..722f93aa 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -38,28 +38,66 @@ CONTAINER_TYPE_MAP = {      'book': 'book-series',  } -# TODO: more licenses; handle https and trailing slash better. Maybe -# "PUBLISHER-SPECIFIC", or "OTHER"? +# These are based, informally, on sorting the most popular licenses found in +# Crossref metadata. There were over 500 unique strings and only a few most +# popular are here; many were variants of the CC URLs. Would be useful to +# normalize CC licenses better. +# The current norm is to only add license slugs that are at least partially OA.  LICENSE_SLUG_MAP = { -    "http://creativecommons.org/licenses/by/3.0/": "CC-BY", -    "http://creativecommons.org/licenses/by/4.0/": "CC-BY", -    "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", -    "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", -    "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", -    "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", -    "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", -    "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", -    "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", -    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", -    "https://creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA", -    "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", -    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", -    "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", -    "https://www.karger.com/Services/SiteLicenses": "KARGER", -    # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license -    # http://www.springer.com/tdm doesn't seem like a license +    
"//creativecommons.org/publicdomain/zero/1.0/": "CC-0", +    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0", +    "//creativecommons.org/licenses/by/2.0/": "CC-BY", +    "//creativecommons.org/licenses/by/3.0/": "CC-BY", +    "//creativecommons.org/licenses/by/4.0/": "CC-BY", +    "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", +    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", +    "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", +    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", +    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", +    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", +    "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", +    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", +    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", +    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", +    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", +    "//www.karger.com/Services/SiteLicenses": "KARGER", +    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE", +    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY", +    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", +    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER", +    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA", +    "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC", +    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license +    # //www.springer.com/tdm doesn't seem like a license +    # //iopscience.iop.org/page/copyright is closed +    # //www.acm.org/publications/policies/copyright_policy#Background is closed +    # //rsc.li/journals-terms-of-use is closed for vor (am open) +    # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)  
} +def lookup_license_slug(raw): +    if not raw: +        return None +    raw = raw.strip().replace('http://', '//').replace('https://', '//') +    if 'creativecommons.org' in raw.lower(): +        raw = raw.lower() +        raw = raw.replace('/legalcode', '/').replace('/uk', '') +        if not raw.endswith('/'): +            raw = raw + '/' +    return LICENSE_SLUG_MAP.get(raw) + +def test_lookup_license_slug(): + +    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" +    assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY" +    assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0" +    assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" +    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA" +    assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" +    assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None +    assert lookup_license_slug("") is None +    assert lookup_license_slug(None) is None +  class CrossrefImporter(EntityImporter):      """      Importer for Crossref metadata. @@ -212,7 +250,7 @@ class CrossrefImporter(EntityImporter):          for l in obj.get('license', []):              if l['content-version'] not in ('vor', 'unspecified'):                  continue -            slug = LICENSE_SLUG_MAP.get(l['URL']) +            slug = lookup_license_slug(l['URL'])              if slug:                  license_slug = slug              if 'start' in l: | 
