summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-02-14 19:57:17 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-02-14 19:57:17 -0800
commit 2c8ed137181c0efc6be7a5dbdeefb783700f1d97 (patch)
tree 1b24be83cf9a8a10abe2435eda3cbdf44c20e58f /python/fatcat_tools/importers
parent 7a436145745b006107093c196071fe4bf8aeac71 (diff)
download fatcat-2c8ed137181c0efc6be7a5dbdeefb783700f1d97.tar.gz
fatcat-2c8ed137181c0efc6be7a5dbdeefb783700f1d97.zip
better/additional crossref license lookups
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/crossref.py 78
1 files changed, 58 insertions, 20 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 5715654b..722f93aa 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -38,28 +38,66 @@ CONTAINER_TYPE_MAP = {
'book': 'book-series',
}
-# TODO: more licenses; handle https and trailing slash better. Maybe
-# "PUBLISHER-SPECIFIC", or "OTHER"?
+# These are based, informally, on sorting the most popular licenses found in
+# Crossref metadata. There were over 500 unique strings and only a few of the
+# most popular are here; many were variants of the CC URLs. Would be useful to
+# normalize CC licenses better.
+# The current norm is to only add license slugs that are at least partially OA.
LICENSE_SLUG_MAP = {
- "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
- "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
- "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
- "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
- "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
- "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
- "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
- "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
- "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
- "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
- "https://creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA",
- "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
- "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
- "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "https://www.karger.com/Services/SiteLicenses": "KARGER",
- # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
- # http://www.springer.com/tdm doesn't seem like a license
+ # Keys are scheme-relative ("//host/path"): lookup_license_slug() rewrites
+ # "http://" and "https://" to "//" before consulting this map.
+ # For creativecommons.org URLs that function also lowercases the URL, drops
+ # "/legalcode" and "/uk", and ensures a trailing slash, so CC keys must be
+ # lowercase and end in "/". As a consequence the ".../zero/1.0/legalcode" key
+ # below is never hit via lookup_license_slug(); only direct map lookups see it.
+ # Non-CC keys are matched exactly (after whitespace strip and scheme rewrite),
+ # so variants with and without a trailing slash need separate entries.
+ "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
+ "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+ "//creativecommons.org/licenses/by/2.0/": "CC-BY",
+ "//creativecommons.org/licenses/by/3.0/": "CC-BY",
+ "//creativecommons.org/licenses/by/4.0/": "CC-BY",
+ "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+ "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+ "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+ "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+ "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+ "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+ "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+ "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+ "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+ "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+ "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+ "//www.karger.com/Services/SiteLicenses": "KARGER",
+ "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
+ "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
+ "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
+ "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
+ "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
+ "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
+ # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+ # //www.springer.com/tdm doesn't seem like a license
+ # //iopscience.iop.org/page/copyright is closed
+ # //www.acm.org/publications/policies/copyright_policy#Background is closed
+ # //rsc.li/journals-terms-of-use is closed for vor (am open)
+ # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
}
+def lookup_license_slug(raw):
+    """
+    Map a raw license URL (as found in Crossref metadata) to a short license
+    slug from LICENSE_SLUG_MAP.
+
+    Surrounding whitespace is stripped and an "http://" or "https://" scheme
+    is rewritten to "//" before the lookup. Creative Commons URLs are
+    additionally lowercased, have "/legalcode" and the "/uk" jurisdiction
+    segment dropped, and are normalized to exactly one trailing slash. All
+    other URLs must match a map key exactly.
+
+    Returns the slug string, or None if raw is empty/None or not recognized.
+    """
+    if not raw:
+        return None
+    raw = raw.strip().replace('http://', '//').replace('https://', '//')
+    if 'creativecommons.org' in raw.lower():
+        raw = raw.lower()
+        raw = raw.replace('/legalcode', '/').replace('/uk', '')
+        # Collapse to a single trailing slash: an input ending in
+        # ".../legalcode/" is left with a doubled slash by the rewrite above
+        # and would otherwise never match a map key.
+        raw = raw.rstrip('/') + '/'
+    return LICENSE_SLUG_MAP.get(raw)
+
+def test_lookup_license_slug():
+    """Spot-check lookup_license_slug() on recognized and unrecognized inputs."""
+
+    # (raw URL, expected slug) pairs covering scheme, trailing-slash,
+    # jurisdiction and legalcode variants, plus one non-CC publisher URL
+    recognized = (
+        ("https://creativecommons.org/licenses/by-nc/3.0/", "CC-BY-NC"),
+        ("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"),
+        ("https://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"),
+        ("http://creativecommons.org/licenses/by/4.0", "CC-BY"),
+        ("https://creativecommons.org/licenses/by-nc-sa/4.0/", "CC-BY-NC-SA"),
+        ("https://www.ametsoc.org/PUBSReuseLicenses", "AMETSOC"),
+    )
+    for url, expected in recognized:
+        assert lookup_license_slug(url) == expected
+
+    # an unknown host and empty inputs yield no slug
+    for url in ("https://www.amec.org/PUBSReuseLicenses", "", None):
+        assert lookup_license_slug(url) is None
+
class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
@@ -212,7 +250,7 @@ class CrossrefImporter(EntityImporter):
for l in obj.get('license', []):
if l['content-version'] not in ('vor', 'unspecified'):
continue
- slug = LICENSE_SLUG_MAP.get(l['URL'])
+ slug = lookup_license_slug(l['URL'])
if slug:
license_slug = slug
if 'start' in l: