summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 14:12:59 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 14:14:48 -0800
commit 1d1700678747ae711afbf105b962c5a1db3e7196 (patch)
tree 4a7219aaebe6e48831f805728da3b77206f88a1e /python
parent ddc757bc1d5c610f42e9f5f10a4f060f517b66ca (diff)
download fatcat-1d1700678747ae711afbf105b962c5a1db3e7196.tar.gz
fatcat-1d1700678747ae711afbf105b962c5a1db3e7196.zip
improve lookup_license_slug helper and lookup table
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/biblio_lookup_tables.py 103
-rw-r--r-- python/fatcat_tools/normal.py 15
2 files changed, 62 insertions, 56 deletions
diff --git a/python/fatcat_tools/biblio_lookup_tables.py b/python/fatcat_tools/biblio_lookup_tables.py
index a9a097ae..edb1f5ef 100644
--- a/python/fatcat_tools/biblio_lookup_tables.py
+++ b/python/fatcat_tools/biblio_lookup_tables.py
@@ -467,69 +467,72 @@ CONTAINER_TYPE_MAP: Dict[str, str] = {
# popular are here; many were variants of the CC URLs. Would be useful to
# normalize CC licenses better.
# The current norm is to only add license slugs that are at least partially OA.
+# NOTE: URL patterns should be lower-case, and have any trailing slash ("/")
+# removed. Slugs are usually upper-case acronyms
LICENSE_SLUG_MAP: Dict[str, str] = {
"//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
"//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
+ "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
"//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
"//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//creativecommons.org/licenses/by/2.0/": "CC-BY",
- "//creativecommons.org/licenses/by/3.0/": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/": "CC-BY",
- "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
- "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//spdx.org/licenses/CC0-1.0.json": "CC-0",
- "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
- "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
- "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
- "//spdx.org/licenses/MIT.json": "MIT",
- "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
- "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.karger.com/Services/SiteLicenses": "KARGER",
- "//www.karger.com/Services/SiteLicenses/": "KARGER",
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
- "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
- "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
+ "//creativecommons.org/share-your-work/public-domain/cc0": "CC-0",
+ "//creativecommons.org/licenses/by/2.0": "CC-BY",
+ "//creativecommons.org/licenses/by/3.0": "CC-BY",
+ "//creativecommons.org/licenses/by/4.0": "CC-BY",
+ "//creativecommons.org/licenses/by-sa/3.0": "CC-BY-SA",
+ "//creativecommons.org/licenses/by-sa/4.0": "CC-BY-SA",
+ "//creativecommons.org/licenses/by-nd/3.0": "CC-BY-ND",
+ "//creativecommons.org/licenses/by-nd/4.0": "CC-BY-ND",
+ "//creativecommons.org/licenses/by-nc/3.0": "CC-BY-NC",
+ "//creativecommons.org/licenses/by-nc/4.0": "CC-BY-NC",
+ "//creativecommons.org/licenses/by-nc-sa/3.0": "CC-BY-NC-SA",
+ "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA",
+ "//creativecommons.org/licenses/by-nc-nd/3.0": "CC-BY-NC-ND",
+ "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-NC-ND",
+ "//creativecommons.org/share-your-work/public-domain/cc0": "CC-0",
+ "//spdx.org/licenses/cc0-1.0.json": "CC-0",
+ "//spdx.org/licenses/cc-by-1.0.json": "CC-BY",
+ "//spdx.org/licenses/cc-by-4.0.json": "CC-BY",
+ "//spdx.org/licenses/cc-by-nc-4.0.json": "CC-BY-NC",
+ "//spdx.org/licenses/cc-by-sa-3.0.json": "CC-BY-SA",
+ "//spdx.org/licenses/cc-by-sa-4.0.json": "CC-BY-SA",
+ "//spdx.org/licenses/mit.json": "MIT",
+ "//spdx.org/licenses/ogl-canada-2.0.json": "OGL-Canada",
+ "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
+ "//www.elsevier.com/tdm/userlicense/1.0": "ELSEVIER-USER-1.0",
+ "//www.karger.com/services/siteLicenses": "KARGER",
+ "//www.karger.com/services/siteLicenses": "KARGER",
+ "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml": "ADS-UK",
+ "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml": "ADS-UK",
+ "//archaeologydataservice.ac.uk/advice/termsofuseandaccess": "ADS-UK",
+ "//homepage.data-planet.com/terms-use": "SAGE-DATA-PLANET",
+ "//homepage.data-planet.com/terms-use": "SAGE-DATA-PLANET",
+ "//publikationen.bibliothek.kit.edu/kitopen-lizenz": "KIT-OPEN",
+ "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
"//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
- "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
"//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
- "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
- "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
- "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
+ "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
+ "//www.ametsoc.org/pubsreuselicenses": "AMETSOC",
+ "//www.ametsoc.org/pubsreuselicenses": "AMETSOC",
+ "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
"//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
- "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
"//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
- "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
- # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
- # //www.springer.com/tdm doesn't seem like a license
+ "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
+ "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
+ "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
+ "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
+ "//arxiv.org/licenses/nonexclusive-distrib/1.0": "ARXIV-1.0",
+ # skip these non-OA licenses
# //iopscience.iop.org/page/copyright is closed
# //www.acm.org/publications/policies/copyright_policy#Background is closed
- # //rsc.li/journals-terms-of-use is closed for vor (am open)
# //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
# skip these TDM licenses; they don't apply to content
- # "//www.springer.com/tdm/": "SPRINGER-TDM",
- # "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
- # "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
+ # "//www.springer.com/tdm": "SPRINGER-TDM",
+ # "//journals.sagepub.com/page/policies/text-and-data-mining-license": "SAGE-TDM",
+ # "//doi.wiley.com/10.1002/tdm_license_1.1": "WILEY-TDM-1.1",
+ # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+ # //www.springer.com/tdm doesn't seem like a license
+ # //rsc.li/journals-terms-of-use is closed for vor (am open)
}
# Map various datacite type types to CSL-ish types. None means TODO or remove.
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index fc80411c..dd0a4f74 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -611,12 +611,15 @@ def test_parse_country_name() -> None:
def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
- raw = raw.strip().replace("http://", "//").replace("https://", "//")
- if "creativecommons.org" in raw.lower():
- raw = raw.lower()
- raw = raw.replace("/legalcode", "/").replace("/uk", "")
- if not raw.endswith("/"):
- raw = raw + "/"
+ # normalize to lower-case and not ending with a slash
+ raw = raw.strip().lower()
+ if raw.endswith("/"):
+ raw = raw[:-1]
+ # remove http/https prefix
+ raw = raw.replace("http://", "//").replace("https://", "//")
+ # special-case normalization of CC licenses
+ if "creativecommons.org" in raw:
+ raw = raw.replace("/legalcode", "").replace("/uk", "")
return LICENSE_SLUG_MAP.get(raw)