From 68ce6b089e3405faf0b12cde5e33a67e1cb6e372 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 30 Jun 2020 18:09:05 +0200 Subject: datacite: improve license mapping via "missed potential license", refs #58 --- python/fatcat_tools/importers/datacite.py | 24 +++++++++++++++--------- python/tests/import_datacite.py | 14 ++++++++++++++ 2 files changed, 29 insertions(+), 9 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 98393451..50d694ab 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -824,9 +824,14 @@ def lookup_license_slug(raw): return 'CC-0' if 'creativecommons' in raw: + # https://creativecommons.org/publicdomain/mark/1.0/deed.de + if 'creativecommons.org/publicdomain' in raw: + return 'CC-PUBLICDOMAIN' + if 'creativecommons.org/share-your-work/public-domain/cc0' in raw: + return 'CC-0' # https://creativecommons.org/licenses/by/4.0/deed.es_ES raw = raw.lower() - match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw) + match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE) if not match: print('missed potential license: {}'.format(raw), file=sys.stderr) return None @@ -839,7 +844,7 @@ def lookup_license_slug(raw): if 'opensource.org' in raw: # https://opensource.org/licenses/alphabetical, e.g. 
opensource.org/licenses/EUPL-1.2 - match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw) + match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE) if not match: print('missed potential license: {}'.format(raw), file=sys.stderr) return None @@ -848,11 +853,11 @@ def lookup_license_slug(raw): return None if len(name) > 11: return None - return name + return name.upper() if 'gnu.org' in raw: # http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html - match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw) + match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE) if not match: print('missed potential license: {}'.format(raw), file=sys.stderr) return None @@ -864,20 +869,21 @@ def lookup_license_slug(raw): return name.upper() if 'spdx.org' in raw: + if 'spdx.org/licenses/CC0' in raw: + return 'CC-0' # https://spdx.org/licenses/CC-BY-NC-ND-4.0.html - match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw) + match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE) if not match: print('missed potential license: {}'.format(raw), file=sys.stderr) return None name = match.groupdict().get('name') if not name: return None - if name.startswith('cc'): - name = re.sub(r"-[.0-9-]*html", "", name) - return name if len(name) > 36: return None - return name + # cleanup version and extensions + name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower()) + return name.upper() if 'rightsstatements.org' in raw: # http://rightsstatements.org/vocab/InC/1.0/ diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index afee06cc..20c1eaf8 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -340,6 +340,20 @@ def test_lookup_license_slug(): Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'), 
Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'), Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'), + Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), + Case('http://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), + Case('https://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), + Case('https://creativecommons.org/publicdomain/mark/1.0/', 'CC-PUBLICDOMAIN'), + Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), + Case('https://creativecommons.org/share-your-work/public-domain/cc0/', 'CC-0'), + Case('http://spdx.org/licenses/CC0-1.0.json', 'CC-0'), + Case('http://spdx.org/licenses/CC-BY-1.0.json', 'CC-BY'), + Case('http://spdx.org/licenses/CC-BY-4.0.json', 'CC-BY'), + Case('http://spdx.org/licenses/CC-BY-NC-4.0.json', 'CC-BY-NC'), + Case('http://spdx.org/licenses/CC-BY-SA-3.0.json', 'CC-BY-SA'), + Case('http://spdx.org/licenses/CC-BY-SA-4.0.json', 'CC-BY-SA'), + Case('http://spdx.org/licenses/MIT.json', 'MIT'), + Case('http://spdx.org/licenses/OGL-Canada-2.0.json', 'OGL-CANADA'), ] for c in cases: -- cgit v1.2.3