summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-06-30 18:09:05 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-06-30 18:09:05 +0200
commit: 68ce6b089e3405faf0b12cde5e33a67e1cb6e372 (patch)
tree: 408dc923a2f4be649736298689aca88247672fa6
parent: cc6361e2b87c5ed7af82be8e0c79ddb1f91af1ce (diff)
download: fatcat-68ce6b089e3405faf0b12cde5e33a67e1cb6e372.tar.gz
download: fatcat-68ce6b089e3405faf0b12cde5e33a67e1cb6e372.zip
datacite: improve license mapping
via "missed potential license", refs #58
-rw-r--r-- python/fatcat_tools/importers/datacite.py 24
-rw-r--r-- python/tests/import_datacite.py 14
2 files changed, 29 insertions, 9 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 98393451..50d694ab 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -824,9 +824,14 @@ def lookup_license_slug(raw):
return 'CC-0'
if 'creativecommons' in raw:
+ # https://creativecommons.org/publicdomain/mark/1.0/deed.de
+ if 'creativecommons.org/publicdomain' in raw:
+ return 'CC-PUBLICDOMAIN'
+ if 'creativecommons.org/share-your-work/public-domain/cc0' in raw:
+ return 'CC-0'
# https://creativecommons.org/licenses/by/4.0/deed.es_ES
raw = raw.lower()
- match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw)
+ match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE)
if not match:
print('missed potential license: {}'.format(raw), file=sys.stderr)
return None
@@ -839,7 +844,7 @@ def lookup_license_slug(raw):
if 'opensource.org' in raw:
# https://opensource.org/licenses/alphabetical, e.g. opensource.org/licenses/EUPL-1.2
- match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw)
+ match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE)
if not match:
print('missed potential license: {}'.format(raw), file=sys.stderr)
return None
@@ -848,11 +853,11 @@ def lookup_license_slug(raw):
return None
if len(name) > 11:
return None
- return name
+ return name.upper()
if 'gnu.org' in raw:
# http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
- match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw)
+ match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE)
if not match:
print('missed potential license: {}'.format(raw), file=sys.stderr)
return None
@@ -864,20 +869,21 @@ def lookup_license_slug(raw):
return name.upper()
if 'spdx.org' in raw:
+ if 'spdx.org/licenses/CC0' in raw:
+ return 'CC-0'
# https://spdx.org/licenses/CC-BY-NC-ND-4.0.html
- match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw)
+ match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE)
if not match:
print('missed potential license: {}'.format(raw), file=sys.stderr)
return None
name = match.groupdict().get('name')
if not name:
return None
- if name.startswith('cc'):
- name = re.sub(r"-[.0-9-]*html", "", name)
- return name
if len(name) > 36:
return None
- return name
+ # cleanup version and extensions
+ name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower())
+ return name.upper()
if 'rightsstatements.org' in raw:
# http://rightsstatements.org/vocab/InC/1.0/
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index afee06cc..20c1eaf8 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -340,6 +340,20 @@ def test_lookup_license_slug():
Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'),
Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'),
Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'),
+ Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'),
+ Case('http://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'),
+ Case('https://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'),
+ Case('https://creativecommons.org/publicdomain/mark/1.0/', 'CC-PUBLICDOMAIN'),
+ Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'),
+ Case('https://creativecommons.org/share-your-work/public-domain/cc0/', 'CC-0'),
+ Case('http://spdx.org/licenses/CC0-1.0.json', 'CC-0'),
+ Case('http://spdx.org/licenses/CC-BY-1.0.json', 'CC-BY'),
+ Case('http://spdx.org/licenses/CC-BY-4.0.json', 'CC-BY'),
+ Case('http://spdx.org/licenses/CC-BY-NC-4.0.json', 'CC-BY-NC'),
+ Case('http://spdx.org/licenses/CC-BY-SA-3.0.json', 'CC-BY-SA'),
+ Case('http://spdx.org/licenses/CC-BY-SA-4.0.json', 'CC-BY-SA'),
+ Case('http://spdx.org/licenses/MIT.json', 'MIT'),
+ Case('http://spdx.org/licenses/OGL-Canada-2.0.json', 'OGL-CANADA'),
]
for c in cases: