aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-01-09 15:02:17 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-01-09 15:02:17 +0100
commitffd2597d5e962e3f3a2ea23c66a135bb737b2390 (patch)
tree382644d68497c3f485e77028f060bb746bfbb947
parent8eab14ac12b17f4965ec5569853bf885ca6aacd2 (diff)
downloadfatcat-ffd2597d5e962e3f3a2ea23c66a135bb737b2390.tar.gz
fatcat-ffd2597d5e962e3f3a2ea23c66a135bb737b2390.zip
datacite: improve license_slug handling
-rw-r--r--python/fatcat_tools/importers/datacite.py161
-rw-r--r--python/tests/files/datacite/datacite_result_00.json3
-rw-r--r--python/tests/files/datacite/datacite_result_16.json1
-rw-r--r--python/tests/import_datacite.py31
4 files changed, 134 insertions, 62 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index ed8b0906..2f1e17d1 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -11,6 +11,7 @@ functions (parse_datacite_...), which can be tested more easily.
import collections
import datetime
import hashlib
+import re
import json
import sqlite3
import sys
@@ -154,62 +155,26 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist.
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
-# TODO(martin): merge this with other maps, maybe.
+# TODO(martin): merge this with other maps and lookup functions, eventually.
LICENSE_SLUG_MAP = {
- "//creativecommons.org/licenses/by/2.0/": "CC-BY",
- "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
- "//creativecommons.org/licenses/by/3.0/": "CC-BY",
- "//creativecommons.org/licenses/by/3.0/us": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/deed.de/": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/deed.en_US/": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/legalcode/": "CC-BY",
- "//creativecommons.org/licenses/by-nc/2.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
- "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
- "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
- "//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
- "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
- "//creativecommons.org/licenses/CC-BY/4.0/": "CC-BY",
- "//creativecommons.org/licenses/publicdomain/zero/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
- "//opensource.org/licenses/MIT": "MIT",
- "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
- "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
- "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
- "//www.karger.com/Services/SiteLicenses": "KARGER",
- "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
- "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
- "//www.opensource.org/licenses/EUPL-1.1":
- "EUPL-1.1", # redirects to EUPL-1.2
- "//www.opensource.org/licenses/MIT": "MIT",
- # "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
- # "http://rsc.li/journals-terms-of-use": "RSC",
- # "http://www.fu-berlin.de/sites/refubium/rechtliches/Nutzungsbedingungen": "", # 53 UrhG.
- # "http://www.nrcresearchpress.com/page/about/CorporateTextAndDataMining": "",
- # "http://www.springer.com/tdm": "",
- # "https://cds.unistra.fr/vizier-org/licences_vizier.html": "", # Maybe try to "SPN" those: https://web.archive.org/web/*/https://cds.unistra.fr/vizier-org/licences_vizier.html
- # "https://link.aps.org/licenses/aps-default-accepted-manuscript-license": "",
- # "https://oparu.uni-ulm.de/xmlui/license_opod_v1": "",
- # "https://publikationen.bibliothek.kit.edu/kitopen-lizenz": "",
- # "https://rightsstatements.org/page/InC/1.0?language=en": "",
- # "https://services.ceda.ac.uk/cedasite/register/info": "",
- # "https://wdc.dlr.de/ndmc/userfiles/file/NDMC-Data_Sharing_Principles.pdf": "", # 404
- # "https://www.cambridge.org/core/terms": "",
- # "https://www.elsevier.com/tdm/userlicense/1.0",
- # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights
- # "info:eu-repo/semantics/embargoedAccess": "",
- # "info:eu-repo/semantics/openAccess": "",
- # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice).
+ "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
+ "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
+ "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
+ "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
+ "//onlinelibrary.wiley.com/termsandconditions/": "WILEY",
+ "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
+ "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
+ "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
+ "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
+ "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
+ "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
+ "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+ "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
+ "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
+ "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
+ "//www.karger.com/Services/SiteLicenses/": "KARGER",
+ "//www.springer.com/tdm/": "SPRINGER-TDM",
+ "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
}
# TODO(martin): drop this after 3.7 upgrade
@@ -793,16 +758,92 @@ class DataciteImporter(EntityImporter):
def lookup_license_slug(raw):
"""
+ Resolve a variety of strings into a some pseudo-canonical form, e.g.
+ CC-BY-ND, CC-0, MIT and so on.
TODO(martin): reuse from or combine with crossref, maybe.
"""
if not raw:
return None
- raw = raw.strip().replace('http://', '//').replace('https://', '//')
- if 'creativecommons.org' in raw.lower():
+
+ if 'creativecommons.org/publicdomain/zero' in raw:
+ return 'CC-0'
+ if raw.lower().endswith('/cc0'):
+ return 'CC-0'
+
+ if 'creativecommons' in raw:
+ # https://creativecommons.org/licenses/by/4.0/deed.es_ES
raw = raw.lower()
- raw = raw.replace('/legalcode', '/').replace('/uk', '')
- if not raw.endswith('/'):
- raw = raw + '/'
+ match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw)
+ if not match:
+ print('missed potential license: {}'.format(raw), file=sys.stderr)
+ return None
+ name = match.groupdict().get('name')
+ if not name:
+ return None
+ if not name.startswith('cc'):
+ name = 'cc-{}'.format(name)
+ return name.upper()
+
+ if 'opensource.org' in raw:
+ # https://opensource.org/licenses/alphabetical, e.g. opensource.org/licenses/EUPL-1.2
+ match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw)
+ if not match:
+ print('missed potential license: {}'.format(raw), file=sys.stderr)
+ return None
+ name = match.groupdict().get('name')
+ if not name:
+ return None
+ if len(name) > 11:
+ return None
+ return name
+
+ if 'gnu.org' in raw:
+ # http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
+ match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw)
+ if not match:
+ print('missed potential license: {}'.format(raw), file=sys.stderr)
+ return None
+ name = match.groupdict().get('name')
+ if not name:
+ return None
+ if len(name) > 8:
+ return None
+ return name.upper()
+
+ if 'spdx.org' in raw:
+ # https://spdx.org/licenses/CC-BY-NC-ND-4.0.html
+ match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw)
+ if not match:
+ print('missed potential license: {}'.format(raw), file=sys.stderr)
+ return None
+ name = match.groupdict().get('name')
+ if not name:
+ return None
+ if name.startswith('cc'):
+ name = re.sub(r"-[.0-9-]*html", "", name)
+ return name
+ if len(name) > 36:
+ return None
+ return name
+
+ if 'rightsstatements.org' in raw:
+ # http://rightsstatements.org/vocab/InC/1.0/
+ match = re.search(r'rightsstatements.org/(vocab|page)/(?P<name>[^/]*)', raw)
+ if not match:
+ print('missed potential license: {}'.format(raw), file=sys.stderr)
+ return None
+ name = match.groupdict().get('name')
+ if not name:
+ return None
+ if len(name) > 9:
+ return None
+ return 'RS-{}'.format(name.upper())
+
+ # Fallback to mapped values.
+ raw = raw.lower()
+ raw = raw.strip().replace('http://', '//').replace('https://', '//')
+ if not raw.endswith('/'):
+ raw = raw + '/'
return LICENSE_SLUG_MAP.get(raw)
diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json
index 06147cfc..130a46ff 100644
--- a/python/tests/files/datacite/datacite_result_00.json
+++ b/python/tests/files/datacite/datacite_result_00.json
@@ -87,5 +87,6 @@
"release_type": "article-journal",
"release_year": 2019,
"title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea",
- "volume": "38"
+ "volume": "38",
+ "license_slug": "SPRINGER-TDM"
}
diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json
index ad585e3c..ef26c47c 100644
--- a/python/tests/files/datacite/datacite_result_16.json
+++ b/python/tests/files/datacite/datacite_result_16.json
@@ -24,6 +24,7 @@
"resourceTypeGeneral": "Dataset"
}
},
+ "license_slug": "CC-BY",
"publisher": "Figshare",
"refs": [],
"release_stage": "published",
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 5ad7ef2c..89671cd2 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -7,7 +7,7 @@ import datetime
import pytest
import gzip
from fatcat_tools.importers import DataciteImporter, JsonLinePusher
-from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi, index_form_to_display_name
+from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi, index_form_to_display_name, lookup_license_slug
from fatcat_tools.transforms import entity_to_dict
from fixtures import api
import json
@@ -315,3 +315,32 @@ def test_index_form_to_display_name():
for c in cases:
assert c.output == index_form_to_display_name(c.input)
+
+def test_lookup_license_slug():
+ Case = collections.namedtuple('Case', 'input output')
+ cases = [
+ Case('https://opensource.org/licenses/MIT', 'MIT'),
+ Case('creativecommons.org/licenses/by-nc-nd/3.0/', 'CC-BY-NC-ND'),
+ Case('http://creativecommons.org/licences/by-nc-sa/4.0', 'CC-BY-NC-SA'),
+ Case('http://creativecommons.org/licenses/by-nc-nd/2.5/co', 'CC-BY-NC-ND'),
+ Case('http://creativecommons.org/licenses/by-nd/4.0/legalcode', 'CC-BY-ND'),
+ Case('http://creativecommons.org/licenses/by/2.0/uk/legalcode', 'CC-BY'),
+ Case('http://creativecommons.org/publicdomain/zero/1.0/legalcode', 'CC-0'),
+ Case('http://doi.wiley.com/10.1002/tdm_license_1.1', 'WILEY-TDM-1.1'),
+ Case('http://homepage.data-planet.com/terms-use', 'SAGE-DATA-PLANET'),
+ Case('http://www.springer.com/tdm', 'SPRINGER-TDM'),
+ Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml', 'ADS-UK'),
+ Case('https://creativecommons.org/public-domain/cc0', 'CC-0'),
+ Case('https://creativecommons.org/publicdomain/zero/1.0', 'CC-0'),
+ Case('https://creativecommons.org/share-your-work/public-domain/cc0', 'CC-0'),
+ Case('https://www.elsevier.com/tdm/userlicense/1.0', 'ELSEVIER-USER-1.0'),
+ Case('https://www.gnu.org/licenses/gpl-3.0.html', 'GPL-3.0'),
+ Case('http://rightsstatements.org/page/InC/1.0?language=en', 'RS-INC'),
+ Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'),
+ Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'),
+ Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'),
+ ]
+
+ for c in cases:
+ got = lookup_license_slug(c.input)
+ assert c.output == got, '{}: got {}, want {}'.format(c.input, got, c.output)