aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/crossref.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/crossref.py')
-rw-r--r--python/fatcat_tools/importers/crossref.py94
1 files changed, 2 insertions, 92 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 8f5a4265..52bd7465 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,7 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
from .common import EntityImporter
@@ -33,97 +34,6 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
"standard": "standard",
}
-CONTAINER_TYPE_MAP: Dict[str, str] = {
- "article-journal": "journal",
- "paper-conference": "conference",
- "book": "book-series",
-}
-
-# These are based, informally, on sorting the most popular licenses found in
-# Crossref metadata. There were over 500 unique strings and only a few most
-# popular are here; many were variants of the CC URLs. Would be useful to
-# normalize CC licenses better.
-# The current norm is to only add license slugs that are at least partially OA.
-LICENSE_SLUG_MAP: Dict[str, str] = {
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//creativecommons.org/licenses/by/2.0/": "CC-BY",
- "//creativecommons.org/licenses/by/3.0/": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/": "CC-BY",
- "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
- "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
- "//spdx.org/licenses/CC0-1.0.json": "CC-0",
- "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
- "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
- "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
- "//spdx.org/licenses/MIT.json": "MIT",
- "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
- "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.karger.com/Services/SiteLicenses": "KARGER",
- "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
- "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
- "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
- "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
- # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
- # //www.springer.com/tdm doesn't seem like a license
- # //iopscience.iop.org/page/copyright is closed
- # //www.acm.org/publications/policies/copyright_policy#Background is closed
- # //rsc.li/journals-terms-of-use is closed for vor (am open)
- # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-}
-
-
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
- if not raw:
- return None
- raw = raw.strip().replace("http://", "//").replace("https://", "//")
- if "creativecommons.org" in raw.lower():
- raw = raw.lower()
- raw = raw.replace("/legalcode", "/").replace("/uk", "")
- if not raw.endswith("/"):
- raw = raw + "/"
- return LICENSE_SLUG_MAP.get(raw)
-
-
-def test_lookup_license_slug() -> None:
-
- assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
- assert (
- lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
- == "CC-BY"
- )
- assert (
- lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
- == "CC-0"
- )
- assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
- assert (
- lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
- == "CC-BY-NC-SA"
- )
- assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
- assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
- assert lookup_license_slug("") is None
- assert lookup_license_slug(None) is None
-
class CrossrefImporter(EntityImporter):
"""