Merge branch 'bnewbold-import-refactors' into 'master'

import refactors and deprecations

Some of these are from old stale branches (the Datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes. The Datacite-specific stuff could use review here.

Remove unused/deprecated/dead code:
- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, which was only used for initial bulk imports (and maybe should not have been used)

Refactors:
- moved a number of large data structures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all of them, just the ones that were either generic or very large (making the code hard to read)
- shuffled around relative imports and some function names ("clean_str" vs. "clean")

Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs; that was causing more harm than help (some DOIs do actually have double-slashes!). The fix-up is now applied only to the `10.1037//` prefix
- remove some excess metadata from Datacite 'extra' fields
author: bnewbold <bnewbold@archive.org> 2021-11-11 01:12:18 +0000
committer: bnewbold <bnewbold@archive.org> 2021-11-11 01:12:18 +0000
commit: 6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree: 1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/normal.py
parent: 7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent: 6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
download: fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz
fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip
1 files changed, 39 insertions, 82 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 34e5c3d1..dd0a4f74 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -12,6 +12,8 @@ import ftfy
 import langdetect
 import pycountry
 
+from .biblio_lookup_tables import LICENSE_SLUG_MAP
+
 DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
 
 
@@ -47,7 +49,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]:
         raw = raw[8:]
     if raw.startswith("dx.doi.org/"):
         raw = raw[11:]
-    if raw[7:9] == "//":
+    if raw[7:9] == "//" and "10.1037//" in raw:
         raw = raw[:8] + raw[9:]
 
     # fatcatd uses same REGEX, but Rust regex rejects these characters, while
@@ -74,6 +76,7 @@ def test_clean_doi() -> None:
     assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
     assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
     assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+    assert clean_doi("10.1026//1616-1041.3.2.86") == "10.1026//1616-1041.3.2.86"
     assert clean_doi("10.23750/abm.v88i2 -s.6506") is None
     assert clean_doi("10.17167/mksz.2017.2.129–155") is None
     assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
@@ -605,84 +608,38 @@ def test_parse_country_name() -> None:
     assert parse_country_name("Japan") == "jp"
 
 
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
-    "afr": "af",
-    "alb": "sq",
-    "amh": "am",
-    "ara": "ar",
-    "arm": "hy",
-    "aze": "az",
-    "ben": "bn",
-    "bos": "bs",
-    "bul": "bg",
-    "cat": "ca",
-    "chi": "zh",
-    "cze": "cs",
-    "dan": "da",
-    "dut": "nl",
-    "eng": "en",
-    "epo": "eo",
-    "est": "et",
-    "fin": "fi",
-    "fre": "fr",
-    "geo": "ka",
-    "ger": "de",
-    "gla": "gd",
-    "gre": "el",
-    "heb": "he",
-    "hin": "hi",
-    "hrv": "hr",
-    "hun": "hu",
-    "ice": "is",
-    "ind": "id",
-    "ita": "it",
-    "jpn": "ja",
-    "kin": "rw",
-    "kor": "ko",
-    "lat": "la",
-    "lav": "lv",
-    "lit": "lt",
-    "mac": "mk",
-    "mal": "ml",
-    "mao": "mi",
-    "may": "ms",
-    "nor": "no",
-    "per": "fa",
-    "per": "fa",
-    "pol": "pl",
-    "por": "pt",
-    "pus": "ps",
-    "rum": "ro",
-    "rus": "ru",
-    "san": "sa",
-    "slo": "sk",
-    "slv": "sl",
-    "spa": "es",
-    "srp": "sr",
-    "swe": "sv",
-    "tha": "th",
-    "tur": "tr",
-    "ukr": "uk",
-    "urd": "ur",
-    "vie": "vi",
-    "wel": "cy",
-    # additions
-    "gle": "ga",  # "Irish" (Gaelic)
-    "jav": "jv",  # Javanese
-    "welsh": "cy",  # Welsh
-    "oci": "oc",  # Occitan
-    # Don't have ISO 639-1 codes
-    "grc": "el",  # Ancient Greek; map to modern greek
-    "map": None,  # Austronesian (collection)
-    "syr": None,  # Syriac, Modern
-    "gem": None,  # Old Saxon
-    "non": None,  # Old Norse
-    "emg": None,  # Eastern Meohang
-    "neg": None,  # Negidal
-    "mul": None,  # Multiple languages
-    "und": None,  # Undetermined
-}
+def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+    if not raw:
+        return None
+    # normalize to lower-case and not ending with a slash
+    raw = raw.strip().lower()
+    if raw.endswith("/"):
+        raw = raw[:-1]
+    # remove http/https prefix
+    raw = raw.replace("http://", "//").replace("https://", "//")
+    # special-case normalization of CC licenses
+    if "creativecommons.org" in raw:
+        raw = raw.replace("/legalcode", "").replace("/uk", "")
+    return LICENSE_SLUG_MAP.get(raw)
+
+
+def test_lookup_license_slug() -> None:
+
+    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
+    assert (
+        lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
+        == "CC-BY"
+    )
+    assert (
+        lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
+        == "CC-0"
+    )
+    assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
+    assert (
+        lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
+        == "CC-BY-NC-SA"
+    )
+    assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
+    assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
+    assert lookup_license_slug("") is None
+    assert lookup_license_slug(None) is None
author	bnewbold <bnewbold@archive.org>	2021-11-11 01:12:18 +0000
committer	bnewbold <bnewbold@archive.org>	2021-11-11 01:12:18 +0000
commit	6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree	1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/normal.py
parent	7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent	6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
download	fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip