aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/normal.py
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2021-11-11 01:12:18 +0000
committerbnewbold <bnewbold@archive.org>2021-11-11 01:12:18 +0000
commit6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/normal.py
parent7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
downloadfatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz
fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations

Some of these are from old stale branches (the datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes.

The Datacite-specific stuff could use review here.

Remove unused/deprecated/dead code:
- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, was only used for initial bulk imports (and maybe should not have been used)

Refactors:
- moved a number of large datastructures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all, just the ones that were either generic or very large (making it hard to read code)
- shuffled around relative imports and some function names ("clean_str" vs. "clean")

Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs, that was causing more harm than help (some DOIs do actually have double-slashes!)
- remove some excess metadata from datacite 'extra' fields
Diffstat (limited to 'python/fatcat_tools/normal.py')
-rw-r--r--python/fatcat_tools/normal.py121
1 files changed, 39 insertions, 82 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 34e5c3d1..dd0a4f74 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -12,6 +12,8 @@ import ftfy
import langdetect
import pycountry
+from .biblio_lookup_tables import LICENSE_SLUG_MAP
+
DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
@@ -47,7 +49,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]:
raw = raw[8:]
if raw.startswith("dx.doi.org/"):
raw = raw[11:]
- if raw[7:9] == "//":
+ if raw[7:9] == "//" and "10.1037//" in raw:
raw = raw[:8] + raw[9:]
# fatcatd uses same REGEX, but Rust regex rejects these characters, while
@@ -74,6 +76,7 @@ def test_clean_doi() -> None:
assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+ assert clean_doi("10.1026//1616-1041.3.2.86") == "10.1026//1616-1041.3.2.86"
assert clean_doi("10.23750/abm.v88i2 -s.6506") is None
assert clean_doi("10.17167/mksz.2017.2.129–155") is None
assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
@@ -605,84 +608,38 @@ def test_parse_country_name() -> None:
assert parse_country_name("Japan") == "jp"
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
- "afr": "af",
- "alb": "sq",
- "amh": "am",
- "ara": "ar",
- "arm": "hy",
- "aze": "az",
- "ben": "bn",
- "bos": "bs",
- "bul": "bg",
- "cat": "ca",
- "chi": "zh",
- "cze": "cs",
- "dan": "da",
- "dut": "nl",
- "eng": "en",
- "epo": "eo",
- "est": "et",
- "fin": "fi",
- "fre": "fr",
- "geo": "ka",
- "ger": "de",
- "gla": "gd",
- "gre": "el",
- "heb": "he",
- "hin": "hi",
- "hrv": "hr",
- "hun": "hu",
- "ice": "is",
- "ind": "id",
- "ita": "it",
- "jpn": "ja",
- "kin": "rw",
- "kor": "ko",
- "lat": "la",
- "lav": "lv",
- "lit": "lt",
- "mac": "mk",
- "mal": "ml",
- "mao": "mi",
- "may": "ms",
- "nor": "no",
- "per": "fa",
- "per": "fa",
- "pol": "pl",
- "por": "pt",
- "pus": "ps",
- "rum": "ro",
- "rus": "ru",
- "san": "sa",
- "slo": "sk",
- "slv": "sl",
- "spa": "es",
- "srp": "sr",
- "swe": "sv",
- "tha": "th",
- "tur": "tr",
- "ukr": "uk",
- "urd": "ur",
- "vie": "vi",
- "wel": "cy",
- # additions
- "gle": "ga", # "Irish" (Gaelic)
- "jav": "jv", # Javanese
- "welsh": "cy", # Welsh
- "oci": "oc", # Occitan
- # Don't have ISO 639-1 codes
- "grc": "el", # Ancient Greek; map to modern greek
- "map": None, # Austronesian (collection)
- "syr": None, # Syriac, Modern
- "gem": None, # Old Saxon
- "non": None, # Old Norse
- "emg": None, # Eastern Meohang
- "neg": None, # Negidal
- "mul": None, # Multiple languages
- "und": None, # Undetermined
-}
+def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+    """
+    Map a raw license string (such as a license URL) to a license slug.
+
+    The input is trimmed, lower-cased, relieved of a single trailing slash,
+    and has every "http://" / "https://" rewritten to "//". Strings that
+    mention "creativecommons.org" additionally have every "/legalcode" and
+    "/uk" occurrence removed. The result is used as the key into
+    LICENSE_SLUG_MAP.
+
+    Returns None when the input is empty/None, or when the normalized key
+    has no entry in LICENSE_SLUG_MAP.
+    """
+    if not raw:
+        return None
+
+    key = raw.strip().lower()
+    # drop exactly one trailing slash, if there is one
+    key = key[:-1] if key.endswith("/") else key
+
+    # collapse the scheme so http and https variants share one lookup key
+    for scheme in ("http://", "https://"):
+        key = key.replace(scheme, "//")
+
+    # CC license URLs: ignore the legal-code page and the UK jurisdiction port
+    if "creativecommons.org" in key:
+        for fragment in ("/legalcode", "/uk"):
+            key = key.replace(fragment, "")
+
+    return LICENSE_SLUG_MAP.get(key)
+
+
+def test_lookup_license_slug() -> None:
+    """
+    Check lookup_license_slug() against known license URLs, an unknown URL,
+    and empty/None input.
+    """
+    # (raw input, expected slug) pairs which should resolve to a known slug
+    known = [
+        ("https://creativecommons.org/licenses/by-nc/3.0/", "CC-BY-NC"),
+        ("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"),
+        ("https://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"),
+        ("http://creativecommons.org/licenses/by/4.0", "CC-BY"),
+        ("https://creativecommons.org/licenses/by-nc-sa/4.0/", "CC-BY-NC-SA"),
+        ("https://www.ametsoc.org/PUBSReuseLicenses", "AMETSOC"),
+    ]
+    for raw, slug in known:
+        assert lookup_license_slug(raw) == slug
+
+    # inputs which should not resolve to any slug
+    unknown = ("https://www.amec.org/PUBSReuseLicenses", "", None)
+    for raw in unknown:
+        assert lookup_license_slug(raw) is None