refactor importer metadata tables into separate file; move some helpers around

- MAX_ABSTRACT_LENGTH set in a single place (importer common)
- merge datacite license slug table into common table, removing some TDM-specific licenses (which do not apply in the context of preserving the full work)
author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:52:39 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:52:43 -0800
commit: ddc757bc1d5c610f42e9f5f10a4f060f517b66ca (patch)
tree: 86ccdef998bd3da3910cfe8fb9f2177b58a664a0
parent: 16e9979a6f347b49764c1141209e84083ea81057 (diff)
download: fatcat-ddc757bc1d5c610f42e9f5f10a4f060f517b66ca.tar.gz
fatcat-ddc757bc1d5c610f42e9f5f10a4f060f517b66ca.zip
10 files changed, 682 insertions, 702 deletions
diff --git a/python/fatcat_tools/biblio_lookup_tables.py b/python/fatcat_tools/biblio_lookup_tables.py
new file mode 100644
index 00000000..a9a097ae
--- /dev/null
+++ b/python/fatcat_tools/biblio_lookup_tables.py
@@ -0,0 +1,623 @@
+"""
+This file contains lookup tables and other static data structures used in
+bibliographic metadata munging.
+"""
+
+from typing import Dict, Optional
+
+# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
+# 2/T and 2/B?
+# PubMed/MEDLINE and JSTOR use these MARC codes
+# https://www.loc.gov/marc/languages/language_name.html
+LANG_MAP_MARC: Dict[str, Optional[str]] = {
+    "afr": "af",
+    "alb": "sq",
+    "amh": "am",
+    "ara": "ar",
+    "arm": "hy",
+    "aze": "az",
+    "ben": "bn",
+    "bos": "bs",
+    "bul": "bg",
+    "cat": "ca",
+    "chi": "zh",
+    "cze": "cs",
+    "dan": "da",
+    "dut": "nl",
+    "eng": "en",
+    "epo": "eo",
+    "est": "et",
+    "fin": "fi",
+    "fre": "fr",
+    "geo": "ka",
+    "ger": "de",
+    "gla": "gd",
+    "gre": "el",
+    "heb": "he",
+    "hin": "hi",
+    "hrv": "hr",
+    "hun": "hu",
+    "ice": "is",
+    "ind": "id",
+    "ita": "it",
+    "jpn": "ja",
+    "kin": "rw",
+    "kor": "ko",
+    "lat": "la",
+    "lav": "lv",
+    "lit": "lt",
+    "mac": "mk",
+    "mal": "ml",
+    "mao": "mi",
+    "may": "ms",
+    "nor": "no",
+    "per": "fa",
+    "per": "fa",
+    "pol": "pl",
+    "por": "pt",
+    "pus": "ps",
+    "rum": "ro",
+    "rus": "ru",
+    "san": "sa",
+    "slo": "sk",
+    "slv": "sl",
+    "spa": "es",
+    "srp": "sr",
+    "swe": "sv",
+    "tha": "th",
+    "tur": "tr",
+    "ukr": "uk",
+    "urd": "ur",
+    "vie": "vi",
+    "wel": "cy",
+    # additions
+    "gle": "ga",  # "Irish" (Gaelic)
+    "jav": "jv",  # Javanese
+    "welsh": "cy",  # Welsh
+    "oci": "oc",  # Occitan
+    # Don't have ISO 639-1 codes
+    "grc": "el",  # Ancient Greek; map to modern greek
+    "map": None,  # Austronesian (collection)
+    "syr": None,  # Syriac, Modern
+    "gem": None,  # Old Saxon
+    "non": None,  # Old Norse
+    "emg": None,  # Eastern Meohang
+    "neg": None,  # Negidal
+    "mul": None,  # Multiple languages
+    "und": None,  # Undetermined
+}
+
+# these are mappings from web domains to URL 'rel' for things like file entity
+# URL notation
+DOMAIN_REL_MAP: Dict[str, str] = {
+    "archive.org": "archive",
+    # LOCKSS, Portico, DuraSpace, etc would also be "archive"
+    "arxiv.org": "repository",
+    "babel.hathitrust.org": "repository",
+    "cds.cern.ch": "repository",
+    "deepblue.lib.umich.edu": "repository",
+    "europepmc.org": "repository",
+    "hal.inria.fr": "repository",
+    "scielo.isciii.es": "repository",
+    "www.dtic.mil": "repository",
+    "www.jstage.jst.go.jp": "repository",
+    "www.jstor.org": "repository",
+    "www.ncbi.nlm.nih.gov": "repository",
+    "ftp.ncbi.nlm.nih.gov": "repository",
+    "www.scielo.br": "repository",
+    "www.scielo.cl": "repository",
+    "www.scielo.org.mx": "repository",
+    "zenodo.org": "repository",
+    "www.biorxiv.org": "repository",
+    "www.medrxiv.org": "repository",
+    "citeseerx.ist.psu.edu": "aggregator",
+    "publisher-connector.core.ac.uk": "aggregator",
+    "core.ac.uk": "aggregator",
+    "static.aminer.org": "aggregator",
+    "aminer.org": "aggregator",
+    "pdfs.semanticscholar.org": "aggregator",
+    "semanticscholar.org": "aggregator",
+    "www.semanticscholar.org": "aggregator",
+    "academic.oup.com": "publisher",
+    "cdn.elifesciences.org": "publisher",
+    "cell.com": "publisher",
+    "dl.acm.org": "publisher",
+    "downloads.hindawi.com": "publisher",
+    "elifesciences.org": "publisher",
+    "iopscience.iop.org": "publisher",
+    "journals.plos.org": "publisher",
+    "link.springer.com": "publisher",
+    "onlinelibrary.wiley.com": "publisher",
+    "works.bepress.com": "publisher",
+    "www.biomedcentral.com": "publisher",
+    "www.cell.com": "publisher",
+    "www.nature.com": "publisher",
+    "www.pnas.org": "publisher",
+    "www.tandfonline.com": "publisher",
+    "www.frontiersin.org": "publisher",
+    "www.degruyter.com": "publisher",
+    "www.mdpi.com": "publisher",
+    "www.ahajournals.org": "publisher",
+    "ehp.niehs.nih.gov": "publisher",
+    "journals.tsu.ru": "publisher",
+    "www.cogentoa.com": "publisher",
+    "www.researchgate.net": "academicsocial",
+    "academia.edu": "academicsocial",
+    "wayback.archive-it.org": "webarchive",
+    "web.archive.org": "webarchive",
+    "archive.is": "webarchive",
+}
+
+# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
+PUBMED_RELEASE_TYPE_MAP = {
+    # Adaptive Clinical Trial
+    "Address": "speech",
+    "Autobiography": "book",
+    # Bibliography
+    "Biography": "book",
+    # Case Reports
+    "Classical Article": "article-journal",
+    # Clinical Conference
+    # Clinical Study
+    # Clinical Trial
+    # Clinical Trial, Phase I
+    # Clinical Trial, Phase II
+    # Clinical Trial, Phase III
+    # Clinical Trial, Phase IV
+    # Clinical Trial Protocol
+    # Clinical Trial, Veterinary
+    # Collected Works
+    # Comparative Study
+    # Congress
+    # Consensus Development Conference
+    # Consensus Development Conference, NIH
+    # Controlled Clinical Trial
+    "Dataset": "dataset",
+    # Dictionary
+    # Directory
+    # Duplicate Publication
+    "Editorial": "editorial",
+    # English Abstract   # doesn't indicate that this is abstract-only
+    # Equivalence Trial
+    # Evaluation Studies
+    # Expression of Concern
+    # Festschrift
+    # Government Document
+    # Guideline
+    "Historical Article": "article-journal",
+    # Interactive Tutorial
+    "Interview": "interview",
+    "Introductory Journal Article": "article-journal",
+    "Journal Article": "article-journal",
+    "Lecture": "speech",
+    "Legal Case": "legal_case",
+    "Legislation": "legislation",
+    "Letter": "letter",
+    # Meta-Analysis
+    # Multicenter Study
+    # News
+    "Newspaper Article": "article-newspaper",
+    # Observational Study
+    # Observational Study, Veterinary
+    # Overall
+    # Patient Education Handout
+    # Periodical Index
+    # Personal Narrative
+    # Portrait
+    # Practice Guideline
+    # Pragmatic Clinical Trial
+    # Publication Components
+    # Publication Formats
+    # Publication Type Category
+    # Randomized Controlled Trial
+    # Research Support, American Recovery and Reinvestment Act
+    # Research Support, N.I.H., Extramural
+    # Research Support, N.I.H., Intramural
+    # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
+    # Research Support, U.S. Gov't, P.H.S.
+    # Review     # in the "literature review" sense, not "product review"
+    # Scientific Integrity Review
+    # Study Characteristics
+    # Support of Research
+    # Systematic Review
+    "Technical Report": "report",
+    # Twin Study
+    # Validation Studies
+    # Video-Audio Media
+    # Webcasts
+}
+
+MONTH_ABBR_MAP: Dict[str, int] = {
+    "Jan": 1,
+    "01": 1,
+    "Feb": 2,
+    "02": 2,
+    "Mar": 3,
+    "03": 3,
+    "Apr": 4,
+    "04": 4,
+    "May": 5,
+    "05": 5,
+    "Jun": 6,
+    "06": 6,
+    "Jul": 7,
+    "07": 7,
+    "Aug": 8,
+    "08": 8,
+    "Sep": 9,
+    "09": 9,
+    "Oct": 10,
+    "10": 10,
+    "Nov": 11,
+    "11": 11,
+    "Dec": 12,
+    "12": 12,
+}
+
+# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
+COUNTRY_NAME_MAP: Dict[str, str] = {
+    "Afghanistan": "af",
+    "Albania": "al",
+    "Algeria": "dz",
+    "Andorra": "ad",
+    "Angola": "ao",
+    "Antigua and Barbuda": "ag",
+    "Argentina": "ar",
+    "Armenia": "am",
+    "Australia": "au",
+    "Austria": "at",
+    "Azerbaijan": "az",
+    "Bahamas": "bs",
+    "Bahrain": "bh",
+    "Bangladesh": "bd",
+    "Barbados": "bb",
+    "Belarus": "by",
+    "Belgium": "be",
+    "Belize": "bz",
+    "Benin": "bj",
+    "Bhutan": "bt",
+    "Bolivia": "bo",
+    "Bosnia and Herzegowina": "ba",
+    "Botswana": "bw",
+    "Brazil": "br",
+    "Brunei Darussalam": "bn",
+    "Bulgaria": "bg",
+    "Burkina Faso": "bf",
+    "Burundi": "bi",
+    "Cambodia": "kh",
+    "Cameroon": "cm",
+    "Canada": "ca",
+    "Cape Verde": "cv",
+    "Central African Republic": "cf",
+    "Chad": "td",
+    "Chile": "cl",
+    "China": "cn",
+    "Colombia": "co",
+    "Comoros": "km",
+    "Congo, Democratic Republic": "cd",
+    "Congo, People’s Republic": "cg",
+    "Costa Rica": "cr",
+    "Cote d'Ivoire": "ci",
+    "Croatia (Local Name: Hrvatska)": "hr",
+    "Cuba": "cu",
+    "Cyprus": "cy",
+    "Czech Republic": "cz",
+    "Denmark": "dk",
+    "Djibouti": "dj",
+    "Dominica": "dm",
+    "Dominican Republic": "do",
+    "East Timor": "tl",
+    "Ecuador": "ec",
+    "El Salvador": "sv",
+    "Equatorial Guinea": "gq",
+    "Eritrea": "er",
+    "Estonia": "ee",
+    "Ethiopia": "et",
+    "Fiji": "fj",
+    "Finland": "fi",
+    "France": "fr",
+    "Gabon": "ga",
+    "Gambia": "gm",
+    "Georgia": "ge",
+    "Germany": "de",
+    "Ghana": "gh",
+    "Greece": "gr",
+    "Greenland": "gl",
+    "Grenada": "gd",
+    "Guatemala": "gt",
+    "Guinea": "gn",
+    "Guinea-Bissau": "gw",
+    "Guyana": "gy",
+    "Haiti": "ht",
+    "Honduras": "hn",
+    "Hong Kong": "hk",
+    "Hungary": "hu",
+    "Iceland": "is",
+    "India": "in",
+    "Indonesia": "id",
+    "Iran": "ir",
+    "Iraq": "iq",
+    "Ireland": "ie",
+    "Israel": "il",
+    "Italy": "it",
+    "Jamaica": "jm",
+    "Japan": "jp",
+    "Jordan": "jo",
+    "Kazakhstan": "kz",
+    "Kenya": "ke",
+    "Kiribati": "ki",
+    "Korea, Democratic People's Republic": "kp",
+    "Korea, Republic": "kr",
+    "Kuwait": "kw",
+    "Kyrgyzstan": "kg",
+    "Laos": "la",
+    "Latvia": "lv",
+    "Lebanon": "lb",
+    "Lesotho": "ls",
+    "Liberia": "lr",
+    "Libya": "ly",
+    "Liechtenstein": "li",
+    "Lithuania": "lt",
+    "Luxembourg": "lu",
+    "Macedonia": "mk",
+    "Madagascar": "mg",
+    "Malawi": "mw",
+    "Malaysia": "my",
+    "Maldives": "mv",
+    "Mali": "ml",
+    "Malta": "mt",
+    "Marshall Islands": "mh",
+    "Mauritania": "mr",
+    "Mauritius": "mu",
+    "Mexico": "mx",
+    "Micronesia": "fm",
+    "Moldova": "md",
+    "Monaco": "mc",
+    "Mongolia": "mn",
+    "Morocco": "ma",
+    "Mozambique": "mz",
+    "Myanmar": "mm",
+    "Namibia": "na",
+    "Nauru": "nr",
+    "Nepal": "np",
+    "Netherlands": "nl",
+    "New Zealand": "nz",
+    "Nicaragua": "ni",
+    "Niger": "ne",
+    "Nigeria": "ng",
+    "Norway": "no",
+    "Oman": "om",
+    "Pakistan": "pk",
+    "Palau": "pw",
+    "Panama": "pa",
+    "Papua New Guinea": "pg",
+    "Paraguay": "py",
+    "Peru": "pe",
+    "Philippines": "ph",
+    "Poland": "pl",
+    "Portugal": "pt",
+    "Puerto Rico": "pr",
+    "Qatar": "qa",
+    "Romania": "ro",
+    "Russian Federation": "ru",
+    "Rwanda": "rw",
+    "Saint Kitts and Nevis": "kn",
+    "Saint Lucia": "lc",
+    "Saint Vincent and the Grenadines": "vc",
+    "Samoa": "ws",
+    "San Marino": "sm",
+    "Sao Tome and Príncipe": "st",
+    "Saudi Arabia": "sa",
+    "Senegal": "sn",
+    "Serbia and Montenegro": "cs",
+    "Seychelles": "sc",
+    "Sierra Leone": "sl",
+    "Singapore": "sg",
+    "Slovakia (Slovak Republic)": "sk",
+    "Slovenia": "si",
+    "Solomon Islands": "sb",
+    "Somalia": "so",
+    "South Africa": "za",
+    "Spain": "es",
+    "Sri Lanka": "lk",
+    "Sudan": "sd",
+    "Suriname": "sr",
+    "Swaziland": "sz",
+    "Sweden": "se",
+    "Switzerland": "ch",
+    "Syrian Arab Republic": "sy",
+    "Taiwan": "tw",
+    "Tajikistan": "tj",
+    "Tanzania": "tz",
+    "Tanzania": "tz",
+    "Thailand": "th",
+    "Togo": "tg",
+    "Tonga": "to",
+    "Trinidad and Tobago": "tt",
+    "Tunisia": "tn",
+    "Turkey": "tr",
+    "Turkmenistan": "tm",
+    "Tuvalu": "tv",
+    "Uganda": "ug",
+    "Ukraine": "ua",
+    "United Arab Emirates": "ae",
+    "United Kingdom": "gb",
+    "United States": "us",
+    "Uruguay": "uy",
+    # Additions from running over large files
+    "Bosnia and Herzegovina": "ba",
+    # "International"
+    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn
+    "Russia (Federation)": "ru",
+    "Scotland": "gb",
+    "England": "gb",
+    "Korea (South)": "kr",
+    "Georgia (Republic)": "ge",
+    "Egypt": "eg",
+}
+
+CONTAINER_TYPE_MAP: Dict[str, str] = {
+    "article-journal": "journal",
+    "paper-conference": "conference",
+    "book": "book-series",
+}
+
+# These are based, informally, on sorting the most popular licenses found in
+# Crossref metadata. There were over 500 unique strings and only a few most
+# popular are here; many were variants of the CC URLs. Would be useful to
+# normalize CC licenses better.
+# The current norm is to only add license slugs that are at least partially OA.
+LICENSE_SLUG_MAP: Dict[str, str] = {
+    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
+    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
+    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
+    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
+    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
+    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
+    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
+    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
+    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
+    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
+    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
+    "//spdx.org/licenses/MIT.json": "MIT",
+    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
+    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+    "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
+    "//www.karger.com/Services/SiteLicenses": "KARGER",
+    "//www.karger.com/Services/SiteLicenses/": "KARGER",
+    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
+    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
+    "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
+    "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
+    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
+    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
+    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
+    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
+    "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
+    "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
+    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
+    "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
+    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
+    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
+    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
+    "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
+    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
+    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+    # //www.springer.com/tdm doesn't seem like a license
+    # //iopscience.iop.org/page/copyright is closed
+    # //www.acm.org/publications/policies/copyright_policy#Background is closed
+    # //rsc.li/journals-terms-of-use is closed for vor (am open)
+    # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
+    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
+    # skip these TDM licenses; they don't apply to content
+    # "//www.springer.com/tdm/": "SPRINGER-TDM",
+    # "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
+    # "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
+}
+
+# Map the various datacite type vocabularies (keyed by vocabulary name) to CSL-ish types. None means TODO or remove.
+DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
+    "ris": {
+        "THES": "thesis",
+        "SOUND": "song",  # 99.9% maps to citeproc song, so use that (exception: report)
+        "CHAP": "chapter",
+        "FIGURE": "figure",
+        "RPRT": "report",
+        "JOUR": "article-journal",
+        "MPCT": "motion_picture",
+        "GEN": "article-journal",  # GEN consist of 99% article and report, post-weblog, misc - and one dataset
+        "BOOK": "book",
+        "DATA": "dataset",
+        "COMP": "software",
+    },
+    "schemaOrg": {
+        "Dataset": "dataset",
+        "Book": "book",
+        "ScholarlyArticle": "article-journal",
+        "ImageObject": "graphic",
+        "Collection": None,
+        "MediaObject": None,
+        "Event": None,
+        "SoftwareSourceCode": "software",
+        "Chapter": "chapter",
+        "CreativeWork": None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
+        "PublicationIssue": "article",
+        "AudioObject": None,
+        "Thesis": "thesis",
+    },
+    "citeproc": {
+        "article": "article",
+        "article-journal": "article-journal",
+        "article-magazine": "article-magazine",
+        "article-newspaper": "article-newspaper",
+        "bill": "bill",
+        "book": "book",
+        "broadcast": "broadcast",
+        "chapter": "chapter",
+        "dataset": "dataset",
+        "entry-dictionary": "entry-dictionary",
+        "entry-encyclopedia": "entry-encyclopedia",
+        "entry": "entry",
+        "figure": "figure",
+        "graphic": "graphic",
+        "interview": "interview",
+        "legal_case": "legal_case",
+        "legislation": "legislation",
+        "manuscript": "manuscript",
+        "map": "map",
+        "motion_picture": "motion_picture",
+        "musical_score": "musical_score",
+        "pamphlet": "pamphlet",
+        "paper-conference": "paper-conference",
+        "patent": "patent",
+        "personal_communication": "personal_communication",
+        "post": "post",
+        "post-weblog": "post-weblog",
+        "report": "report",
+        "review-book": "review-book",
+        "review": "review",
+        "song": "song",
+        "speech": "speech",
+        "thesis": "thesis",
+        "treaty": "treaty",
+        "webpage": "webpage",
+    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
+    "bibtex": {
+        "phdthesis": "thesis",
+        "inbook": "chapter",
+        "misc": None,
+        "article": "article-journal",
+        "book": "book",
+    },
+    "resourceTypeGeneral": {
+        "Image": "graphic",
+        "Dataset": "dataset",
+        "PhysicalObject": None,
+        "Collection": None,
+        "Text": None,  # "Greyliterature, labnotes, accompanyingmaterials"
+        "Sound": None,
+        "InteractiveResource": None,
+        "Event": None,
+        "Software": "software",
+        "Other": None,
+        "Workflow": None,
+        "Audiovisual": None,
+    },  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
+}
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 4d4d696b..654be2e9 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -26,9 +26,8 @@ from .common import (
     KafkaJsonPusher,
     LinePusher,
     SqlitePusher,
-    make_kafka_consumer,
 )
-from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug
+from .crossref import CrossrefImporter
 from .datacite import DataciteImporter
 from .dblp_container import DblpContainerImporter
 from .dblp_release import DblpReleaseImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 56c3d32e..7c587395 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,71 +27,14 @@ from fatcat_openapi_client import (
 from fatcat_openapi_client.rest import ApiException
 from fuzzycat.matching import match_release_fuzzy
 
+from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP
 from fatcat_tools.normal import clean_doi
 from fatcat_tools.transforms import entity_to_dict
 
 DATE_FMT: str = "%Y-%m-%d"
 SANE_MAX_RELEASES: int = 200
 SANE_MAX_URLS: int = 100
-
-DOMAIN_REL_MAP: Dict[str, str] = {
-    "archive.org": "archive",
-    # LOCKSS, Portico, DuraSpace, etc would also be "archive"
-    "arxiv.org": "repository",
-    "babel.hathitrust.org": "repository",
-    "cds.cern.ch": "repository",
-    "deepblue.lib.umich.edu": "repository",
-    "europepmc.org": "repository",
-    "hal.inria.fr": "repository",
-    "scielo.isciii.es": "repository",
-    "www.dtic.mil": "repository",
-    "www.jstage.jst.go.jp": "repository",
-    "www.jstor.org": "repository",
-    "www.ncbi.nlm.nih.gov": "repository",
-    "ftp.ncbi.nlm.nih.gov": "repository",
-    "www.scielo.br": "repository",
-    "www.scielo.cl": "repository",
-    "www.scielo.org.mx": "repository",
-    "zenodo.org": "repository",
-    "www.biorxiv.org": "repository",
-    "www.medrxiv.org": "repository",
-    "citeseerx.ist.psu.edu": "aggregator",
-    "publisher-connector.core.ac.uk": "aggregator",
-    "core.ac.uk": "aggregator",
-    "static.aminer.org": "aggregator",
-    "aminer.org": "aggregator",
-    "pdfs.semanticscholar.org": "aggregator",
-    "semanticscholar.org": "aggregator",
-    "www.semanticscholar.org": "aggregator",
-    "academic.oup.com": "publisher",
-    "cdn.elifesciences.org": "publisher",
-    "cell.com": "publisher",
-    "dl.acm.org": "publisher",
-    "downloads.hindawi.com": "publisher",
-    "elifesciences.org": "publisher",
-    "iopscience.iop.org": "publisher",
-    "journals.plos.org": "publisher",
-    "link.springer.com": "publisher",
-    "onlinelibrary.wiley.com": "publisher",
-    "works.bepress.com": "publisher",
-    "www.biomedcentral.com": "publisher",
-    "www.cell.com": "publisher",
-    "www.nature.com": "publisher",
-    "www.pnas.org": "publisher",
-    "www.tandfonline.com": "publisher",
-    "www.frontiersin.org": "publisher",
-    "www.degruyter.com": "publisher",
-    "www.mdpi.com": "publisher",
-    "www.ahajournals.org": "publisher",
-    "ehp.niehs.nih.gov": "publisher",
-    "journals.tsu.ru": "publisher",
-    "www.cogentoa.com": "publisher",
-    "www.researchgate.net": "academicsocial",
-    "academia.edu": "academicsocial",
-    "wayback.archive-it.org": "webarchive",
-    "web.archive.org": "webarchive",
-    "archive.is": "webarchive",
-}
+MAX_ABSTRACT_LENGTH: int = 2048
 
 
 def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]:
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 8f5a4265..52bd7465 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,7 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
 
 from .common import EntityImporter
 
@@ -33,97 +34,6 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
     "standard": "standard",
 }
 
-CONTAINER_TYPE_MAP: Dict[str, str] = {
-    "article-journal": "journal",
-    "paper-conference": "conference",
-    "book": "book-series",
-}
-
-# These are based, informally, on sorting the most popular licenses found in
-# Crossref metadata. There were over 500 unique strings and only a few most
-# popular are here; many were variants of the CC URLs. Would be useful to
-# normalize CC licenses better.
-# The current norm is to only add license slugs that are at least partially OA.
-LICENSE_SLUG_MAP: Dict[str, str] = {
-    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
-    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
-    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
-    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
-    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
-    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
-    "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
-    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
-    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
-    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
-    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
-    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
-    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
-    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
-    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/MIT.json": "MIT",
-    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.karger.com/Services/SiteLicenses": "KARGER",
-    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
-    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
-    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
-    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
-    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
-    "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
-    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
-    # //www.springer.com/tdm doesn't seem like a license
-    # //iopscience.iop.org/page/copyright is closed
-    # //www.acm.org/publications/policies/copyright_policy#Background is closed
-    # //rsc.li/journals-terms-of-use is closed for vor (am open)
-    # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
-    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-}
-
-
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
-    if not raw:
-        return None
-    raw = raw.strip().replace("http://", "//").replace("https://", "//")
-    if "creativecommons.org" in raw.lower():
-        raw = raw.lower()
-        raw = raw.replace("/legalcode", "/").replace("/uk", "")
-        if not raw.endswith("/"):
-            raw = raw + "/"
-    return LICENSE_SLUG_MAP.get(raw)
-
-
-def test_lookup_license_slug() -> None:
-
-    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
-    assert (
-        lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
-        == "CC-BY"
-    )
-    assert (
-        lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
-        == "CC-0"
-    )
-    assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
-    assert (
-        lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
-        == "CC-BY-NC-SA"
-    )
-    assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
-    assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
-    assert lookup_license_slug("") is None
-    assert lookup_license_slug(None) is None
-
 
 class CrossrefImporter(EntityImporter):
     """
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 441514b8..b310f8bc 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -21,113 +21,19 @@ import langdetect
 import pycountry
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
 from fatcat_tools.transforms import entity_to_dict
 
-from .common import EntityImporter
-
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
-CONTAINER_TYPE_MAP: Dict[str, str] = {
+DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = {
     "Journal": "journal",
     "Series": "journal",
     "Book Series": "book-series",
 }
 
-# The docs/guide should be the canonical home for these mappings; update there
-# first.  Map various datacite type types to CSL-ish types. None means TODO or
-# remove.
-DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
-    "ris": {
-        "THES": "thesis",
-        "SOUND": "song",  # 99.9% maps to citeproc song, so use that (exception: report)
-        "CHAP": "chapter",
-        "FIGURE": "figure",
-        "RPRT": "report",
-        "JOUR": "article-journal",
-        "MPCT": "motion_picture",
-        "GEN": "article-journal",  # GEN consist of 99% article and report, post-weblog, misc - and one dataset
-        "BOOK": "book",
-        "DATA": "dataset",
-        "COMP": "software",
-    },
-    "schemaOrg": {
-        "Dataset": "dataset",
-        "Book": "book",
-        "ScholarlyArticle": "article-journal",
-        "ImageObject": "graphic",
-        "Collection": None,
-        "MediaObject": None,
-        "Event": None,
-        "SoftwareSourceCode": "software",
-        "Chapter": "chapter",
-        "CreativeWork": None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
-        "PublicationIssue": "article",
-        "AudioObject": None,
-        "Thesis": "thesis",
-    },
-    "citeproc": {
-        "article": "article",
-        "article-journal": "article-journal",
-        "article-magazine": "article-magazine",
-        "article-newspaper": "article-newspaper",
-        "bill": "bill",
-        "book": "book",
-        "broadcast": "broadcast",
-        "chapter": "chapter",
-        "dataset": "dataset",
-        "entry-dictionary": "entry-dictionary",
-        "entry-encyclopedia": "entry-encyclopedia",
-        "entry": "entry",
-        "figure": "figure",
-        "graphic": "graphic",
-        "interview": "interview",
-        "legal_case": "legal_case",
-        "legislation": "legislation",
-        "manuscript": "manuscript",
-        "map": "map",
-        "motion_picture": "motion_picture",
-        "musical_score": "musical_score",
-        "pamphlet": "pamphlet",
-        "paper-conference": "paper-conference",
-        "patent": "patent",
-        "personal_communication": "personal_communication",
-        "post": "post",
-        "post-weblog": "post-weblog",
-        "report": "report",
-        "review-book": "review-book",
-        "review": "review",
-        "song": "song",
-        "speech": "speech",
-        "thesis": "thesis",
-        "treaty": "treaty",
-        "webpage": "webpage",
-    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
-    "bibtex": {
-        "phdthesis": "thesis",
-        "inbook": "chapter",
-        "misc": None,
-        "article": "article-journal",
-        "book": "book",
-    },
-    "resourceTypeGeneral": {
-        "Image": "graphic",
-        "Dataset": "dataset",
-        "PhysicalObject": None,
-        "Collection": None,
-        "Text": None,  # "Greyliterature, labnotes, accompanyingmaterials"
-        "Sound": None,
-        "InteractiveResource": None,
-        "Event": None,
-        "Software": "software",
-        "Other": None,
-        "Workflow": None,
-        "Audiovisual": None,
-    },  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
-}
-
 # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
 DATACITE_UNKNOWN_MARKERS: List[str] = [
     "(:unac)",  # temporarily inaccessible
@@ -180,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [
     }
 ]
 
-# TODO(martin): merge this with other maps and lookup functions, eventually.
-LICENSE_SLUG_MAP: Dict[str, str] = {
-    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
-    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
-    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-    "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
-    "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
-    "//onlinelibrary.wiley.com/termsandconditions/": "WILEY",
-    "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
-    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
-    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
-    "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
-    "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
-    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
-    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
-    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
-    "//www.karger.com/Services/SiteLicenses/": "KARGER",
-    "//www.springer.com/tdm/": "SPRINGER-TDM",
-    "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
-    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
-    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
-    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/MIT.json": "MIT",
-    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-}
-
 
 class DataciteImporter(EntityImporter):
     """
@@ -406,8 +275,8 @@ class DataciteImporter(EntityImporter):
         container_name = None
 
         container = attributes.get("container", {}) or {}
-        if container.get("type") in CONTAINER_TYPE_MAP.keys():
-            container_type = CONTAINER_TYPE_MAP.get(container["type"])
+        if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys():
+            container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"])
             if container.get("identifier") and container.get("identifierType") == "ISSN":
                 issn = container.get("identifier")
                 if issn and len(issn) == 8:
@@ -488,7 +357,7 @@ class DataciteImporter(EntityImporter):
         license_extra = []
 
         for lic in attributes.get("rightsList", []):
-            slug = lookup_license_slug(lic.get("rightsUri"))
+            slug = datacite_lookup_license_slug(lic.get("rightsUri"))
             if slug:
                 license_slug = slug
             license_extra.append(lic)
@@ -968,7 +837,7 @@ def contributor_list_contains_contributor(
     return False
 
 
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]:
     """
     Resolve a variety of strings into a some pseudo-canonical form, e.g.
     CC-BY-ND, CC-0, MIT and so on.
@@ -1063,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
             return None
         return "RS-{}".format(name.upper())
 
-    # Fallback to mapped values.
-    raw = raw.lower()
-    raw = raw.strip().replace("http://", "//").replace("https://", "//")
-    if not raw.endswith("/"):
-        raw = raw + "/"
-    return LICENSE_SLUG_MAP.get(raw)
+    # Fallback to generic license lookup
+    return lookup_license_slug(raw)
 
 
 def find_original_language_title(
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index f5c886a2..92dbe574 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Sequence
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.importers.common import MAX_ABSTRACT_LENGTH, EntityImporter
 from fatcat_tools.normal import (
     clean_doi,
     clean_orcid,
@@ -24,9 +24,6 @@ from fatcat_tools.normal import (
     parse_month,
 )
 
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
-
 
 class DoajArticleImporter(EntityImporter):
     def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 9db499a0..3c85132c 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -9,9 +9,7 @@ from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
 
 from fatcat_tools.normal import clean_doi, clean_str
 
-from .common import EntityImporter, make_rel_url
-
-MAX_ABSTRACT_BYTES = 4096
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url
 
 
 class GrobidMetadataImporter(EntityImporter):
@@ -84,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter):
         extra_grobid: Dict[str, Any] = dict()
 
         abstract = obj.get("abstract")
-        if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
+        if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10:
             abobj = fatcat_openapi_client.ReleaseAbstract(
                 mimetype="text/plain", content=clean_str(obj.get("abstract"))
             )
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index c2f650b0..79691c9a 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,7 +8,8 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.normal import LANG_MAP_MARC, clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC
+from fatcat_tools.normal import clean_doi, clean_str
 
 from .common import EntityImporter
 from .crossref import CONTAINER_TYPE_MAP
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3274234f..5bc7a9ff 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,325 +8,16 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.normal import (
+from fatcat_tools.biblio_lookup_tables import (
+    COUNTRY_NAME_MAP,
     LANG_MAP_MARC,
-    clean_doi,
-    clean_issn,
-    clean_pmcid,
-    clean_pmid,
-    clean_str,
+    MONTH_ABBR_MAP,
+    PUBMED_RELEASE_TYPE_MAP,
 )
+from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str
 
 from .common import EntityImporter
 
-# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
-PUBMED_RELEASE_TYPE_MAP = {
-    # Adaptive Clinical Trial
-    "Address": "speech",
-    "Autobiography": "book",
-    # Bibliography
-    "Biography": "book",
-    # Case Reports
-    "Classical Article": "article-journal",
-    # Clinical Conference
-    # Clinical Study
-    # Clinical Trial
-    # Clinical Trial, Phase I
-    # Clinical Trial, Phase II
-    # Clinical Trial, Phase III
-    # Clinical Trial, Phase IV
-    # Clinical Trial Protocol
-    # Clinical Trial, Veterinary
-    # Collected Works
-    # Comparative Study
-    # Congress
-    # Consensus Development Conference
-    # Consensus Development Conference, NIH
-    # Controlled Clinical Trial
-    "Dataset": "dataset",
-    # Dictionary
-    # Directory
-    # Duplicate Publication
-    "Editorial": "editorial",
-    # English Abstract   # doesn't indicate that this is abstract-only
-    # Equivalence Trial
-    # Evaluation Studies
-    # Expression of Concern
-    # Festschrift
-    # Government Document
-    # Guideline
-    "Historical Article": "article-journal",
-    # Interactive Tutorial
-    "Interview": "interview",
-    "Introductory Journal Article": "article-journal",
-    "Journal Article": "article-journal",
-    "Lecture": "speech",
-    "Legal Case": "legal_case",
-    "Legislation": "legislation",
-    "Letter": "letter",
-    # Meta-Analysis
-    # Multicenter Study
-    # News
-    "Newspaper Article": "article-newspaper",
-    # Observational Study
-    # Observational Study, Veterinary
-    # Overall
-    # Patient Education Handout
-    # Periodical Index
-    # Personal Narrative
-    # Portrait
-    # Practice Guideline
-    # Pragmatic Clinical Trial
-    # Publication Components
-    # Publication Formats
-    # Publication Type Category
-    # Randomized Controlled Trial
-    # Research Support, American Recovery and Reinvestment Act
-    # Research Support, N.I.H., Extramural
-    # Research Support, N.I.H., Intramural
-    # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
-    # Research Support, U.S. Gov't, P.H.S.
-    # Review     # in the "literature review" sense, not "product review"
-    # Scientific Integrity Review
-    # Study Characteristics
-    # Support of Research
-    # Systematic Review
-    "Technical Report": "report",
-    # Twin Study
-    # Validation Studies
-    # Video-Audio Media
-    # Webcasts
-}
-
-MONTH_ABBR_MAP = {
-    "Jan": 1,
-    "01": 1,
-    "Feb": 2,
-    "02": 2,
-    "Mar": 3,
-    "03": 3,
-    "Apr": 4,
-    "04": 4,
-    "May": 5,
-    "05": 5,
-    "Jun": 6,
-    "06": 6,
-    "Jul": 7,
-    "07": 7,
-    "Aug": 8,
-    "08": 8,
-    "Sep": 9,
-    "09": 9,
-    "Oct": 10,
-    "10": 10,
-    "Nov": 11,
-    "11": 11,
-    "Dec": 12,
-    "12": 12,
-}
-
-# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
-COUNTRY_NAME_MAP = {
-    "Afghanistan": "af",
-    "Albania": "al",
-    "Algeria": "dz",
-    "Andorra": "ad",
-    "Angola": "ao",
-    "Antigua and Barbuda": "ag",
-    "Argentina": "ar",
-    "Armenia": "am",
-    "Australia": "au",
-    "Austria": "at",
-    "Azerbaijan": "az",
-    "Bahamas": "bs",
-    "Bahrain": "bh",
-    "Bangladesh": "bd",
-    "Barbados": "bb",
-    "Belarus": "by",
-    "Belgium": "be",
-    "Belize": "bz",
-    "Benin": "bj",
-    "Bhutan": "bt",
-    "Bolivia": "bo",
-    "Bosnia and Herzegowina": "ba",
-    "Botswana": "bw",
-    "Brazil": "br",
-    "Brunei Darussalam": "bn",
-    "Bulgaria": "bg",
-    "Burkina Faso": "bf",
-    "Burundi": "bi",
-    "Cambodia": "kh",
-    "Cameroon": "cm",
-    "Canada": "ca",
-    "Cape Verde": "cv",
-    "Central African Republic": "cf",
-    "Chad": "td",
-    "Chile": "cl",
-    "China": "cn",
-    "Colombia": "co",
-    "Comoros": "km",
-    "Congo, Democratic Republic": "cd",
-    "Congo, People’s Republic": "cg",
-    "Costa Rica": "cr",
-    "Cote d'Ivoire": "ci",
-    "Croatia (Local Name: Hrvatska)": "hr",
-    "Cuba": "cu",
-    "Cyprus": "cy",
-    "Czech Republic": "cz",
-    "Denmark": "dk",
-    "Djibouti": "dj",
-    "Dominica": "dm",
-    "Dominican Republic": "do",
-    "East Timor": "tl",
-    "Ecuador": "ec",
-    "El Salvador": "sv",
-    "Equatorial Guinea": "gq",
-    "Eritrea": "er",
-    "Estonia": "ee",
-    "Ethiopia": "et",
-    "Fiji": "fj",
-    "Finland": "fi",
-    "France": "fr",
-    "Gabon": "ga",
-    "Gambia": "gm",
-    "Georgia": "ge",
-    "Germany": "de",
-    "Ghana": "gh",
-    "Greece": "gr",
-    "Greenland": "gl",
-    "Grenada": "gd",
-    "Guatemala": "gt",
-    "Guinea": "gn",
-    "Guinea-Bissau": "gw",
-    "Guyana": "gy",
-    "Haiti": "ht",
-    "Honduras": "hn",
-    "Hong Kong": "hk",
-    "Hungary": "hu",
-    "Iceland": "is",
-    "India": "in",
-    "Indonesia": "id",
-    "Iran": "ir",
-    "Iraq": "iq",
-    "Ireland": "ie",
-    "Israel": "il",
-    "Italy": "it",
-    "Jamaica": "jm",
-    "Japan": "jp",
-    "Jordan": "jo",
-    "Kazakhstan": "kz",
-    "Kenya": "ke",
-    "Kiribati": "ki",
-    "Korea, Democratic People's Republic": "kp",
-    "Korea, Republic": "kr",
-    "Kuwait": "kw",
-    "Kyrgyzstan": "kg",
-    "Laos": "la",
-    "Latvia": "lv",
-    "Lebanon": "lb",
-    "Lesotho": "ls",
-    "Liberia": "lr",
-    "Libya": "ly",
-    "Liechtenstein": "li",
-    "Lithuania": "lt",
-    "Luxembourg": "lu",
-    "Macedonia": "mk",
-    "Madagascar": "mg",
-    "Malawi": "mw",
-    "Malaysia": "my",
-    "Maldives": "mv",
-    "Mali": "ml",
-    "Malta": "mt",
-    "Marshall Islands": "mh",
-    "Mauritania": "mr",
-    "Mauritius": "mu",
-    "Mexico": "mx",
-    "Micronesia": "fm",
-    "Moldova": "md",
-    "Monaco": "mc",
-    "Mongolia": "mn",
-    "Morocco": "ma",
-    "Mozambique": "mz",
-    "Myanmar": "mm",
-    "Namibia": "na",
-    "Nauru": "nr",
-    "Nepal": "np",
-    "Netherlands": "nl",
-    "New Zealand": "nz",
-    "Nicaragua": "ni",
-    "Niger": "ne",
-    "Nigeria": "ng",
-    "Norway": "no",
-    "Oman": "om",
-    "Pakistan": "pk",
-    "Palau": "pw",
-    "Panama": "pa",
-    "Papua New Guinea": "pg",
-    "Paraguay": "py",
-    "Peru": "pe",
-    "Philippines": "ph",
-    "Poland": "pl",
-    "Portugal": "pt",
-    "Puerto Rico": "pr",
-    "Qatar": "qa",
-    "Romania": "ro",
-    "Russian Federation": "ru",
-    "Rwanda": "rw",
-    "Saint Kitts and Nevis": "kn",
-    "Saint Lucia": "lc",
-    "Saint Vincent and the Grenadines": "vc",
-    "Samoa": "ws",
-    "San Marino": "sm",
-    "Sao Tome and Príncipe": "st",
-    "Saudi Arabia": "sa",
-    "Senegal": "sn",
-    "Serbia and Montenegro": "cs",
-    "Seychelles": "sc",
-    "Sierra Leone": "sl",
-    "Singapore": "sg",
-    "Slovakia (Slovak Republic)": "sk",
-    "Slovenia": "si",
-    "Solomon Islands": "sb",
-    "Somalia": "so",
-    "South Africa": "za",
-    "Spain": "es",
-    "Sri Lanka": "lk",
-    "Sudan": "sd",
-    "Suriname": "sr",
-    "Swaziland": "sz",
-    "Sweden": "se",
-    "Switzerland": "ch",
-    "Syrian Arab Republic": "sy",
-    "Taiwan": "tw",
-    "Tajikistan": "tj",
-    "Tanzania": "tz",
-    "Tanzania": "tz",
-    "Thailand": "th",
-    "Togo": "tg",
-    "Tonga": "to",
-    "Trinidad and Tobago": "tt",
-    "Tunisia": "tn",
-    "Turkey": "tr",
-    "Turkmenistan": "tm",
-    "Tuvalu": "tv",
-    "Uganda": "ug",
-    "Ukraine": "ua",
-    "United Arab Emirates": "ae",
-    "United Kingdom": "gb",
-    "United States": "us",
-    "Uruguay": "uy",
-    # Additions from running over large files
-    "Bosnia and Herzegovina": "ba",
-    # "International"
-    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn
-    "Russia (Federation)": "ru",
-    "Scotland": "gb",
-    "England": "gb",
-    "Korea (South)": "kr",
-    "Georgia (Republic)": "ge",
-    "Egypt": "eg",
-}
-
 
 class PubmedImporter(EntityImporter):
     """
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 0d2c84ce..fc80411c 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -12,6 +12,8 @@ import ftfy
 import langdetect
 import pycountry
 
+from .biblio_lookup_tables import LICENSE_SLUG_MAP
+
 DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
 
 
@@ -606,84 +608,35 @@ def test_parse_country_name() -> None:
     assert parse_country_name("Japan") == "jp"
 
 
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
-    "afr": "af",
-    "alb": "sq",
-    "amh": "am",
-    "ara": "ar",
-    "arm": "hy",
-    "aze": "az",
-    "ben": "bn",
-    "bos": "bs",
-    "bul": "bg",
-    "cat": "ca",
-    "chi": "zh",
-    "cze": "cs",
-    "dan": "da",
-    "dut": "nl",
-    "eng": "en",
-    "epo": "eo",
-    "est": "et",
-    "fin": "fi",
-    "fre": "fr",
-    "geo": "ka",
-    "ger": "de",
-    "gla": "gd",
-    "gre": "el",
-    "heb": "he",
-    "hin": "hi",
-    "hrv": "hr",
-    "hun": "hu",
-    "ice": "is",
-    "ind": "id",
-    "ita": "it",
-    "jpn": "ja",
-    "kin": "rw",
-    "kor": "ko",
-    "lat": "la",
-    "lav": "lv",
-    "lit": "lt",
-    "mac": "mk",
-    "mal": "ml",
-    "mao": "mi",
-    "may": "ms",
-    "nor": "no",
-    "per": "fa",
-    "per": "fa",
-    "pol": "pl",
-    "por": "pt",
-    "pus": "ps",
-    "rum": "ro",
-    "rus": "ru",
-    "san": "sa",
-    "slo": "sk",
-    "slv": "sl",
-    "spa": "es",
-    "srp": "sr",
-    "swe": "sv",
-    "tha": "th",
-    "tur": "tr",
-    "ukr": "uk",
-    "urd": "ur",
-    "vie": "vi",
-    "wel": "cy",
-    # additions
-    "gle": "ga",  # "Irish" (Gaelic)
-    "jav": "jv",  # Javanese
-    "welsh": "cy",  # Welsh
-    "oci": "oc",  # Occitan
-    # Don't have ISO 639-1 codes
-    "grc": "el",  # Ancient Greek; map to modern greek
-    "map": None,  # Austronesian (collection)
-    "syr": None,  # Syriac, Modern
-    "gem": None,  # Old Saxon
-    "non": None,  # Old Norse
-    "emg": None,  # Eastern Meohang
-    "neg": None,  # Negidal
-    "mul": None,  # Multiple languages
-    "und": None,  # Undetermined
-}
+def lookup_license_slug(raw: Optional[str]) -> Optional[str]:  # map a license URL string to a slug via LICENSE_SLUG_MAP
+    if not raw:  # None or empty string: nothing to look up
+        return None
+    raw = raw.strip().replace("http://", "//").replace("https://", "//")  # trim; make the URL scheme-relative
+    if "creativecommons.org" in raw.lower():  # extra normalization below applies to CC URLs only
+        raw = raw.lower()
+        raw = raw.replace("/legalcode", "/").replace("/uk", "")  # "/legalcode" becomes "/"; "/uk" is dropped
+        if not raw.endswith("/"):
+            raw = raw + "/"  # CC URLs are always looked up with a trailing slash
+    return LICENSE_SLUG_MAP.get(raw)  # NOTE(review): non-CC URLs keep case/trailing slash as given; confirm table keys match
+
+
+def test_lookup_license_slug() -> None:
+    """Spot-check lookup_license_slug on Creative Commons URL variants, a non-CC URL, and empty input."""
+    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
+    assert (
+        lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
+        == "CC-BY"
+    )
+    assert (
+        lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
+        == "CC-0"
+    )
+    assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"  # no trailing slash in input
+    assert (
+        lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
+        == "CC-BY-NC-SA"
+    )
+    assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
+    assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None  # similar-looking URL with no mapping
+    assert lookup_license_slug("") is None
+    assert lookup_license_slug(None) is None
author	Bryan Newbold <bnewbold@robocracy.org>	2021-11-10 13:52:39 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2021-11-10 13:52:43 -0800
commit	ddc757bc1d5c610f42e9f5f10a4f060f517b66ca (patch)
tree	86ccdef998bd3da3910cfe8fb9f2177b58a664a0
parent	16e9979a6f347b49764c1141209e84083ea81057 (diff)
download	fatcat-ddc757bc1d5c610f42e9f5f10a4f060f517b66ca.tar.gz fatcat-ddc757bc1d5c610f42e9f5f10a4f060f517b66ca.zip