Merge branch 'bnewbold-import-refactors' into 'master'

import refactors and deprecations

Some of these are from old stale branches (the datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes. The Datacite-specific stuff could use review here.

Remove unused/deprecated/dead code:

- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, which was only used for initial bulk imports (and maybe should not have been used)

Refactors:

- moved a number of large data structures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all of them, just the ones that were either generic or very large (making the code hard to read)
- shuffled around relative imports and some function names ("clean_str" vs. "clean")

Some actual behavioral changes:

- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs; that was causing more harm than help (some DOIs do actually have double-slashes!)
- remove some excess metadata from datacite 'extra' fields
author: bnewbold <bnewbold@archive.org> 2021-11-11 01:12:18 +0000
committer: bnewbold <bnewbold@archive.org> 2021-11-11 01:12:18 +0000
commit: 6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree: 1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/importers
parent: 7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent: 6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
download: fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz
fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip
16 files changed, 146 insertions, 1380 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 06ecfd58..654be2e9 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -13,10 +13,8 @@ To run an import you combine two classes; one each of:
 
 from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
 from .arxiv import ArxivRawImporter
-from .cdl_dash_dat import auto_cdl_dash_dat
 from .chocula import ChoculaImporter
 from .common import (
-    LANG_MAP_MARC,
     Bs4XmlFileListPusher,
     Bs4XmlFilePusher,
     Bs4XmlLargeFilePusher,
@@ -28,11 +26,8 @@ from .common import (
     KafkaJsonPusher,
     LinePusher,
     SqlitePusher,
-    clean,
-    is_cjk,
-    make_kafka_consumer,
 )
-from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug
+from .crossref import CrossrefImporter
 from .datacite import DataciteImporter
 from .dblp_container import DblpContainerImporter
 from .dblp_release import DblpReleaseImporter
@@ -55,4 +50,3 @@ from .matched import MatchedImporter
 from .orcid import OrcidImporter
 from .pubmed import PubmedImporter
 from .shadow import ShadowLibraryImporter
-from .wayback_static import auto_wayback_static
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index b4a4d9ed..92289bb3 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, FileEntity
 
-from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
+from fatcat_tools.normal import b32_hex
+
+from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, make_rel_url
 
 ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
 
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 1d50dd9a..dd2c2284 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -9,6 +9,8 @@ from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 from pylatexenc.latex2text import LatexNodes2Text
 
+from fatcat_tools.normal import clean_doi
+
 from .common import EntityImporter
 from .crossref import lookup_license_slug
 
@@ -127,8 +129,8 @@ class ArxivRawImporter(EntityImporter):
         base_id = metadata.id.string
         doi = None
         if metadata.doi and metadata.doi.string:
-            doi = metadata.doi.string.lower().split()[0].strip()
-            if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
+            doi = clean_doi(metadata.doi.string.lower().split()[0].strip())
+            if doi and not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
                 sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                 doi = None
         title = latex_to_text(metadata.title.get_text().replace("\n", " "))
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index 1a4114a0..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    FilesetEntity,
-    FilesetFile,
-    ReleaseAbstract,
-    ReleaseContrib,
-    ReleaseEntity,
-    ReleaseExtIds,
-)
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
-def single_file(prefix: str, path: str) -> FilesetFile:
-
-    full = prefix + path
-    size_bytes = os.stat(full).st_size
-
-    hashes = [
-        hashlib.md5(),
-        hashlib.sha1(),
-        hashlib.sha256(),
-    ]
-    with open(full, "rb") as fp:
-        while True:
-            data = fp.read(2 ** 20)
-            if not data:
-                break
-            for h in hashes:
-                h.update(data)
-    mime = magic.Magic(mime=True).from_file(full)
-    if mime == "application/octet-stream":
-        # magic apparently isn't that great; try using filename as well
-        guess = mimetypes.guess_type(full)[0]
-        if guess:
-            mime = guess
-
-    fsf = FilesetFile(
-        path=path,
-        size=size_bytes,
-        md5=hashes[0].hexdigest(),
-        sha1=hashes[1].hexdigest(),
-        sha256=hashes[2].hexdigest(),
-        extra=dict(mimetype=mime),
-    )
-    return fsf
-
-
-def make_manifest(base_dir: str) -> List[FilesetFile]:
-    manifest = []
-    for root, dirs, files in os.walk(base_dir):
-        for f in files:
-            manifest.append(single_file(root, f))
-    return manifest
-
-
-def cdl_dash_release(
-    meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
-) -> ReleaseEntity:
-
-    if not extra:
-        extra = dict()
-
-    assert meta["identifier"]["type"] == "DOI"
-    doi = meta["identifier"]["value"].lower()
-    assert doi.startswith("10.")
-
-    ark_id = None
-    for extid in meta.get("alternativeIdentifiers", []):
-        if extid["value"].startswith("ark:"):
-            ark_id = extid["value"]
-    assert ark_id
-
-    license_slug = lookup_license_slug(meta["rights"]["uri"])
-
-    abstracts = []
-    for desc in meta["descriptions"]:
-        if desc["type"] == "abstract":
-            abstracts.append(
-                ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
-            )
-            # print(abstracts)
-
-    contribs = []
-    for creator in meta["creator"]:
-        contribs.append(
-            ReleaseContrib(
-                given_name=creator["given"],
-                surname=creator["family"],
-                # sorry everybody
-                raw_name="{} {}".format(creator["given"], creator["family"]),
-                raw_affiliation=creator.get("affiliation"),
-                role="author",  # presumably, for these datasets?
-            )
-        )
-
-    r = ReleaseEntity(
-        ext_ids=ReleaseExtIds(
-            doi=doi,
-            ark=ark_id,
-        ),
-        title=clean(meta["title"], force_xml=True),
-        publisher=clean(meta["publisher"]),
-        release_year=int(meta["publicationYear"]),
-        release_type="dataset",
-        license_slug=license_slug,
-        contribs=contribs,
-        abstracts=abstracts or None,
-        extra=extra,
-    )
-    return r
-
-
-def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
-
-    if dat_path.endswith("/"):
-        dat_path = dat_path[:-1]
-    dat_discovery = dat_path
-    extra = dict()
-    assert len(dat_discovery) == 64
-
-    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
-        meta_dict = json.loads(fp.read())
-
-    release = cdl_dash_release(meta_dict)
-    ark_id = release.extra["ark_id"]
-
-    dash_version = None
-    # really crude XML parse-out
-    with open(dat_path + "/stash-wrapper.xml", "r") as fp:
-        for line in fp:
-            line = line.strip()
-            if line.startswith("<st:version_number>"):
-                dash_version = int(line[19:].split("<")[0])
-    assert dash_version is not None
-    extra["cdl_dash"] = dict(version=dash_version)
-    release.extra["cdl_dash"] = dict(version=dash_version)
-
-    manifest = make_manifest(dat_path + "/files/")
-
-    bundle_url = dict(
-        url="https://merritt.cdlib.org/u/{}/{}".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo-bundle",
-    )
-    repo_url = dict(
-        url="https://merritt.cdlib.org/d/{}/{}/".format(
-            urllib.parse.quote(ark_id, safe=""), dash_version
-        ),
-        rel="repo",
-    )
-    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
-    fs = FilesetEntity(
-        urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
-    )
-    return (release, fs)
-
-
-def auto_cdl_dash_dat(
-    api: ApiClient,
-    dat_path: str,
-    release_id: Optional[str] = None,
-    editgroup_id: Optional[str] = None,
-) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
-
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    (release, fileset) = make_release_fileset(dat_path)
-
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-
-    if not release_id and release.ext_ids.doi:
-        try:
-            r = api.lookup_release(doi=release.ext_ids.doi)
-            release_id = r.ident
-        except fatcat_openapi_client.rest.ApiException:
-            pass
-    if not release_id:
-        edit = api.create_release(eg.editgroup_id, release)
-        release_id = edit.ident
-
-    release = api.get_release(release_id, expand="filesets")
-    if len(release.filesets):
-        print("A fileset already exists for release {}".format(release.ident))
-        return (None, None, None)
-
-    fileset.release_ids = [release.ident]
-    edit = api.create_fileset(eg.editgroup_id, fileset)
-    fileset = api.get_fileset(edit.ident)
-    return (editgroup_id, release, fileset)
-
-
-if __name__ == "__main__":
-    # pass this a discovery key that has been cloned to the local directory
-    print(make_release_fileset(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 842c7853..c44fec3b 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ContainerEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
 
 
 class ChoculaImporter(EntityImporter):
@@ -40,7 +42,7 @@ class ChoculaImporter(EntityImporter):
         returns a ContainerEntity (or None if invalid or couldn't parse)
         """
 
-        name = clean(row.get("name"))
+        name = clean_str(row.get("name"))
         if not name:
             # Name is required (by schema)
             return None
@@ -85,7 +87,7 @@ class ChoculaImporter(EntityImporter):
             ident=row["ident"],
             name=name,
             container_type=container_type,
-            publisher=clean(row.get("publisher")),
+            publisher=clean_str(row.get("publisher")),
             wikidata_qid=row.get("wikidata_qid"),
             extra=extra,
         )
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2ec6efda..e2157ee5 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,74 +27,14 @@ from fatcat_openapi_client import (
 from fatcat_openapi_client.rest import ApiException
 from fuzzycat.matching import match_release_fuzzy
 
-# TODO: refactor so remove need for this (re-imports for backwards compatibility)
-from fatcat_tools.normal import is_cjk  # noqa: F401
-from fatcat_tools.normal import LANG_MAP_MARC, b32_hex  # noqa: F401
-from fatcat_tools.normal import clean_str as clean  # noqa: F401
+from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP
+from fatcat_tools.normal import clean_doi
 from fatcat_tools.transforms import entity_to_dict
 
 DATE_FMT: str = "%Y-%m-%d"
 SANE_MAX_RELEASES: int = 200
 SANE_MAX_URLS: int = 100
-
-DOMAIN_REL_MAP: Dict[str, str] = {
-    "archive.org": "archive",
-    # LOCKSS, Portico, DuraSpace, etc would also be "archive"
-    "arxiv.org": "repository",
-    "babel.hathitrust.org": "repository",
-    "cds.cern.ch": "repository",
-    "deepblue.lib.umich.edu": "repository",
-    "europepmc.org": "repository",
-    "hal.inria.fr": "repository",
-    "scielo.isciii.es": "repository",
-    "www.dtic.mil": "repository",
-    "www.jstage.jst.go.jp": "repository",
-    "www.jstor.org": "repository",
-    "www.ncbi.nlm.nih.gov": "repository",
-    "ftp.ncbi.nlm.nih.gov": "repository",
-    "www.scielo.br": "repository",
-    "www.scielo.cl": "repository",
-    "www.scielo.org.mx": "repository",
-    "zenodo.org": "repository",
-    "www.biorxiv.org": "repository",
-    "www.medrxiv.org": "repository",
-    "citeseerx.ist.psu.edu": "aggregator",
-    "publisher-connector.core.ac.uk": "aggregator",
-    "core.ac.uk": "aggregator",
-    "static.aminer.org": "aggregator",
-    "aminer.org": "aggregator",
-    "pdfs.semanticscholar.org": "aggregator",
-    "semanticscholar.org": "aggregator",
-    "www.semanticscholar.org": "aggregator",
-    "academic.oup.com": "publisher",
-    "cdn.elifesciences.org": "publisher",
-    "cell.com": "publisher",
-    "dl.acm.org": "publisher",
-    "downloads.hindawi.com": "publisher",
-    "elifesciences.org": "publisher",
-    "iopscience.iop.org": "publisher",
-    "journals.plos.org": "publisher",
-    "link.springer.com": "publisher",
-    "onlinelibrary.wiley.com": "publisher",
-    "works.bepress.com": "publisher",
-    "www.biomedcentral.com": "publisher",
-    "www.cell.com": "publisher",
-    "www.nature.com": "publisher",
-    "www.pnas.org": "publisher",
-    "www.tandfonline.com": "publisher",
-    "www.frontiersin.org": "publisher",
-    "www.degruyter.com": "publisher",
-    "www.mdpi.com": "publisher",
-    "www.ahajournals.org": "publisher",
-    "ehp.niehs.nih.gov": "publisher",
-    "journals.tsu.ru": "publisher",
-    "www.cogentoa.com": "publisher",
-    "www.researchgate.net": "academicsocial",
-    "academia.edu": "academicsocial",
-    "wayback.archive-it.org": "webarchive",
-    "web.archive.org": "webarchive",
-    "archive.is": "webarchive",
-}
+MAX_ABSTRACT_LENGTH: int = 2048
 
 
 def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]:
@@ -342,8 +282,7 @@ class EntityImporter:
         return creator_id
 
     def is_doi(self, doi: str) -> bool:
-        # TODO: replace with clean_doi() from fatcat_tools.normal
-        return doi.startswith("10.") and doi.count("/") >= 1
+        return clean_doi(doi) is not None
 
     def lookup_doi(self, doi: str) -> Optional[str]:
         """Caches calls to the doi lookup API endpoint in a local dict
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index a41e2bf5..52bd7465 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,11 +1,13 @@
 import datetime
-import sqlite3
 from typing import Any, Dict, List, Optional, Sequence
 
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
+
+from .common import EntityImporter
 
 # The docs/guide should be the canonical home for these mappings; update there
 # first
@@ -32,104 +34,11 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
     "standard": "standard",
 }
 
-CONTAINER_TYPE_MAP: Dict[str, str] = {
-    "article-journal": "journal",
-    "paper-conference": "conference",
-    "book": "book-series",
-}
-
-# These are based, informally, on sorting the most popular licenses found in
-# Crossref metadata. There were over 500 unique strings and only a few most
-# popular are here; many were variants of the CC URLs. Would be useful to
-# normalize CC licenses better.
-# The current norm is to only add license slugs that are at least partially OA.
-LICENSE_SLUG_MAP: Dict[str, str] = {
-    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
-    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
-    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
-    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
-    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
-    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
-    "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
-    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
-    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
-    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
-    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
-    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
-    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
-    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
-    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/MIT.json": "MIT",
-    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.karger.com/Services/SiteLicenses": "KARGER",
-    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
-    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
-    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
-    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
-    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
-    "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
-    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
-    # //www.springer.com/tdm doesn't seem like a license
-    # //iopscience.iop.org/page/copyright is closed
-    # //www.acm.org/publications/policies/copyright_policy#Background is closed
-    # //rsc.li/journals-terms-of-use is closed for vor (am open)
-    # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
-    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-}
-
-
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
-    if not raw:
-        return None
-    raw = raw.strip().replace("http://", "//").replace("https://", "//")
-    if "creativecommons.org" in raw.lower():
-        raw = raw.lower()
-        raw = raw.replace("/legalcode", "/").replace("/uk", "")
-        if not raw.endswith("/"):
-            raw = raw + "/"
-    return LICENSE_SLUG_MAP.get(raw)
-
-
-def test_lookup_license_slug() -> None:
-
-    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
-    assert (
-        lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
-        == "CC-BY"
-    )
-    assert (
-        lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
-        == "CC-0"
-    )
-    assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
-    assert (
-        lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
-        == "CC-BY-NC-SA"
-    )
-    assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
-    assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
-    assert lookup_license_slug("") is None
-    assert lookup_license_slug(None) is None
-
 
 class CrossrefImporter(EntityImporter):
     """
     Importer for Crossref metadata.
 
-    Can use a local sqlite3 file for faster "external identifier" lookups
-
     See https://github.com/CrossRef/rest-api-doc for JSON schema notes
     """
 
@@ -150,50 +59,8 @@ class CrossrefImporter(EntityImporter):
         )
 
         self.create_containers: bool = kwargs.get("create_containers", True)
-        extid_map_file = kwargs.get("extid_map_file")
-        self.extid_map_db: Optional[Any] = None
-        if extid_map_file:
-            db_uri = "file:{}?mode=ro".format(extid_map_file)
-            print("Using external ID map: {}".format(db_uri))
-            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
-        else:
-            print("Not using external ID map")
-
         self.read_issn_map_file(issn_map_file)
 
-    def lookup_ext_ids(self, doi: str) -> Optional[Any]:
-        if self.extid_map_db is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = self.extid_map_db.execute(
-            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
-        ).fetchone()
-        if row is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = [str(cell or "") or None for cell in row]
-        return dict(
-            core_id=row[0],
-            pmid=row[1],
-            pmcid=row[2],
-            wikidata_qid=row[3],
-            # TODO:
-            arxiv_id=None,
-            jstor_id=None,
-        )
-
     def map_release_type(self, crossref_type: str) -> Optional[str]:
         return CROSSREF_TYPE_MAP.get(crossref_type)
 
@@ -275,21 +142,21 @@ class CrossrefImporter(EntityImporter):
                     if len(affiliation_list) > 1:
                         # note: affiliation => more_affiliations
                         extra["more_affiliations"] = [
-                            clean(a["name"]) for a in affiliation_list[1:]
+                            clean_str(a["name"]) for a in affiliation_list[1:]
                         ]
                 if am.get("sequence") and am.get("sequence") != "additional":
-                    extra["seq"] = clean(am.get("sequence"))
+                    extra["seq"] = clean_str(am.get("sequence"))
                 assert ctype in ("author", "editor", "translator")
-                raw_name = clean(raw_name)
+                raw_name = clean_str(raw_name)
                 # TODO: what if 'raw_name' is None?
                 contribs.append(
                     ReleaseContrib(
                         creator_id=creator_id,
                         index=index,
                         raw_name=raw_name,
-                        given_name=clean(am.get("given")),
-                        surname=clean(am.get("family")),
-                        raw_affiliation=clean(raw_affiliation),
+                        given_name=clean_str(am.get("given")),
+                        surname=clean_str(am.get("family")),
+                        raw_affiliation=clean_str(raw_affiliation),
                         role=ctype,
                         extra=extra or None,
                     )
@@ -306,11 +173,11 @@ class CrossrefImporter(EntityImporter):
         container_id = None
         if issnl:
             container_id = self.lookup_issnl(issnl)
-        publisher = clean(obj.get("publisher"))
+        publisher = clean_str(obj.get("publisher"))
 
         container_name = obj.get("container-title")
         if container_name:
-            container_name = clean(container_name[0], force_xml=True)
+            container_name = clean_str(container_name[0], force_xml=True)
         if not container_name:
             container_name = None
         if (
@@ -366,7 +233,7 @@ class CrossrefImporter(EntityImporter):
                 ref_extra["journal-title"] = rm["journal-title"]
             if rm.get("DOI"):
                 ref_extra["doi"] = rm.get("DOI").lower()
-            author = clean(rm.get("author"))
+            author = clean_str(rm.get("author"))
             if author:
                 ref_extra["authors"] = [author]
             for k in (
@@ -390,8 +257,8 @@ class CrossrefImporter(EntityImporter):
                 "series-title",
                 "volume-title",
             ):
-                if clean(rm.get(k)):
-                    ref_extra[k] = clean(rm[k])
+                if clean_str(rm.get(k)):
+                    ref_extra[k] = clean_str(rm[k])
             refs.append(
                 fatcat_openapi_client.ReleaseRef(
                     index=i,
@@ -399,9 +266,9 @@ class CrossrefImporter(EntityImporter):
                     target_release_id=None,
                     key=key,
                     year=year,
-                    container_name=clean(ref_container_name),
-                    title=clean(rm.get("article-title")),
-                    locator=clean(rm.get("first-page")),
+                    container_name=clean_str(ref_container_name),
+                    title=clean_str(rm.get("article-title")),
+                    locator=clean_str(rm.get("first-page")),
                     # TODO: just dump JSON somewhere here?
                     extra=ref_extra or None,
                 )
@@ -409,7 +276,7 @@ class CrossrefImporter(EntityImporter):
 
         # abstracts
         abstracts = []
-        abstract = clean(obj.get("abstract"))
+        abstract = clean_str(obj.get("abstract"))
         if abstract and len(abstract) > 10:
             abstracts.append(
                 fatcat_openapi_client.ReleaseAbstract(
@@ -430,9 +297,9 @@ class CrossrefImporter(EntityImporter):
                 if type(val) == list:
                     val = val[0]
                 if type(val) == str:
-                    val = clean(val)
+                    val = clean_str(val)
                     if val:
-                        extra[key] = clean(val)
+                        extra[key] = clean_str(val)
                 else:
                     extra[key] = val
         # crossref-nested extra keys
@@ -440,14 +307,14 @@ class CrossrefImporter(EntityImporter):
             val = obj.get(key)
             if val:
                 if type(val) == str:
-                    extra_crossref[key] = clean(val)
+                    extra_crossref[key] = clean_str(val)
                 else:
                     extra_crossref[key] = val
         if license_extra:
             extra_crossref["license"] = license_extra
 
         if len(obj["title"]) > 1:
-            aliases = [clean(t) for t in obj["title"][1:]]
+            aliases = [clean_str(t) for t in obj["title"][1:]]
             aliases = [t for t in aliases if t]
             if aliases:
                 extra["aliases"] = aliases
@@ -473,9 +340,6 @@ class CrossrefImporter(EntityImporter):
             # unknown
             release_stage = None
 
-        # external identifiers
-        extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {}
-
         # filter out unreasonably huge releases
         if len(abstracts) > 100:
             self.counts["skip-huge-abstracts"] += 1
@@ -505,19 +369,24 @@ class CrossrefImporter(EntityImporter):
         if obj.get("original-title"):
             ot = obj.get("original-title")
             if ot is not None:
-                original_title = clean(ot[0], force_xml=True)
+                original_title = clean_str(ot[0], force_xml=True)
 
         title: Optional[str] = None
         if obj.get("title"):
-            title = clean(obj["title"][0], force_xml=True)
+            title = clean_str(obj["title"][0], force_xml=True)
             if not title or len(title) <= 1:
                 # title can't be just a single character
                 self.counts["skip-blank-title"] += 1
                 return None
 
+        doi = clean_doi(obj["DOI"].lower())
+        if not doi:
+            self.counts["skip-bad-doi"] += 1
+            return None
+
         subtitle = None
         if obj.get("subtitle"):
-            subtitle = clean(obj["subtitle"][0], force_xml=True)
+            subtitle = clean_str(obj["subtitle"][0], force_xml=True)
             if not subtitle or len(subtitle) <= 1:
                 # subtitle can't be just a single character
                 subtitle = None
@@ -537,19 +406,13 @@ class CrossrefImporter(EntityImporter):
             release_year=release_year,
             publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
-                doi=obj["DOI"].lower(),
-                pmid=extids["pmid"],
-                pmcid=extids["pmcid"],
-                wikidata_qid=extids["wikidata_qid"],
+                doi=doi,
                 isbn13=isbn13,
-                core=extids["core_id"],
-                arxiv=extids["arxiv_id"],
-                jstor=extids["jstor_id"],
             ),
-            volume=clean(obj.get("volume")),
-            issue=clean(obj.get("issue")),
-            pages=clean(obj.get("page")),
-            language=clean(obj.get("language")),
+            volume=clean_str(obj.get("volume")),
+            issue=clean_str(obj.get("issue")),
+            pages=clean_str(obj.get("page")),
+            language=clean_str(obj.get("language")),
             license_slug=license_slug,
             extra=extra or None,
             abstracts=abstracts or None,
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d5622960..b310f8bc 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -12,7 +12,6 @@ import collections
 import datetime
 import json
 import re
-import sqlite3
 import sys
 from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
 
@@ -22,113 +21,19 @@ import langdetect
 import pycountry
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
 from fatcat_tools.transforms import entity_to_dict
 
-from .common import EntityImporter, clean
-
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
-CONTAINER_TYPE_MAP: Dict[str, str] = {
+DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = {
     "Journal": "journal",
     "Series": "journal",
     "Book Series": "book-series",
 }
 
-# The docs/guide should be the canonical home for these mappings; update there
-# first.  Map various datacite type types to CSL-ish types. None means TODO or
-# remove.
-DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
-    "ris": {
-        "THES": "thesis",
-        "SOUND": "song",  # 99.9% maps to citeproc song, so use that (exception: report)
-        "CHAP": "chapter",
-        "FIGURE": "figure",
-        "RPRT": "report",
-        "JOUR": "article-journal",
-        "MPCT": "motion_picture",
-        "GEN": "article-journal",  # GEN consist of 99% article and report, post-weblog, misc - and one dataset
-        "BOOK": "book",
-        "DATA": "dataset",
-        "COMP": "software",
-    },
-    "schemaOrg": {
-        "Dataset": "dataset",
-        "Book": "book",
-        "ScholarlyArticle": "article-journal",
-        "ImageObject": "graphic",
-        "Collection": None,
-        "MediaObject": None,
-        "Event": None,
-        "SoftwareSourceCode": "software",
-        "Chapter": "chapter",
-        "CreativeWork": None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
-        "PublicationIssue": "article",
-        "AudioObject": None,
-        "Thesis": "thesis",
-    },
-    "citeproc": {
-        "article": "article",
-        "article-journal": "article-journal",
-        "article-magazine": "article-magazine",
-        "article-newspaper": "article-newspaper",
-        "bill": "bill",
-        "book": "book",
-        "broadcast": "broadcast",
-        "chapter": "chapter",
-        "dataset": "dataset",
-        "entry-dictionary": "entry-dictionary",
-        "entry-encyclopedia": "entry-encyclopedia",
-        "entry": "entry",
-        "figure": "figure",
-        "graphic": "graphic",
-        "interview": "interview",
-        "legal_case": "legal_case",
-        "legislation": "legislation",
-        "manuscript": "manuscript",
-        "map": "map",
-        "motion_picture": "motion_picture",
-        "musical_score": "musical_score",
-        "pamphlet": "pamphlet",
-        "paper-conference": "paper-conference",
-        "patent": "patent",
-        "personal_communication": "personal_communication",
-        "post": "post",
-        "post-weblog": "post-weblog",
-        "report": "report",
-        "review-book": "review-book",
-        "review": "review",
-        "song": "song",
-        "speech": "speech",
-        "thesis": "thesis",
-        "treaty": "treaty",
-        "webpage": "webpage",
-    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
-    "bibtex": {
-        "phdthesis": "thesis",
-        "inbook": "chapter",
-        "misc": None,
-        "article": "article-journal",
-        "book": "book",
-    },
-    "resourceTypeGeneral": {
-        "Image": "graphic",
-        "Dataset": "dataset",
-        "PhysicalObject": None,
-        "Collection": None,
-        "Text": None,  # "Greyliterature, labnotes, accompanyingmaterials"
-        "Sound": None,
-        "InteractiveResource": None,
-        "Event": None,
-        "Software": "software",
-        "Other": None,
-        "Workflow": None,
-        "Audiovisual": None,
-    },  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
-}
-
 # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
 DATACITE_UNKNOWN_MARKERS: List[str] = [
     "(:unac)",  # temporarily inaccessible
@@ -181,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [
     }
 ]
 
-# TODO(martin): merge this with other maps and lookup functions, eventually.
-LICENSE_SLUG_MAP: Dict[str, str] = {
-    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
-    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
-    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-    "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
-    "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
-    "//onlinelibrary.wiley.com/termsandconditions/": "WILEY",
-    "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
-    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
-    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
-    "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
-    "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
-    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
-    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
-    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
-    "//www.karger.com/Services/SiteLicenses/": "KARGER",
-    "//www.springer.com/tdm/": "SPRINGER-TDM",
-    "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
-    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
-    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
-    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/MIT.json": "MIT",
-    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-}
-
 
 class DataciteImporter(EntityImporter):
     """
@@ -248,15 +116,6 @@ class DataciteImporter(EntityImporter):
         )
 
         self.create_containers = kwargs.get("create_containers", True)
-        extid_map_file = kwargs.get("extid_map_file")
-        self.extid_map_db = None
-        if extid_map_file:
-            db_uri = "file:{}?mode=ro".format(extid_map_file)
-            print("Using external ID map: {}".format(db_uri), file=sys.stderr)
-            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
-        else:
-            print("Not using external ID map", file=sys.stderr)
-
         self.read_issn_map_file(issn_map_file)
         self.debug = debug
         self.insert_log_file = insert_log_file
@@ -264,42 +123,6 @@ class DataciteImporter(EntityImporter):
 
         print("datacite with debug={}".format(self.debug), file=sys.stderr)
 
-    def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
-        """
-        Return dictionary of identifiers referring to the same things as the given DOI.
-        """
-        if self.extid_map_db is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = self.extid_map_db.execute(
-            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
-        ).fetchone()
-        if row is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = [str(cell or "") or None for cell in row]
-        return dict(
-            core_id=row[0],
-            pmid=row[1],
-            pmcid=row[2],
-            wikidata_qid=row[3],
-            # TODO:
-            arxiv_id=None,
-            jstor_id=None,
-        )
-
     def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
         """
         Mapping datacite JSON to ReleaseEntity.
@@ -368,7 +191,7 @@ class DataciteImporter(EntityImporter):
             print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
             return False
 
-        title = clean(title)
+        title = clean_str(title)
         if not title:
             print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
             return False
@@ -387,7 +210,7 @@ class DataciteImporter(EntityImporter):
         if not subtitle:
             subtitle = None
         else:
-            subtitle = clean(subtitle)
+            subtitle = clean_str(subtitle)
 
         # Dates. A few internal dates (registered, created, updated) and
         # published (0..2554). We try to work with typed date list, in
@@ -445,15 +268,15 @@ class DataciteImporter(EntityImporter):
             publisher = None
 
         if publisher:
-            publisher = clean(publisher)
+            publisher = clean_str(publisher)
 
         # Container. For the moment, only ISSN as container.
         container_id = None
         container_name = None
 
         container = attributes.get("container", {}) or {}
-        if container.get("type") in CONTAINER_TYPE_MAP.keys():
-            container_type = CONTAINER_TYPE_MAP.get(container["type"])
+        if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys():
+            container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"])
             if container.get("identifier") and container.get("identifierType") == "ISSN":
                 issn = container.get("identifier")
                 if issn and len(issn) == 8:
@@ -506,10 +329,10 @@ class DataciteImporter(EntityImporter):
         issue = container.get("issue")
 
         if volume:
-            volume = clean(volume)
+            volume = clean_str(volume)
 
         if issue:
-            issue = clean(issue)
+            issue = clean_str(issue)
 
         # Pages.
         pages = None
@@ -534,7 +357,7 @@ class DataciteImporter(EntityImporter):
         license_extra = []
 
         for lic in attributes.get("rightsList", []):
-            slug = lookup_license_slug(lic.get("rightsUri"))
+            slug = datacite_lookup_license_slug(lic.get("rightsUri"))
             if slug:
                 license_slug = slug
             license_extra.append(lic)
@@ -594,7 +417,7 @@ class DataciteImporter(EntityImporter):
                     "[{}] language detection failed with {} on {}".format(doi, err, text),
                     file=sys.stderr,
                 )
-            abstract_text = clean(text)
+            abstract_text = clean_str(text)
             if not abstract_text:
                 continue
             abstracts.append(
@@ -643,7 +466,13 @@ class DataciteImporter(EntityImporter):
         if license_extra:
             extra_datacite["license"] = license_extra
         if attributes.get("subjects"):
-            extra_datacite["subjects"] = attributes["subjects"]
+            # these subjects with schemeUri are too much metadata, which
+            # doesn't compress. filter them out.
+            extra_subjects = [
+                subj for subj in attributes["subjects"] if not subj.get("schemeUri")
+            ]
+            if extra_subjects:
+                extra_datacite["subjects"] = extra_subjects
 
         # Include version information.
         metadata_version = attributes.get("metadataVersion") or ""
@@ -706,8 +535,6 @@ class DataciteImporter(EntityImporter):
         if release_month:
             extra["release_month"] = release_month
 
-        extids = self.lookup_ext_ids(doi=doi)
-
         # Assemble release.
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
@@ -722,12 +549,6 @@ class DataciteImporter(EntityImporter):
             publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
                 doi=doi,
-                pmid=extids["pmid"],
-                pmcid=extids["pmcid"],
-                wikidata_qid=extids["wikidata_qid"],
-                core=extids["core_id"],
-                arxiv=extids["arxiv_id"],
-                jstor=extids["jstor_id"],
             ),
             contribs=contribs,
             volume=volume,
@@ -922,14 +743,14 @@ class DataciteImporter(EntityImporter):
                 if len(affiliations) == 0:
                     raw_affiliation = None
                 else:
-                    raw_affiliation = clean(affiliations[0])
+                    raw_affiliation = clean_str(affiliations[0])
 
                 name = c.get("name")
                 given_name = c.get("givenName")
                 surname = c.get("familyName")
 
                 if name:
-                    name = clean(name)
+                    name = clean_str(name)
                 if not any((name, given_name, surname)):
                     continue
                 if not name:
@@ -943,8 +764,8 @@ class DataciteImporter(EntityImporter):
                     name = index_form_to_display_name(name)
 
                 if given_name:
-                    given_name = clean(given_name)
-                surname = clean(surname)
+                    given_name = clean_str(given_name)
+                surname = clean_str(surname)
 
                 # Perform a final assertion that name does not reduce to zero
                 # (e.g. whitespace only name).
@@ -1016,7 +837,7 @@ def contributor_list_contains_contributor(
     return False
 
 
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]:
     """
     Resolve a variety of strings into a some pseudo-canonical form, e.g.
     CC-BY-ND, CC-0, MIT and so on.
@@ -1111,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
             return None
         return "RS-{}".format(name.upper())
 
-    # Fallback to mapped values.
-    raw = raw.lower()
-    raw = raw.strip().replace("http://", "//").replace("https://", "//")
-    if not raw.endswith("/"):
-        raw = raw + "/"
-    return LICENSE_SLUG_MAP.get(raw)
+    # Fallback to generic license lookup
+    return lookup_license_slug(raw)
 
 
 def find_original_language_title(
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index f5c886a2..92dbe574 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Sequence
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.importers.common import MAX_ABSTRACT_LENGTH, EntityImporter
 from fatcat_tools.normal import (
     clean_doi,
     clean_orcid,
@@ -24,9 +24,6 @@ from fatcat_tools.normal import (
     parse_month,
 )
 
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
-
 
 class DoajArticleImporter(EntityImporter):
     def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index e36e1b48..3c85132c 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
 
-from .common import EntityImporter, clean, make_rel_url
+from fatcat_tools.normal import clean_doi, clean_str
 
-MAX_ABSTRACT_BYTES = 4096
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url
 
 
 class GrobidMetadataImporter(EntityImporter):
@@ -82,9 +82,9 @@ class GrobidMetadataImporter(EntityImporter):
         extra_grobid: Dict[str, Any] = dict()
 
         abstract = obj.get("abstract")
-        if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
+        if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10:
             abobj = fatcat_openapi_client.ReleaseAbstract(
-                mimetype="text/plain", content=clean(obj.get("abstract"))
+                mimetype="text/plain", content=clean_str(obj.get("abstract"))
             )
             abstracts = [abobj]
         else:
@@ -95,9 +95,9 @@ class GrobidMetadataImporter(EntityImporter):
             contribs.append(
                 fatcat_openapi_client.ReleaseContrib(
                     index=i,
-                    raw_name=clean(a["name"]),
-                    given_name=clean(a.get("given_name")),
-                    surname=clean(a.get("surname")),
+                    raw_name=clean_str(a["name"]),
+                    given_name=clean_str(a.get("given_name")),
+                    surname=clean_str(a.get("surname")),
                     role="author",
                     extra=None,
                 )
@@ -114,15 +114,15 @@ class GrobidMetadataImporter(EntityImporter):
                     pass
             for key in ("volume", "url", "issue", "publisher"):
                 if raw.get(key):
-                    cite_extra[key] = clean(raw[key])
+                    cite_extra[key] = clean_str(raw[key])
             if raw.get("authors"):
-                cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
+                cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]]
 
             refs.append(
                 fatcat_openapi_client.ReleaseRef(
-                    key=clean(raw.get("id")),
+                    key=clean_str(raw.get("id")),
                     year=year,
-                    title=clean(raw["title"]),
+                    title=clean_str(raw["title"]),
                     extra=cite_extra or None,
                 )
             )
@@ -133,11 +133,12 @@ class GrobidMetadataImporter(EntityImporter):
             # only returns year, ever?
             release_year = int(obj["date"][:4])
 
-        extra = dict()
-        if obj.get("doi"):
-            extra["doi"] = obj["doi"]
+        extra: Dict[str, Any] = dict()
+        doi = clean_doi(obj.get("doi"))
+        if doi:
+            extra["doi"] = doi
         if obj["journal"] and obj["journal"].get("name"):
-            extra["container_name"] = clean(obj["journal"]["name"])
+            extra["container_name"] = clean_str(obj["journal"]["name"])
 
         # TODO: ISSN/eISSN handling? or just journal name lookup?
 
@@ -146,7 +147,7 @@ class GrobidMetadataImporter(EntityImporter):
         if self.longtail_oa:
             extra["longtail_oa"] = True
 
-        clean_title = clean(obj["title"], force_xml=True)
+        clean_title = clean_str(obj["title"], force_xml=True)
         if not clean_title or len(clean_title) < 2:
             return None
         title = clean_title
@@ -158,9 +159,9 @@ class GrobidMetadataImporter(EntityImporter):
             release_year=release_year,
             contribs=contribs,
             refs=refs,
-            publisher=clean(obj["journal"].get("publisher")),
-            volume=clean(obj["journal"].get("volume")),
-            issue=clean(obj["journal"].get("issue")),
+            publisher=clean_str(obj["journal"].get("publisher")),
+            volume=clean_str(obj["journal"].get("volume")),
+            issue=clean_str(obj["journal"].get("issue")),
             abstracts=abstracts or None,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(),
             extra=extra or None,
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 2f10e533..9916a55f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,5 +1,4 @@
 import datetime
-import sqlite3
 import sys
 from typing import Any, Dict, List, Optional, Sequence
 
@@ -7,9 +6,9 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str, is_cjk
 
-from .common import DATE_FMT, EntityImporter, clean, is_cjk
+from .common import DATE_FMT, EntityImporter
 
 
 # TODO: should be List[Tag] not List[Any] for full type annotations
@@ -37,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
     for raw in raw_persons:
         name = raw.find("name") or None
         if name:
-            name = clean(name.get_text().replace("\n", " "))
+            name = clean_str(name.get_text().replace("\n", " "))
         surname = raw.find("familyName") or None
         if surname:
-            surname = clean(surname.get_text().replace("\n", " "))
+            surname = clean_str(surname.get_text().replace("\n", " "))
         given_name = raw.find("givenName") or None
         if given_name:
-            given_name = clean(given_name.get_text().replace("\n", " "))
+            given_name = clean_str(given_name.get_text().replace("\n", " "))
         lang = "en"
         if is_cjk(name):
             lang = "ja"
@@ -117,50 +116,8 @@ class JalcImporter(EntityImporter):
         )
 
         self.create_containers = kwargs.get("create_containers", True)
-        extid_map_file = kwargs.get("extid_map_file")
-        self.extid_map_db = None
-        if extid_map_file:
-            db_uri = "file:{}?mode=ro".format(extid_map_file)
-            print("Using external ID map: {}".format(db_uri))
-            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
-        else:
-            print("Not using external ID map")
-
         self.read_issn_map_file(issn_map_file)
 
-    def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
-        if self.extid_map_db is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = self.extid_map_db.execute(
-            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
-        ).fetchone()
-        if row is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = [str(cell or "") or None for cell in row]
-        return dict(
-            core_id=row[0],
-            pmid=row[1],
-            pmcid=row[2],
-            wikidata_qid=row[3],
-            # TODO:
-            arxiv_id=None,
-            jstor_id=None,
-        )
-
     def want(self, raw_record: Any) -> bool:
         return True
 
@@ -273,16 +230,16 @@ class JalcImporter(EntityImporter):
                 for p in record.find_all("publicationName")
                 if p.get_text()
             ]
-            pubs = [clean(p) for p in pubs if p]
+            pubs = [clean_str(p) for p in pubs if p]
             assert pubs
             if len(pubs) > 1 and pubs[0] == pubs[1]:
                 pubs = [pubs[0]]
             if len(pubs) > 1 and is_cjk(pubs[0]):
                 # eng/jpn ordering is not reliable
                 pubs = [pubs[1], pubs[0]]
-            container_name = clean(pubs[0])
+            container_name = clean_str(pubs[0])
             if len(pubs) > 1:
-                container_extra["original_name"] = clean(pubs[1])
+                container_extra["original_name"] = clean_str(pubs[1])
 
         if record.publisher:
             pubs = [
@@ -297,7 +254,7 @@ class JalcImporter(EntityImporter):
                 # ordering is not reliable
                 pubs = [pubs[1], pubs[0]]
             if pubs:
-                publisher = clean(pubs[0])
+                publisher = clean_str(pubs[0])
                 if len(pubs) > 1:
                     container_extra["publisher_aliases"] = pubs[1:]
 
@@ -330,9 +287,6 @@ class JalcImporter(EntityImporter):
         # reasonable default for this collection
         release_type = "article-journal"
 
-        # external identifiers
-        extids = self.lookup_ext_ids(doi=doi)
-
         # extra:
         #   translation_of
         #   aliases
@@ -342,26 +296,20 @@ class JalcImporter(EntityImporter):
         # (informally)
         extra["jalc"] = extra_jalc
 
-        title = clean(title)
+        title = clean_str(title)
         if not title:
             return None
 
         re = ReleaseEntity(
             work_id=None,
             title=title,
-            original_title=clean(original_title),
+            original_title=clean_str(original_title),
             release_type=release_type,
             release_stage="published",
             release_date=release_date,
             release_year=release_year,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
                 doi=doi,
-                pmid=extids["pmid"],
-                pmcid=extids["pmcid"],
-                wikidata_qid=extids["wikidata_qid"],
-                core=extids["core_id"],
-                arxiv=extids["arxiv_id"],
-                jstor=extids["jstor_id"],
             ),
             volume=volume,
             issue=issue,
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index a45e49f3..fc1dfcbd 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ContainerEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
 
 
 def or_none(s: Optional[str]) -> Optional[str]:
@@ -105,7 +107,7 @@ class JournalMetadataImporter(EntityImporter):
         if extra_ia:
             extra["ia"] = extra_ia
 
-        name = clean(row.get("name"))
+        name = clean_str(row.get("name"))
         if not name:
             return None
 
@@ -115,7 +117,7 @@ class JournalMetadataImporter(EntityImporter):
             issnp=row.get("issnp"),
             container_type=None,  # TODO
             name=name,
-            publisher=clean(row.get("publisher")),
+            publisher=clean_str(row.get("publisher")),
             wikidata_qid=None,  # TODO
             extra=extra,
         )
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 2c8aa0a4..79691c9a 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,7 +8,10 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from .common import LANG_MAP_MARC, EntityImporter, clean
+from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC
+from fatcat_tools.normal import clean_doi, clean_str
+
+from .common import EntityImporter
 from .crossref import CONTAINER_TYPE_MAP
 
 # TODO: more entries?
@@ -138,7 +141,7 @@ class JstorImporter(EntityImporter):
                 issnl=issnl,
                 publisher=publisher,
                 container_type=self.map_container_type(release_type),
-                name=clean(journal_title, force_xml=True),
+                name=clean_str(journal_title, force_xml=True),
             )
             ce_edit = self.create_container(ce)
             container_id = ce_edit.ident
@@ -146,7 +149,9 @@ class JstorImporter(EntityImporter):
 
         doi = article_meta.find("article-id", {"pub-id-type": "doi"})
         if doi:
-            doi = doi.string.lower().strip()
+            doi = clean_doi(doi.string.lower())
+        else:
+            doi = None
 
         jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
         if jstor_id:
@@ -162,13 +167,13 @@ class JstorImporter(EntityImporter):
             for c in cgroup.find_all("contrib"):
                 given = c.find("given-names")
                 if given:
-                    given = clean(given.get_text().replace("\n", " "))
+                    given = clean_str(given.get_text().replace("\n", " "))
                 surname = c.find("surname")
                 if surname:
-                    surname = clean(surname.get_text().replace("\n", " "))
+                    surname = clean_str(surname.get_text().replace("\n", " "))
                 raw_name = c.find("string-name")
                 if raw_name:
-                    raw_name = clean(raw_name.get_text().replace("\n", " "))
+                    raw_name = clean_str(raw_name.get_text().replace("\n", " "))
 
                 if not raw_name:
                     if given and surname:
@@ -230,7 +235,7 @@ class JstorImporter(EntityImporter):
 
         # JSTOR issue-id
         if article_meta.find("issue-id"):
-            issue_id = clean(article_meta.find("issue-id").string)
+            issue_id = clean_str(article_meta.find("issue-id").string)
             if issue_id:
                 extra_jstor["issue_id"] = issue_id
 
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 430cdd0f..f3d82a86 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -4,7 +4,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, CreatorEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
 
 
 def value_or_none(e: Any) -> Any:
@@ -65,14 +67,14 @@ class OrcidImporter(EntityImporter):
         if not self.is_orcid(orcid):
             sys.stderr.write("Bad ORCID: {}\n".format(orcid))
             return None
-        display = clean(display)
+        display = clean_str(display)
         if not display:
             # must have *some* name
             return None
         ce = CreatorEntity(
             orcid=orcid,
-            given_name=clean(given),
-            surname=clean(sur),
+            given_name=clean_str(given),
+            surname=clean_str(sur),
             display_name=display,
             extra=extra,
         )
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 1cdb450b..a6c7409d 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,317 +8,15 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid
-
-from .common import LANG_MAP_MARC, EntityImporter, clean
-
-# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
-PUBMED_RELEASE_TYPE_MAP = {
-    # Adaptive Clinical Trial
-    "Address": "speech",
-    "Autobiography": "book",
-    # Bibliography
-    "Biography": "book",
-    # Case Reports
-    "Classical Article": "article-journal",
-    # Clinical Conference
-    # Clinical Study
-    # Clinical Trial
-    # Clinical Trial, Phase I
-    # Clinical Trial, Phase II
-    # Clinical Trial, Phase III
-    # Clinical Trial, Phase IV
-    # Clinical Trial Protocol
-    # Clinical Trial, Veterinary
-    # Collected Works
-    # Comparative Study
-    # Congress
-    # Consensus Development Conference
-    # Consensus Development Conference, NIH
-    # Controlled Clinical Trial
-    "Dataset": "dataset",
-    # Dictionary
-    # Directory
-    # Duplicate Publication
-    "Editorial": "editorial",
-    # English Abstract   # doesn't indicate that this is abstract-only
-    # Equivalence Trial
-    # Evaluation Studies
-    # Expression of Concern
-    # Festschrift
-    # Government Document
-    # Guideline
-    "Historical Article": "article-journal",
-    # Interactive Tutorial
-    "Interview": "interview",
-    "Introductory Journal Article": "article-journal",
-    "Journal Article": "article-journal",
-    "Lecture": "speech",
-    "Legal Case": "legal_case",
-    "Legislation": "legislation",
-    "Letter": "letter",
-    # Meta-Analysis
-    # Multicenter Study
-    # News
-    "Newspaper Article": "article-newspaper",
-    # Observational Study
-    # Observational Study, Veterinary
-    # Overall
-    # Patient Education Handout
-    # Periodical Index
-    # Personal Narrative
-    # Portrait
-    # Practice Guideline
-    # Pragmatic Clinical Trial
-    # Publication Components
-    # Publication Formats
-    # Publication Type Category
-    # Randomized Controlled Trial
-    # Research Support, American Recovery and Reinvestment Act
-    # Research Support, N.I.H., Extramural
-    # Research Support, N.I.H., Intramural
-    # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
-    # Research Support, U.S. Gov't, P.H.S.
-    # Review     # in the "literature review" sense, not "product review"
-    # Scientific Integrity Review
-    # Study Characteristics
-    # Support of Research
-    # Systematic Review
-    "Technical Report": "report",
-    # Twin Study
-    # Validation Studies
-    # Video-Audio Media
-    # Webcasts
-}
-
-MONTH_ABBR_MAP = {
-    "Jan": 1,
-    "01": 1,
-    "Feb": 2,
-    "02": 2,
-    "Mar": 3,
-    "03": 3,
-    "Apr": 4,
-    "04": 4,
-    "May": 5,
-    "05": 5,
-    "Jun": 6,
-    "06": 6,
-    "Jul": 7,
-    "07": 7,
-    "Aug": 8,
-    "08": 8,
-    "Sep": 9,
-    "09": 9,
-    "Oct": 10,
-    "10": 10,
-    "Nov": 11,
-    "11": 11,
-    "Dec": 12,
-    "12": 12,
-}
-
-# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
-COUNTRY_NAME_MAP = {
-    "Afghanistan": "af",
-    "Albania": "al",
-    "Algeria": "dz",
-    "Andorra": "ad",
-    "Angola": "ao",
-    "Antigua and Barbuda": "ag",
-    "Argentina": "ar",
-    "Armenia": "am",
-    "Australia": "au",
-    "Austria": "at",
-    "Azerbaijan": "az",
-    "Bahamas": "bs",
-    "Bahrain": "bh",
-    "Bangladesh": "bd",
-    "Barbados": "bb",
-    "Belarus": "by",
-    "Belgium": "be",
-    "Belize": "bz",
-    "Benin": "bj",
-    "Bhutan": "bt",
-    "Bolivia": "bo",
-    "Bosnia and Herzegowina": "ba",
-    "Botswana": "bw",
-    "Brazil": "br",
-    "Brunei Darussalam": "bn",
-    "Bulgaria": "bg",
-    "Burkina Faso": "bf",
-    "Burundi": "bi",
-    "Cambodia": "kh",
-    "Cameroon": "cm",
-    "Canada": "ca",
-    "Cape Verde": "cv",
-    "Central African Republic": "cf",
-    "Chad": "td",
-    "Chile": "cl",
-    "China": "cn",
-    "Colombia": "co",
-    "Comoros": "km",
-    "Congo, Democratic Republic": "cd",
-    "Congo, People’s Republic": "cg",
-    "Costa Rica": "cr",
-    "Cote d'Ivoire": "ci",
-    "Croatia (Local Name: Hrvatska)": "hr",
-    "Cuba": "cu",
-    "Cyprus": "cy",
-    "Czech Republic": "cz",
-    "Denmark": "dk",
-    "Djibouti": "dj",
-    "Dominica": "dm",
-    "Dominican Republic": "do",
-    "East Timor": "tl",
-    "Ecuador": "ec",
-    "El Salvador": "sv",
-    "Equatorial Guinea": "gq",
-    "Eritrea": "er",
-    "Estonia": "ee",
-    "Ethiopia": "et",
-    "Fiji": "fj",
-    "Finland": "fi",
-    "France": "fr",
-    "Gabon": "ga",
-    "Gambia": "gm",
-    "Georgia": "ge",
-    "Germany": "de",
-    "Ghana": "gh",
-    "Greece": "gr",
-    "Greenland": "gl",
-    "Grenada": "gd",
-    "Guatemala": "gt",
-    "Guinea": "gn",
-    "Guinea-Bissau": "gw",
-    "Guyana": "gy",
-    "Haiti": "ht",
-    "Honduras": "hn",
-    "Hong Kong": "hk",
-    "Hungary": "hu",
-    "Iceland": "is",
-    "India": "in",
-    "Indonesia": "id",
-    "Iran": "ir",
-    "Iraq": "iq",
-    "Ireland": "ie",
-    "Israel": "il",
-    "Italy": "it",
-    "Jamaica": "jm",
-    "Japan": "jp",
-    "Jordan": "jo",
-    "Kazakhstan": "kz",
-    "Kenya": "ke",
-    "Kiribati": "ki",
-    "Korea, Democratic People's Republic": "kp",
-    "Korea, Republic": "kr",
-    "Kuwait": "kw",
-    "Kyrgyzstan": "kg",
-    "Laos": "la",
-    "Latvia": "lv",
-    "Lebanon": "lb",
-    "Lesotho": "ls",
-    "Liberia": "lr",
-    "Libya": "ly",
-    "Liechtenstein": "li",
-    "Lithuania": "lt",
-    "Luxembourg": "lu",
-    "Macedonia": "mk",
-    "Madagascar": "mg",
-    "Malawi": "mw",
-    "Malaysia": "my",
-    "Maldives": "mv",
-    "Mali": "ml",
-    "Malta": "mt",
-    "Marshall Islands": "mh",
-    "Mauritania": "mr",
-    "Mauritius": "mu",
-    "Mexico": "mx",
-    "Micronesia": "fm",
-    "Moldova": "md",
-    "Monaco": "mc",
-    "Mongolia": "mn",
-    "Morocco": "ma",
-    "Mozambique": "mz",
-    "Myanmar": "mm",
-    "Namibia": "na",
-    "Nauru": "nr",
-    "Nepal": "np",
-    "Netherlands": "nl",
-    "New Zealand": "nz",
-    "Nicaragua": "ni",
-    "Niger": "ne",
-    "Nigeria": "ng",
-    "Norway": "no",
-    "Oman": "om",
-    "Pakistan": "pk",
-    "Palau": "pw",
-    "Panama": "pa",
-    "Papua New Guinea": "pg",
-    "Paraguay": "py",
-    "Peru": "pe",
-    "Philippines": "ph",
-    "Poland": "pl",
-    "Portugal": "pt",
-    "Puerto Rico": "pr",
-    "Qatar": "qa",
-    "Romania": "ro",
-    "Russian Federation": "ru",
-    "Rwanda": "rw",
-    "Saint Kitts and Nevis": "kn",
-    "Saint Lucia": "lc",
-    "Saint Vincent and the Grenadines": "vc",
-    "Samoa": "ws",
-    "San Marino": "sm",
-    "Sao Tome and Príncipe": "st",
-    "Saudi Arabia": "sa",
-    "Senegal": "sn",
-    "Serbia and Montenegro": "cs",
-    "Seychelles": "sc",
-    "Sierra Leone": "sl",
-    "Singapore": "sg",
-    "Slovakia (Slovak Republic)": "sk",
-    "Slovenia": "si",
-    "Solomon Islands": "sb",
-    "Somalia": "so",
-    "South Africa": "za",
-    "Spain": "es",
-    "Sri Lanka": "lk",
-    "Sudan": "sd",
-    "Suriname": "sr",
-    "Swaziland": "sz",
-    "Sweden": "se",
-    "Switzerland": "ch",
-    "Syrian Arab Republic": "sy",
-    "Taiwan": "tw",
-    "Tajikistan": "tj",
-    "Tanzania": "tz",
-    "Tanzania": "tz",
-    "Thailand": "th",
-    "Togo": "tg",
-    "Tonga": "to",
-    "Trinidad and Tobago": "tt",
-    "Tunisia": "tn",
-    "Turkey": "tr",
-    "Turkmenistan": "tm",
-    "Tuvalu": "tv",
-    "Uganda": "ug",
-    "Ukraine": "ua",
-    "United Arab Emirates": "ae",
-    "United Kingdom": "gb",
-    "United States": "us",
-    "Uruguay": "uy",
-    # Additions from running over large files
-    "Bosnia and Herzegovina": "ba",
-    # "International"
-    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn
-    "Russia (Federation)": "ru",
-    "Scotland": "gb",
-    "England": "gb",
-    "Korea (South)": "kr",
-    "Georgia (Republic)": "ge",
-    "Egypt": "eg",
-}
+from fatcat_tools.biblio_lookup_tables import (
+    COUNTRY_NAME_MAP,
+    LANG_MAP_MARC,
+    MONTH_ABBR_MAP,
+    PUBMED_RELEASE_TYPE_MAP,
+)
+from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str
+
+from .common import EntityImporter
 
 
 class PubmedImporter(EntityImporter):
@@ -704,14 +402,14 @@ class PubmedImporter(EntityImporter):
         if extra_pubmed:
             extra["pubmed"] = extra_pubmed
 
-        title = clean(title)
+        title = clean_str(title)
         if not title:
             return None
 
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
             title=title,
-            original_title=clean(original_title),
+            original_title=clean_str(original_title),
             release_type=release_type,
             release_stage=release_stage,
             release_date=release_date,
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
deleted file mode 100755
index 5caed2c7..00000000
--- a/python/fatcat_tools/importers/wayback_static.py
+++ /dev/null
@@ -1,287 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Helpers to create Web Capture entities from extracted wayback content.
-
-Works as a stand-alone script (for debugging) or as library routines.
-"""
-
-import argparse
-import datetime
-import hashlib
-import json
-import subprocess
-import sys
-from typing import Any, Dict, List, Optional, Tuple
-
-import requests
-from bs4 import BeautifulSoup
-from fatcat_openapi_client import (
-    ApiClient,
-    Editgroup,
-    EntityEdit,
-    WebcaptureCdxLine,
-    WebcaptureEntity,
-    WebcaptureUrl,
-)
-
-from .common import b32_hex
-
-CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
-GWB_URL_BASE = "https://web.archive.org/web"
-REQ_SESSION = requests.Session()
-
-
-def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]:
-    """Takes a wayback machine URL, and returns a tuple:
-
-    (timestamp, datetime, original_url)
-    """
-    chunks = url.split("/")
-    assert len(chunks) >= 6
-    assert chunks[2] == "web.archive.org"
-    assert chunks[3] == "web"
-    return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
-
-
-def test_parse_wbm_url() -> None:
-    u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
-    assert parse_wbm_url(u) == (
-        "20010712114837",
-        datetime.datetime(2001, 7, 12, 11, 48, 37),
-        "http://www.dlib.org/dlib/june01/reich/06reich.html",
-    )
-
-
-def parse_wbm_timestamp(timestamp: str) -> datetime.datetime:
-    """
-    Takes a complete WBM timestamp string (like "20020327115625") and returns a
-    python datetime object (UTC)
-    """
-    # strip any "im_" or "id_" suffix
-    if timestamp.endswith("_"):
-        timestamp = timestamp[:-3]
-    # inflexible; require the full second-precision timestamp
-    assert len(timestamp) == 14
-    return datetime.datetime(
-        year=int(timestamp[0:4]),
-        month=int(timestamp[4:6]),
-        day=int(timestamp[6:8]),
-        hour=int(timestamp[8:10]),
-        minute=int(timestamp[10:12]),
-        second=int(timestamp[12:14]),
-    )
-
-
-def test_parse_wbm_timestamp() -> None:
-    assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
-
-
-def fetch_wbm(url: str) -> bytes:
-    resp = REQ_SESSION.get(url)
-    resp.raise_for_status()
-    assert resp.content
-    return resp.content
-
-
-def lookup_cdx(
-    embed_url: str, verify_hashes: bool = True, cdx_output: Any = None
-) -> Optional[WebcaptureCdxLine]:
-    sys.stderr.write(embed_url + "\n")
-    assert embed_url.startswith("/web/")
-    embed_url_segments = embed_url.split("/")
-    timestamp = embed_url_segments[2]
-    if timestamp.endswith("_"):
-        timestamp = timestamp[:-3]
-    url = "/".join(embed_url_segments[3:])
-    # print((timestamp, url))
-    params: Dict = dict(
-        url=url,
-        closest=timestamp,
-        sort="closest",
-        resolveRevisits="true",
-        matchType="exact",
-        limit=1,
-    )
-    resp = REQ_SESSION.get(
-        CDX_API_BASE,
-        params=params,
-    )
-    resp.raise_for_status()
-    # print(resp.url)
-    if resp.content:
-        hit = resp.content.decode("utf-8").split("\n")[0]
-        if cdx_output:
-            cdx_output.write(hit + "\n")
-        cdx_chunks = hit.split(" ")
-        cdx = [x if (x and x != "-") else None for x in cdx_chunks]
-        webcapture_cdx = WebcaptureCdxLine(
-            surt=cdx[0],
-            timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z",
-            url=cdx[2],
-            mimetype=cdx[3],
-            status_code=int(cdx[4] or ""),
-            sha1=b32_hex(cdx[5] or ""),
-            sha256=None,
-        )
-        if verify_hashes:
-            resp = REQ_SESSION.get(
-                GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url)  # raw timestamp
-            )
-            resp.raise_for_status()
-            assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
-            webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
-            webcapture_cdx.size = len(resp.content)
-        return webcapture_cdx
-    else:
-        return None
-
-
-def wayback_url_to_relative(url: str) -> Optional[str]:
-    """
-    Wayback URLs can be relative or absolute in rewritten documents. This
-    function converts any form of rewritten URL to a relative (to
-    web.archive.org) one, or returns None if it isn't a rewritten URL at all.
-    """
-    if url.startswith("https://web.archive.org/"):
-        url = url[23:]
-    elif url.startswith("http://web.archive.org/"):
-        url = url[22:]
-
-    if url.startswith("/web/"):
-        return url
-    else:
-        return None
-
-
-def extract_embeds(soup: BeautifulSoup) -> List[str]:
-
-    embeds = set()
-
-    # <link href="">
-    for tag in soup.find_all("link", href=True):
-        if tag["rel"] not in ("stylesheet",):
-            continue
-        url = wayback_url_to_relative(tag["href"])
-        if url:
-            embeds.add(url)
-    # <img src="">
-    for tag in soup.find_all("img", src=True):
-        url = wayback_url_to_relative(tag["src"])
-        if url:
-            embeds.add(url)
-
-    # <script src="">
-    for tag in soup.find_all("script", src=True):
-        url = wayback_url_to_relative(tag["src"])
-        if url:
-            embeds.add(url)
-
-    return list(embeds)
-
-
-def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity:
-    """
-    Given a complete wayback machine capture URL, like:
-
-        http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html
-
-    Will return a new ("bare") fatcat webcapture entity python object, with all
-    the CDX entries filled in.
-    """
-
-    wbm_html = fetch_wbm(wayback_url)
-    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
-    # with open(rewritten_path, 'r') as fp:
-    #    soup = BeautifulSoup(fp, "lxml")
-    soup = BeautifulSoup(wbm_html, "lxml")
-    embeds = extract_embeds(soup)
-    cdx_obj = lookup_cdx(
-        "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
-    )
-    cdx_list = [cdx_obj]
-    for url in embeds:
-        cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
-        cdx_list.append(cdx_obj)
-    archive_urls = [
-        WebcaptureUrl(
-            rel="wayback",
-            url="https://web.archive.org/web/",
-        )
-    ]
-    wc = WebcaptureEntity(
-        cdx=cdx_list,
-        timestamp=timestamp.isoformat() + "Z",
-        original_url=original_url,
-        archive_urls=archive_urls,
-        release_ids=None,
-    )
-    return wc
-
-
-def auto_wayback_static(
-    api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None
-) -> Tuple[Optional[str], Optional[EntityEdit]]:
-    """
-    Returns a tuple: (editgroup_id, edit). If failed, both are None
-    """
-
-    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
-    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
-    release = api.get_release(release_id, expand="webcaptures")
-
-    # check for existing webcapture with same parameters
-    for wc in release.webcaptures:
-        if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
-            # skipping: already existed
-            print(
-                "release {} already had webcapture {} {}".format(
-                    release_id, raw_timestamp, original_url
-                )
-            )
-            return (None, None)
-
-    wc = static_wayback_webcapture(wayback_url)
-    assert len(wc.cdx) >= 1
-    wc.release_ids = [release_id]
-    if not editgroup_id:
-        eg = api.create_editgroup(
-            Editgroup(
-                description="One-off import of static web content from wayback machine",
-                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
-            )
-        )
-        editgroup_id = eg.editgroup_id
-    edit = api.create_webcapture(eg.editgroup_id, wc)
-    return (editgroup_id, edit)
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--verbose", action="store_true", help="verbose output")
-    parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
-    parser.add_argument(
-        "--json-output",
-        type=argparse.FileType("w"),
-        default=sys.stdout,
-        help="where to write out webcapture entity (as JSON)",
-    )
-    parser.add_argument(
-        "--cdx-output",
-        type=argparse.FileType("w"),
-        default=None,
-        help="(optional) file to write out CDX stub",
-    )
-
-    args = parser.parse_args()
-
-    # entity-to-JSON code; duplicate of entity_to_dict()
-    api_client = ApiClient()
-    wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output)
-    wc_dict = api_client.sanitize_for_serialization(wc)
-    print(json.dumps(wc_dict))
-
-
-if __name__ == "__main__":
-    main()
author	bnewbold <bnewbold@archive.org>	2021-11-11 01:12:18 +0000
committer	bnewbold <bnewbold@archive.org>	2021-11-11 01:12:18 +0000
commit	6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree	1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/importers
parent	7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent	6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
download	fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip