From ba7f9214d2038882952eb50cd4dc5eff4eb0e6ff Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 18:10:35 -0800 Subject: remove deprecated extid sqlite3 lookup table feature from importers This was used during initial bulk imports, but is no longer used and could create serious metadata problems if used accidentally. In retrospect, it also made metadata provenance less transparent, and may have done more harm than good overall. --- python/README_import.md | 3 ++ python/fatcat_import.py | 22 +------------ python/fatcat_tools/importers/crossref.py | 54 ------------------------------- python/fatcat_tools/importers/datacite.py | 54 ------------------------------- python/fatcat_tools/importers/jalc.py | 52 ----------------------------- python/tests/import_crossref.py | 8 ++--- python/tests/import_datacite.py | 2 -- python/tests/import_jalc.py | 8 ++--- python/tests/import_jstor.py | 8 ++--- python/tests/import_pubmed.py | 2 -- 10 files changed, 10 insertions(+), 203 deletions(-) diff --git a/python/README_import.md b/python/README_import.md index 6853a4d7..74e75e14 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -52,6 +52,7 @@ Usually tens of minutes on fast production machine. Usually 24 hours or so on fast production machine. + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 ## JALC @@ -59,6 +60,7 @@ Usually 24 hours or so on fast production machine. First import a random subset single threaded to create (most) containers. On a fast machine, this takes a couple minutes. + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated time ./fatcat_import.py jalc /srv/fatcat/datasets/JALC-LOD-20180907.sample10k.rdf /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 Then, in parallel: @@ -116,6 +118,7 @@ Prep JSON files from sqlite (for parallel import): Run import in parallel: + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated export FATCAT_AUTH_WORKER_CRAWL=...
zcat /srv/fatcat/datasets/s2_doi.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id DIRECT-OA-CRAWL-2019 --no-require-grobid diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 445acde8..39ef200a 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -54,7 +54,6 @@ def run_crossref(args: argparse.Namespace) -> None: fci = CrossrefImporter( args.api, args.issn_map_file, - extid_map_file=args.extid_map_file, edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, ) @@ -72,7 +71,7 @@ def run_crossref(args: argparse.Namespace) -> None: def run_jalc(args: argparse.Namespace) -> None: - ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file) + ji = JalcImporter(args.api, args.issn_map_file) Bs4XmlLinesPusher(ji, args.xml_file, " None: edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, debug=args.debug, - extid_map_file=args.extid_map_file, insert_log_file=args.insert_log_file, ) if args.kafka_mode: @@ -494,12 +492,6 @@ def main() -> None: default=None, type=argparse.FileType("r"), ) - sub_crossref.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) sub_crossref.add_argument( "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)" ) @@ -529,12 +521,6 @@ def main() -> None: default=None, type=argparse.FileType("r"), ) - sub_jalc.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files") sub_arxiv.set_defaults( @@ -963,12 +949,6 @@ def main() -> None: default=None, type=argparse.FileType("r"), ) - sub_datacite.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) sub_datacite.add_argument( "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" ) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index a41e2bf5..9c69fee3 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,5 +1,4 @@ import datetime -import sqlite3 from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client @@ -128,8 +127,6 @@ class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. - Can use a local sqlite3 file for faster "external identifier" lookups - See https://github.com/CrossRef/rest-api-doc for JSON schema notes """ @@ -150,50 +147,8 @@ class CrossrefImporter(EntityImporter): ) self.create_containers: bool = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db: Optional[Any] = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi: str) -> Optional[Any]: - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def map_release_type(self, crossref_type: str) -> Optional[str]: return CROSSREF_TYPE_MAP.get(crossref_type) @@ -473,9 +428,6 @@ class CrossrefImporter(EntityImporter): # unknown release_stage = None - # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {} - # filter out unreasonably huge releases if len(abstracts) > 100: self.counts["skip-huge-abstracts"] += 1 @@ -538,13 +490,7 @@ class CrossrefImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=obj["DOI"].lower(), - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], isbn13=isbn13, - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), volume=clean(obj.get("volume")), issue=clean(obj.get("issue")), diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d5622960..d4d7a9f5 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -12,7 +12,6 @@ import collections import datetime import json import re -import sqlite3 import sys from typing import Any, Dict, List, Optional, Sequence, Set, Tuple @@ -248,15 +247,6 @@ class DataciteImporter(EntityImporter): ) self.create_containers = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri), file=sys.stderr) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map", file=sys.stderr) - self.read_issn_map_file(issn_map_file) self.debug = debug self.insert_log_file = insert_log_file @@ -264,42 +254,6 @@ class DataciteImporter(EntityImporter): print("datacite with debug={}".format(self.debug), file=sys.stderr) - def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: - """ - Return dictionary of identifiers referring to the same things as the given DOI. - """ - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: """ Mapping datacite JSON to ReleaseEntity. @@ -706,8 +660,6 @@ class DataciteImporter(EntityImporter): if release_month: extra["release_month"] = release_month - extids = self.lookup_ext_ids(doi=doi) - # Assemble release. 
re = fatcat_openapi_client.ReleaseEntity( work_id=None, @@ -722,12 +674,6 @@ class DataciteImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), contribs=contribs, volume=volume, diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 2f10e533..a737ac9f 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,5 +1,4 @@ import datetime -import sqlite3 import sys from typing import Any, Dict, List, Optional, Sequence @@ -117,50 +116,8 @@ class JalcImporter(EntityImporter): ) self.create_containers = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def want(self, raw_record: Any) -> bool: return True @@ -330,9 +287,6 @@ class JalcImporter(EntityImporter): # reasonable default for this collection release_type = "article-journal" - # external identifiers - extids = self.lookup_ext_ids(doi=doi) - # extra: # translation_of # aliases @@ -356,12 +310,6 @@ class JalcImporter(EntityImporter): release_year=release_year, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), volume=volume, issue=issue, diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index eb931eb1..5f38e73e 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -10,17 +10,13 @@ from fatcat_tools.importers import CrossrefImporter, JsonLinePusher @pytest.fixture(scope="function") def crossref_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield CrossrefImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield CrossrefImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def crossref_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield CrossrefImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield CrossrefImporter(api, issn_file, bezerk_mode=False) @pytest.mark.skip( diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 220dc0f6..b15d14c3 100644 --- a/python/tests/import_datacite.py +++ 
b/python/tests/import_datacite.py @@ -30,7 +30,6 @@ def datacite_importer(api): yield DataciteImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True, ) @@ -41,7 +40,6 @@ def datacite_importer_existing(api): yield DataciteImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False, ) diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py index 4ebc87b4..8281b9a1 100644 --- a/python/tests/import_jalc.py +++ b/python/tests/import_jalc.py @@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, Bs4XmlLinesPusher, JalcImpo @pytest.fixture(scope="function") def jalc_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JalcImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield JalcImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def jalc_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JalcImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield JalcImporter(api, issn_file, bezerk_mode=False) def test_jalc_importer(jalc_importer): diff --git a/python/tests/import_jstor.py b/python/tests/import_jstor.py index 8ad550b3..7e13c8b0 100644 --- a/python/tests/import_jstor.py +++ b/python/tests/import_jstor.py @@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, JstorImporter @pytest.fixture(scope="function") def jstor_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JstorImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield JstorImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def jstor_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JstorImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield JstorImporter(api, issn_file, bezerk_mode=False) def test_jstor_importer(jstor_importer): diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py index a5301f29..e783db48 100644 --- a/python/tests/import_pubmed.py +++ b/python/tests/import_pubmed.py @@ -11,7 +11,6 @@ def pubmed_importer(api): yield PubmedImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True, lookup_refs=True, ) @@ -23,7 +22,6 @@ def pubmed_importer_existing(api): yield PubmedImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False, lookup_refs=True, ) -- cgit v1.2.3 From 2fd90ad2cc561fa743a617315824b2744f737575 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 18:12:39 -0800 Subject: clean_doi: stop mutating double-slash DOIs, except for 10.1037 prefix --- python/fatcat_tools/normal.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 34e5c3d1..0d2c84ce 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -47,7 +47,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]: raw = raw[8:] if raw.startswith("dx.doi.org/"): raw = raw[11:] - if raw[7:9] == "//": + if raw[7:9] == "//" and "10.1037//" in raw: raw = raw[:8] + raw[9:] # fatcatd uses same REGEX, but Rust regex rejects these characters, while @@ -74,6 +74,7 @@ def test_clean_doi() -> None: assert 
clean_doi("10.1234/asdf ") == "10.1234/asdf" assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" + assert clean_doi("10.1026//1616-1041.3.2.86") == "10.1026//1616-1041.3.2.86" assert clean_doi("10.23750/abm.v88i2 -s.6506") is None assert clean_doi("10.17167/mksz.2017.2.129–155") is None assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf" -- cgit v1.2.3 From 1024e688bb12d64648ceb638daf049d508f87561 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 18:13:15 -0800 Subject: importers: use clean_doi() in many more (all?) importers --- python/fatcat_tools/importers/arxiv.py | 6 ++++-- python/fatcat_tools/importers/cdl_dash_dat.py | 6 ++++-- python/fatcat_tools/importers/common.py | 5 ++--- python/fatcat_tools/importers/crossref.py | 9 ++++++++- python/fatcat_tools/importers/grobid_metadata.py | 9 ++++++--- python/fatcat_tools/importers/jstor.py | 6 +++++- 6 files changed, 29 insertions(+), 12 deletions(-) diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 1d50dd9a..dd2c2284 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -9,6 +9,8 @@ from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity from pylatexenc.latex2text import LatexNodes2Text +from fatcat_tools.normal import clean_doi + from .common import EntityImporter from .crossref import lookup_license_slug @@ -127,8 +129,8 @@ class ArxivRawImporter(EntityImporter): base_id = metadata.id.string doi = None if metadata.doi and metadata.doi.string: - doi = metadata.doi.string.lower().split()[0].strip() - if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): + doi = clean_doi(metadata.doi.string.lower().split()[0].strip()) + if doi and not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): sys.stderr.write("BOGUS DOI: {}\n".format(doi)) doi = None title = latex_to_text(metadata.title.get_text().replace("\n", " ")) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index 1a4114a0..ec557e15 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -23,6 +23,8 @@ from fatcat_openapi_client import ( ReleaseExtIds, ) +from fatcat_tools.normal import clean_doi + from .common import clean from .crossref import lookup_license_slug @@ -78,8 +80,8 @@ def cdl_dash_release( extra = dict() assert meta["identifier"]["type"] == "DOI" - doi = meta["identifier"]["value"].lower() - assert doi.startswith("10.") + doi = clean_doi(meta["identifier"]["value"].lower()) + assert doi and doi.startswith("10.") ark_id = None for extid in meta.get("alternativeIdentifiers", []): diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index fd472d11..425b6f13 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -29,7 +29,7 @@ from fuzzycat.matching import match_release_fuzzy # TODO: refactor so remove need for this (re-imports for backwards compatibility) from fatcat_tools.normal import is_cjk # noqa: F401 -from fatcat_tools.normal import LANG_MAP_MARC, b32_hex # noqa: F401 +from fatcat_tools.normal import LANG_MAP_MARC, b32_hex, clean_doi # noqa: F401 from fatcat_tools.normal import clean_str as clean # noqa: F401 from fatcat_tools.transforms import entity_to_dict @@ -342,8 +342,7 @@ class 
EntityImporter: return creator_id def is_doi(self, doi: str) -> bool: - # TODO: replace with clean_doi() from fatcat_tools.normal - return doi.startswith("10.") and doi.count("/") >= 1 + return clean_doi(doi) is not None def lookup_doi(self, doi: str) -> Optional[str]: """Caches calls to the doi lookup API endpoint in a local dict diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 9c69fee3..c9f251fc 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -4,6 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity +from fatcat_tools.normal import clean_doi + from .common import EntityImporter, clean # The docs/guide should be the canonical home for these mappings; update there @@ -467,6 +469,11 @@ class CrossrefImporter(EntityImporter): self.counts["skip-blank-title"] += 1 return None + doi = clean_doi(obj["DOI"].lower()) + if not doi: + self.counts["skip-bad-doi"] += 1 + return None + subtitle = None if obj.get("subtitle"): subtitle = clean(obj["subtitle"][0], force_xml=True) @@ -489,7 +496,7 @@ class CrossrefImporter(EntityImporter): release_year=release_year, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=obj["DOI"].lower(), + doi=doi, isbn13=isbn13, ), volume=clean(obj.get("volume")), diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index e36e1b48..7c595787 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,6 +7,8 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity +from fatcat_tools.normal import clean_doi + from .common import EntityImporter, clean, make_rel_url MAX_ABSTRACT_BYTES = 4096 @@ -133,9 +135,10 @@ class GrobidMetadataImporter(EntityImporter): # only returns year, ever? 
release_year = int(obj["date"][:4]) - extra = dict() - if obj.get("doi"): - extra["doi"] = obj["doi"] + extra: Dict[str, Any] = dict() + doi = clean_doi(obj.get("doi")) + if doi: + extra["doi"] = doi if obj["journal"] and obj["journal"].get("name"): extra["container_name"] = clean(obj["journal"]["name"]) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 2c8aa0a4..ca1f2466 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -8,6 +8,8 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity +from fatcat_tools.normal import clean_doi + from .common import LANG_MAP_MARC, EntityImporter, clean from .crossref import CONTAINER_TYPE_MAP @@ -146,7 +148,9 @@ class JstorImporter(EntityImporter): doi = article_meta.find("article-id", {"pub-id-type": "doi"}) if doi: - doi = doi.string.lower().strip() + doi = clean_doi(doi.string.lower()) + else: + doi = None jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"}) if jstor_id: -- cgit v1.2.3 From 23fd36a3e8505c1ed6d13367a3fb62a8bf2242d7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 18:14:58 -0800 Subject: add notes about 'double slash in DOI' issue --- notes/cleanups/double_slash_dois.md | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 notes/cleanups/double_slash_dois.md diff --git a/notes/cleanups/double_slash_dois.md b/notes/cleanups/double_slash_dois.md new file mode 100644 index 00000000..d4e9ded6 --- /dev/null +++ b/notes/cleanups/double_slash_dois.md @@ -0,0 +1,46 @@ + +Relevant github issue: https://github.com/internetarchive/fatcat/issues/48 + + +## Investigate + +At least some of these DOIs actually seem valid, like +`10.1026//1616-1041.3.2.86`. So shouldn't be re-writing them! + + zcat release_extid.tsv.gz \ + | cut -f1,3 \ + | rg '\t10\.\d+//' \ + | wc -l + # 59,904 + + zcat release_extid.tsv.gz \ + | cut -f1,3 \ + | rg '\t10\.\d+//' \ + | pv -l \ + > doubleslash_dois.tsv + +Which prefixes have the most double slashes? + + cat doubleslash_dois.tsv | cut -f2 | cut -d/ -f1 | sort | uniq -c | sort -nr | head + 51220 10.1037 + 2187 10.1026 + 1316 10.1024 + 826 10.1027 + 823 10.14505 + 443 10.17010 + 186 10.46925 + 163 10.37473 + 122 10.18376 + 118 10.29392 + [...] + +All of the 10.1037 DOIs seem to be registered with Crossref, and at least some +have redirects to the not-with-double-slash versions. Not all doi.org lookups +include a redirect. + +I think the "correct thing to do" here is to add special-case handling for the +pubmed and crossref importers, and in any other case allow double slashes. + +Not clear that there are any specific cleanups to be done for now. A broader +"verify that DOIs are actually valid" push and cleanup would make sense; if +that happens checking for mangled double-slash DOIs would make sense. -- cgit v1.2.3 From c133f3077aa975aa4706a8e5ca894fc1b71fbc67 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 10 Aug 2020 17:27:26 -0700 Subject: datacite import: store less subject metadata Many of these 'subject' objects have the equivalent of several lines of text, with complex URLs that don't compress well. I think it is fine we have included these thus far instead of parsing more deeply, but going forward I don't think this nested 'extra' metadata is worth the database space. 
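
As a rough sketch of the intended effect: the `subjects` fragment below is invented for
illustration (only the `subjects` list and the `schemeUri` key come from the importer
code; the other keys are typical of DataCite records), but the filter expression is the
same one introduced in this commit.

    # hypothetical DataCite 'attributes' fragment, for illustration only
    attributes = {
        "subjects": [
            # a plain, human-entered keyword: kept
            {"subject": "Paleoclimatology"},
            # a scheme-heavy entry with long URLs: dropped
            {
                "subject": "FOS: Earth and related environmental sciences",
                "subjectScheme": "Fields of Science and Technology (FOS)",
                "schemeUri": "http://www.oecd.org/science/inno/38235147.pdf",
            },
        ]
    }

    # keep only subjects without a schemeUri, as in the diff below
    extra_subjects = [
        subj for subj in attributes["subjects"] if not subj.get("schemeUri")
    ]
    assert extra_subjects == [{"subject": "Paleoclimatology"}]
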
--- python/fatcat_tools/importers/datacite.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d4d7a9f5..fe02cac4 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -597,7 +597,13 @@ class DataciteImporter(EntityImporter): if license_extra: extra_datacite["license"] = license_extra if attributes.get("subjects"): - extra_datacite["subjects"] = attributes["subjects"] + # these subjects with schemeUri are too much metadata, which + # doesn't compress. filter them out. + extra_subjects = [ + subj for subj in attributes["subjects"] if not subj.get("schemeUri") + ] + if extra_subjects: + extra_datacite["subjects"] = extra_subjects # Include version information. metadata_version = attributes.get("metadataVersion") or "" -- cgit v1.2.3 From ab4e1355bf93e3755985f1b5cd2589a78601d253 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 10 Nov 2021 13:08:23 -0800 Subject: remove cdl_dash_dat and wayback_static importers Cleaning out dead code. These importers were used to create demonstration fileset and webcapture entities early in development. They have been replaced by the fileset and webcapture ingest importers. --- python/fatcat_import.py | 86 ------- python/fatcat_tools/importers/__init__.py | 2 - python/fatcat_tools/importers/cdl_dash_dat.py | 221 ------------------ python/fatcat_tools/importers/wayback_static.py | 287 ------------------------ 4 files changed, 596 deletions(-) delete mode 100755 python/fatcat_tools/importers/cdl_dash_dat.py delete mode 100755 python/fatcat_tools/importers/wayback_static.py diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 39ef200a..33679868 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -42,8 +42,6 @@ from fatcat_tools.importers import ( SavePaperNowWebImporter, ShadowLibraryImporter, SqlitePusher, - auto_cdl_dash_dat, - auto_wayback_static, ) # Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable @@ -315,53 +313,6 @@ def run_shadow_lib(args: argparse.Namespace) -> None: JsonLinePusher(fmi, args.json_file).run() -def run_wayback_static(args: argparse.Namespace) -> None: - api = args.api - - # find the release - if args.release_id: - release_id = args.release_id - elif args.extid: - idtype = args.extid.split(":")[0] - extid = ":".join(args.extid.split(":")[1:]) - if idtype == "doi": - release_id = api.lookup_release(doi=extid).ident - elif idtype == "pmid": - release_id = api.lookup_release(pmid=extid).ident - elif idtype == "wikidata": - release_id = api.lookup_release(wikidata_qid=extid).ident - else: - raise NotImplementedError("extid type: {}".format(idtype)) - else: - raise Exception("need either release_id or extid argument") - - # create it - (editgroup_id, wc) = auto_wayback_static( - api, release_id, args.wayback_url, editgroup_id=args.editgroup_id - ) - if not wc: - return - print("release_id: {}".format(release_id)) - print("editgroup_id: {}".format(editgroup_id)) - print("webcapture id: {}".format(wc.ident)) - print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident)) - - -def run_cdl_dash_dat(args: argparse.Namespace) -> None: - api = args.api - - # create it - (editgroup_id, release, fs) = auto_cdl_dash_dat( - api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id - ) - if not (fs and release): - return - print("release_id: {}".format(release.ident)) - print("editgroup_id: {}".format(editgroup_id)) - print("fileset id: {}".format(fs.ident)) - print("link: https://fatcat.wiki/fileset/{}".format(fs.ident)) - - def run_datacite(args: argparse.Namespace) -> None: dci = DataciteImporter( args.api, @@ -899,43 +850,6 @@ def main() -> None: type=argparse.FileType("r"), ) - sub_wayback_static = subparsers.add_parser( - "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback" - ) - sub_wayback_static.set_defaults( - func=run_wayback_static, - auth_var="FATCAT_API_AUTH_TOKEN", - ) - sub_wayback_static.add_argument( - "wayback_url", type=str, help="URL of wayback capture to extract from" - ) - sub_wayback_static.add_argument( - "--extid", type=str, help="external identifier for release lookup" - ) - sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier") - sub_wayback_static.add_argument( - "--editgroup-id", - type=str, - help="use existing editgroup (instead of creating a new one)", - ) - - sub_cdl_dash_dat = subparsers.add_parser( - "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project" - ) - sub_cdl_dash_dat.set_defaults( - func=run_cdl_dash_dat, - auth_var="FATCAT_API_AUTH_TOKEN", - ) - sub_cdl_dash_dat.add_argument( - "dat_path", type=str, help="local path dat to import (must be the dat discovery key)" - ) - sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier") - sub_cdl_dash_dat.add_argument( - "--editgroup-id", - type=str, - help="use existing editgroup (instead of creating a new one)", - ) - sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata") sub_datacite.add_argument( "json_file", diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 06ecfd58..223ae526 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -13,7 +13,6 @@ To run an import you combine two classes; one each of: from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, 
ArabesqueMatchImporter from .arxiv import ArxivRawImporter -from .cdl_dash_dat import auto_cdl_dash_dat from .chocula import ChoculaImporter from .common import ( LANG_MAP_MARC, @@ -55,4 +54,3 @@ from .matched import MatchedImporter from .orcid import OrcidImporter from .pubmed import PubmedImporter from .shadow import ShadowLibraryImporter -from .wayback_static import auto_wayback_static diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py deleted file mode 100755 index ec557e15..00000000 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python3 - -import hashlib -import json -import mimetypes -import os -import subprocess -import sys -import urllib -import urllib.parse -from typing import Any, Dict, List, Optional, Tuple - -import fatcat_openapi_client -import magic -from fatcat_openapi_client import ( - ApiClient, - Editgroup, - FilesetEntity, - FilesetFile, - ReleaseAbstract, - ReleaseContrib, - ReleaseEntity, - ReleaseExtIds, -) - -from fatcat_tools.normal import clean_doi - -from .common import clean -from .crossref import lookup_license_slug - - -def single_file(prefix: str, path: str) -> FilesetFile: - - full = prefix + path - size_bytes = os.stat(full).st_size - - hashes = [ - hashlib.md5(), - hashlib.sha1(), - hashlib.sha256(), - ] - with open(full, "rb") as fp: - while True: - data = fp.read(2 ** 20) - if not data: - break - for h in hashes: - h.update(data) - mime = magic.Magic(mime=True).from_file(full) - if mime == "application/octet-stream": - # magic apparently isn't that great; try using filename as well - guess = mimetypes.guess_type(full)[0] - if guess: - mime = guess - - fsf = FilesetFile( - path=path, - size=size_bytes, - md5=hashes[0].hexdigest(), - sha1=hashes[1].hexdigest(), - sha256=hashes[2].hexdigest(), - extra=dict(mimetype=mime), - ) - return fsf - - -def make_manifest(base_dir: str) -> List[FilesetFile]: - manifest = [] - for root, dirs, files in os.walk(base_dir): - for f in files: - manifest.append(single_file(root, f)) - return manifest - - -def cdl_dash_release( - meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None -) -> ReleaseEntity: - - if not extra: - extra = dict() - - assert meta["identifier"]["type"] == "DOI" - doi = clean_doi(meta["identifier"]["value"].lower()) - assert doi and doi.startswith("10.") - - ark_id = None - for extid in meta.get("alternativeIdentifiers", []): - if extid["value"].startswith("ark:"): - ark_id = extid["value"] - assert ark_id - - license_slug = lookup_license_slug(meta["rights"]["uri"]) - - abstracts = [] - for desc in meta["descriptions"]: - if desc["type"] == "abstract": - abstracts.append( - ReleaseAbstract(mimetype="text/html", content=clean(desc["value"])) - ) - # print(abstracts) - - contribs = [] - for creator in meta["creator"]: - contribs.append( - ReleaseContrib( - given_name=creator["given"], - surname=creator["family"], - # sorry everybody - raw_name="{} {}".format(creator["given"], creator["family"]), - raw_affiliation=creator.get("affiliation"), - role="author", # presumably, for these datasets? 
- ) - ) - - r = ReleaseEntity( - ext_ids=ReleaseExtIds( - doi=doi, - ark=ark_id, - ), - title=clean(meta["title"], force_xml=True), - publisher=clean(meta["publisher"]), - release_year=int(meta["publicationYear"]), - release_type="dataset", - license_slug=license_slug, - contribs=contribs, - abstracts=abstracts or None, - extra=extra, - ) - return r - - -def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]: - - if dat_path.endswith("/"): - dat_path = dat_path[:-1] - dat_discovery = dat_path - extra = dict() - assert len(dat_discovery) == 64 - - with open(dat_path + "/cdl_dash_metadata.json", "r") as fp: - meta_dict = json.loads(fp.read()) - - release = cdl_dash_release(meta_dict) - ark_id = release.extra["ark_id"] - - dash_version = None - # really crude XML parse-out - with open(dat_path + "/stash-wrapper.xml", "r") as fp: - for line in fp: - line = line.strip() - if line.startswith(""): - dash_version = int(line[19:].split("<")[0]) - assert dash_version is not None - extra["cdl_dash"] = dict(version=dash_version) - release.extra["cdl_dash"] = dict(version=dash_version) - - manifest = make_manifest(dat_path + "/files/") - - bundle_url = dict( - url="https://merritt.cdlib.org/u/{}/{}".format( - urllib.parse.quote(ark_id, safe=""), dash_version - ), - rel="repo-bundle", - ) - repo_url = dict( - url="https://merritt.cdlib.org/d/{}/{}/".format( - urllib.parse.quote(ark_id, safe=""), dash_version - ), - rel="repo", - ) - dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb") - fs = FilesetEntity( - urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra - ) - return (release, fs) - - -def auto_cdl_dash_dat( - api: ApiClient, - dat_path: str, - release_id: Optional[str] = None, - editgroup_id: Optional[str] = None, -) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]: - - git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") - - (release, fileset) = make_release_fileset(dat_path) - - if not editgroup_id: - eg = api.create_editgroup( - Editgroup( - description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", - extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"), - ) - ) - editgroup_id = eg.editgroup_id - - if not release_id and release.ext_ids.doi: - try: - r = api.lookup_release(doi=release.ext_ids.doi) - release_id = r.ident - except fatcat_openapi_client.rest.ApiException: - pass - if not release_id: - edit = api.create_release(eg.editgroup_id, release) - release_id = edit.ident - - release = api.get_release(release_id, expand="filesets") - if len(release.filesets): - print("A fileset already exists for release {}".format(release.ident)) - return (None, None, None) - - fileset.release_ids = [release.ident] - edit = api.create_fileset(eg.editgroup_id, fileset) - fileset = api.get_fileset(edit.ident) - return (editgroup_id, release, fileset) - - -if __name__ == "__main__": - # pass this a discovery key that has been cloned to the local directory - print(make_release_fileset(sys.argv[1])) diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py deleted file mode 100755 index 5caed2c7..00000000 --- a/python/fatcat_tools/importers/wayback_static.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python3 - -""" -Helpers to create Web Capture entities from extracted wayback content. - -Works as a stand-alone script (for debugging) or as library routines. 
-""" - -import argparse -import datetime -import hashlib -import json -import subprocess -import sys -from typing import Any, Dict, List, Optional, Tuple - -import requests -from bs4 import BeautifulSoup -from fatcat_openapi_client import ( - ApiClient, - Editgroup, - EntityEdit, - WebcaptureCdxLine, - WebcaptureEntity, - WebcaptureUrl, -) - -from .common import b32_hex - -CDX_API_BASE = "https://web.archive.org/cdx/search/cdx" -GWB_URL_BASE = "https://web.archive.org/web" -REQ_SESSION = requests.Session() - - -def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]: - """Takes a wayback machine URL, and returns a tuple: - - (timestamp, datetime, original_url) - """ - chunks = url.split("/") - assert len(chunks) >= 6 - assert chunks[2] == "web.archive.org" - assert chunks[3] == "web" - return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) - - -def test_parse_wbm_url() -> None: - u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" - assert parse_wbm_url(u) == ( - "20010712114837", - datetime.datetime(2001, 7, 12, 11, 48, 37), - "http://www.dlib.org/dlib/june01/reich/06reich.html", - ) - - -def parse_wbm_timestamp(timestamp: str) -> datetime.datetime: - """ - Takes a complete WBM timestamp string (like "20020327115625") and returns a - python datetime object (UTC) - """ - # strip any "im_" or "id_" suffix - if timestamp.endswith("_"): - timestamp = timestamp[:-3] - # inflexible; require the full second-precision timestamp - assert len(timestamp) == 14 - return datetime.datetime( - year=int(timestamp[0:4]), - month=int(timestamp[4:6]), - day=int(timestamp[6:8]), - hour=int(timestamp[8:10]), - minute=int(timestamp[10:12]), - second=int(timestamp[12:14]), - ) - - -def test_parse_wbm_timestamp() -> None: - assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) - - -def fetch_wbm(url: str) -> bytes: - resp = REQ_SESSION.get(url) - resp.raise_for_status() - assert resp.content - return resp.content - - -def lookup_cdx( - embed_url: str, verify_hashes: bool = True, cdx_output: Any = None -) -> Optional[WebcaptureCdxLine]: - sys.stderr.write(embed_url + "\n") - assert embed_url.startswith("/web/") - embed_url_segments = embed_url.split("/") - timestamp = embed_url_segments[2] - if timestamp.endswith("_"): - timestamp = timestamp[:-3] - url = "/".join(embed_url_segments[3:]) - # print((timestamp, url)) - params: Dict = dict( - url=url, - closest=timestamp, - sort="closest", - resolveRevisits="true", - matchType="exact", - limit=1, - ) - resp = REQ_SESSION.get( - CDX_API_BASE, - params=params, - ) - resp.raise_for_status() - # print(resp.url) - if resp.content: - hit = resp.content.decode("utf-8").split("\n")[0] - if cdx_output: - cdx_output.write(hit + "\n") - cdx_chunks = hit.split(" ") - cdx = [x if (x and x != "-") else None for x in cdx_chunks] - webcapture_cdx = WebcaptureCdxLine( - surt=cdx[0], - timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z", - url=cdx[2], - mimetype=cdx[3], - status_code=int(cdx[4] or ""), - sha1=b32_hex(cdx[5] or ""), - sha256=None, - ) - if verify_hashes: - resp = REQ_SESSION.get( - GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp - ) - resp.raise_for_status() - assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() - webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() - webcapture_cdx.size = len(resp.content) - return webcapture_cdx - else: - return None - - -def 
wayback_url_to_relative(url: str) -> Optional[str]: - """ - Wayback URLs can be relative or absolute in rewritten documents. This - function converts any form of rewritten URL to a relative (to - web.archive.org) one, or returns None if it isn't a rewritten URL at all. - """ - if url.startswith("https://web.archive.org/"): - url = url[23:] - elif url.startswith("http://web.archive.org/"): - url = url[22:] - - if url.startswith("/web/"): - return url - else: - return None - - -def extract_embeds(soup: BeautifulSoup) -> List[str]: - - embeds = set() - - # - for tag in soup.find_all("link", href=True): - if tag["rel"] not in ("stylesheet",): - continue - url = wayback_url_to_relative(tag["href"]) - if url: - embeds.add(url) - # - for tag in soup.find_all("img", src=True): - url = wayback_url_to_relative(tag["src"]) - if url: - embeds.add(url) - - #