From ab4e1355bf93e3755985f1b5cd2589a78601d253 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 10 Nov 2021 13:08:23 -0800
Subject: remove cdl_dash_dat and wayback_static importers

Cleaning out dead code.

These importers were used to create demonstration fileset and webcapture
entities early in development. They have been replaced by the fileset
and webcapture ingest importers.
---
 python/fatcat_tools/importers/__init__.py | 2 --
 1 file changed, 2 deletions(-)

(limited to 'python/fatcat_tools/importers/__init__.py')

diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 06ecfd58..223ae526 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -13,7 +13,6 @@ To run an import you combine two classes; one each of:
 
 from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
 from .arxiv import ArxivRawImporter
-from .cdl_dash_dat import auto_cdl_dash_dat
 from .chocula import ChoculaImporter
 from .common import (
     LANG_MAP_MARC,
@@ -55,4 +54,3 @@ from .matched import MatchedImporter
 from .orcid import OrcidImporter
 from .pubmed import PubmedImporter
 from .shadow import ShadowLibraryImporter
-from .wayback_static import auto_wayback_static
-- 
cgit v1.2.3


From 16e9979a6f347b49764c1141209e84083ea81057 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 10 Nov 2021 13:23:12 -0800
Subject: importers: refactor imports of clean() and other normalization
 helpers

---
 python/fatcat_tools/importers/__init__.py         |  3 --
 python/fatcat_tools/importers/arabesque.py        |  4 +-
 python/fatcat_tools/importers/chocula.py          |  8 ++--
 python/fatcat_tools/importers/common.py           |  5 +-
 python/fatcat_tools/importers/crossref.py         | 56 +++++++++++------------
 python/fatcat_tools/importers/datacite.py         | 24 +++++-----
 python/fatcat_tools/importers/grobid_metadata.py  | 30 ++++++------
 python/fatcat_tools/importers/jalc.py             | 22 ++++-----
 python/fatcat_tools/importers/journal_metadata.py |  8 ++--
 python/fatcat_tools/importers/jstor.py            | 14 +++---
 python/fatcat_tools/importers/orcid.py            | 10 ++--
 python/fatcat_tools/importers/pubmed.py           | 15 ++++--
 12 files changed, 104 insertions(+), 95 deletions(-)

(limited to 'python/fatcat_tools/importers/__init__.py')

diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 223ae526..4d4d696b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -15,7 +15,6 @@ from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
 from .arxiv import ArxivRawImporter
 from .chocula import ChoculaImporter
 from .common import (
-    LANG_MAP_MARC,
     Bs4XmlFileListPusher,
     Bs4XmlFilePusher,
     Bs4XmlLargeFilePusher,
@@ -27,8 +26,6 @@ from .common import (
     KafkaJsonPusher,
     LinePusher,
     SqlitePusher,
-    clean,
-    is_cjk,
     make_kafka_consumer,
 )
 from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index b4a4d9ed..92289bb3 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, FileEntity
 
-from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
+from fatcat_tools.normal import b32_hex
+
+from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, make_rel_url
 
 ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
 
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 842c7853..c44fec3b 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ContainerEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
 
 
 class ChoculaImporter(EntityImporter):
@@ -40,7 +42,7 @@ class ChoculaImporter(EntityImporter):
         returns a ContainerEntity (or None if invalid or couldn't parse)
         """
 
-        name = clean(row.get("name"))
+        name = clean_str(row.get("name"))
         if not name:
             # Name is required (by schema)
             return None
@@ -85,7 +87,7 @@ class ChoculaImporter(EntityImporter):
             ident=row["ident"],
             name=name,
             container_type=container_type,
-            publisher=clean(row.get("publisher")),
+            publisher=clean_str(row.get("publisher")),
             wikidata_qid=row.get("wikidata_qid"),
             extra=extra,
         )
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 425b6f13..56c3d32e 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,10 +27,7 @@ from fatcat_openapi_client import (
 from fatcat_openapi_client.rest import ApiException
 from fuzzycat.matching import match_release_fuzzy
 
-# TODO: refactor so remove need for this (re-imports for backwards compatibility)
-from fatcat_tools.normal import is_cjk  # noqa: F401
-from fatcat_tools.normal import LANG_MAP_MARC, b32_hex, clean_doi  # noqa: F401
-from fatcat_tools.normal import clean_str as clean  # noqa: F401
+from fatcat_tools.normal import clean_doi
 from fatcat_tools.transforms import entity_to_dict
 
 DATE_FMT: str = "%Y-%m-%d"
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index c9f251fc..8f5a4265 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,9 +4,9 @@ from typing import Any, Dict, List, Optional, Sequence
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
 
-from .common import EntityImporter, clean
+from .common import EntityImporter
 
 # The docs/guide should be the canonical home for these mappings; update there
 # first
@@ -232,21 +232,21 @@ class CrossrefImporter(EntityImporter):
                     if len(affiliation_list) > 1:
                         # note: affiliation => more_affiliations
                         extra["more_affiliations"] = [
-                            clean(a["name"]) for a in affiliation_list[1:]
+                            clean_str(a["name"]) for a in affiliation_list[1:]
                         ]
                 if am.get("sequence") and am.get("sequence") != "additional":
-                    extra["seq"] = clean(am.get("sequence"))
+                    extra["seq"] = clean_str(am.get("sequence"))
                 assert ctype in ("author", "editor", "translator")
-                raw_name = clean(raw_name)
+                raw_name = clean_str(raw_name)
                 # TODO: what if 'raw_name' is None?
                 contribs.append(
                     ReleaseContrib(
                         creator_id=creator_id,
                         index=index,
                         raw_name=raw_name,
-                        given_name=clean(am.get("given")),
-                        surname=clean(am.get("family")),
-                        raw_affiliation=clean(raw_affiliation),
+                        given_name=clean_str(am.get("given")),
+                        surname=clean_str(am.get("family")),
+                        raw_affiliation=clean_str(raw_affiliation),
                         role=ctype,
                         extra=extra or None,
                     )
@@ -263,11 +263,11 @@ class CrossrefImporter(EntityImporter):
         container_id = None
         if issnl:
             container_id = self.lookup_issnl(issnl)
-        publisher = clean(obj.get("publisher"))
+        publisher = clean_str(obj.get("publisher"))
 
         container_name = obj.get("container-title")
         if container_name:
-            container_name = clean(container_name[0], force_xml=True)
+            container_name = clean_str(container_name[0], force_xml=True)
         if not container_name:
             container_name = None
         if (
@@ -323,7 +323,7 @@ class CrossrefImporter(EntityImporter):
                 ref_extra["journal-title"] = rm["journal-title"]
             if rm.get("DOI"):
                 ref_extra["doi"] = rm.get("DOI").lower()
-            author = clean(rm.get("author"))
+            author = clean_str(rm.get("author"))
             if author:
                 ref_extra["authors"] = [author]
             for k in (
@@ -347,8 +347,8 @@ class CrossrefImporter(EntityImporter):
                 "series-title",
                 "volume-title",
             ):
-                if clean(rm.get(k)):
-                    ref_extra[k] = clean(rm[k])
+                if clean_str(rm.get(k)):
+                    ref_extra[k] = clean_str(rm[k])
             refs.append(
                 fatcat_openapi_client.ReleaseRef(
                     index=i,
@@ -356,9 +356,9 @@ class CrossrefImporter(EntityImporter):
                     target_release_id=None,
                     key=key,
                     year=year,
-                    container_name=clean(ref_container_name),
-                    title=clean(rm.get("article-title")),
-                    locator=clean(rm.get("first-page")),
+                    container_name=clean_str(ref_container_name),
+                    title=clean_str(rm.get("article-title")),
+                    locator=clean_str(rm.get("first-page")),
                     # TODO: just dump JSON somewhere here?
                     extra=ref_extra or None,
                 )
@@ -366,7 +366,7 @@ class CrossrefImporter(EntityImporter):
 
         # abstracts
         abstracts = []
-        abstract = clean(obj.get("abstract"))
+        abstract = clean_str(obj.get("abstract"))
         if abstract and len(abstract) > 10:
             abstracts.append(
                 fatcat_openapi_client.ReleaseAbstract(
@@ -387,9 +387,9 @@ class CrossrefImporter(EntityImporter):
                 if type(val) == list:
                     val = val[0]
                 if type(val) == str:
-                    val = clean(val)
+                    val = clean_str(val)
                     if val:
-                        extra[key] = clean(val)
+                        extra[key] = clean_str(val)
                 else:
                     extra[key] = val
         # crossref-nested extra keys
@@ -397,14 +397,14 @@ class CrossrefImporter(EntityImporter):
             val = obj.get(key)
             if val:
                 if type(val) == str:
-                    extra_crossref[key] = clean(val)
+                    extra_crossref[key] = clean_str(val)
                 else:
                     extra_crossref[key] = val
         if license_extra:
             extra_crossref["license"] = license_extra
 
         if len(obj["title"]) > 1:
-            aliases = [clean(t) for t in obj["title"][1:]]
+            aliases = [clean_str(t) for t in obj["title"][1:]]
             aliases = [t for t in aliases if t]
             if aliases:
                 extra["aliases"] = aliases
@@ -459,11 +459,11 @@ class CrossrefImporter(EntityImporter):
         if obj.get("original-title"):
             ot = obj.get("original-title")
             if ot is not None:
-                original_title = clean(ot[0], force_xml=True)
+                original_title = clean_str(ot[0], force_xml=True)
 
         title: Optional[str] = None
         if obj.get("title"):
-            title = clean(obj["title"][0], force_xml=True)
+            title = clean_str(obj["title"][0], force_xml=True)
             if not title or len(title) <= 1:
                 # title can't be just a single character
                 self.counts["skip-blank-title"] += 1
@@ -476,7 +476,7 @@ class CrossrefImporter(EntityImporter):
 
         subtitle = None
         if obj.get("subtitle"):
-            subtitle = clean(obj["subtitle"][0], force_xml=True)
+            subtitle = clean_str(obj["subtitle"][0], force_xml=True)
             if not subtitle or len(subtitle) <= 1:
                 # subtitle can't be just a single character
                 subtitle = None
@@ -499,10 +499,10 @@ class CrossrefImporter(EntityImporter):
                 doi=doi,
                 isbn13=isbn13,
             ),
-            volume=clean(obj.get("volume")),
-            issue=clean(obj.get("issue")),
-            pages=clean(obj.get("page")),
-            language=clean(obj.get("language")),
+            volume=clean_str(obj.get("volume")),
+            issue=clean_str(obj.get("issue")),
+            pages=clean_str(obj.get("page")),
+            language=clean_str(obj.get("language")),
             license_slug=license_slug,
             extra=extra or None,
             abstracts=abstracts or None,
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fe02cac4..441514b8 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -21,10 +21,10 @@ import langdetect
 import pycountry
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
 from fatcat_tools.transforms import entity_to_dict
 
-from .common import EntityImporter, clean
+from .common import EntityImporter
 
 # Cutoff length for abstracts.
 MAX_ABSTRACT_LENGTH = 2048
@@ -322,7 +322,7 @@ class DataciteImporter(EntityImporter):
             print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
             return False
 
-        title = clean(title)
+        title = clean_str(title)
         if not title:
             print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
             return False
@@ -341,7 +341,7 @@ class DataciteImporter(EntityImporter):
         if not subtitle:
             subtitle = None
         else:
-            subtitle = clean(subtitle)
+            subtitle = clean_str(subtitle)
 
         # Dates. A few internal dates (registered, created, updated) and
         # published (0..2554). We try to work with typed date list, in
@@ -399,7 +399,7 @@ class DataciteImporter(EntityImporter):
             publisher = None
 
         if publisher:
-            publisher = clean(publisher)
+            publisher = clean_str(publisher)
 
         # Container. For the moment, only ISSN as container.
         container_id = None
@@ -460,10 +460,10 @@ class DataciteImporter(EntityImporter):
         issue = container.get("issue")
 
         if volume:
-            volume = clean(volume)
+            volume = clean_str(volume)
 
         if issue:
-            issue = clean(issue)
+            issue = clean_str(issue)
 
         # Pages.
         pages = None
@@ -548,7 +548,7 @@ class DataciteImporter(EntityImporter):
                     "[{}] language detection failed with {} on {}".format(doi, err, text),
                     file=sys.stderr,
                 )
-            abstract_text = clean(text)
+            abstract_text = clean_str(text)
             if not abstract_text:
                 continue
             abstracts.append(
@@ -874,14 +874,14 @@ class DataciteImporter(EntityImporter):
                 if len(affiliations) == 0:
                     raw_affiliation = None
                 else:
-                    raw_affiliation = clean(affiliations[0])
+                    raw_affiliation = clean_str(affiliations[0])
 
                 name = c.get("name")
                 given_name = c.get("givenName")
                 surname = c.get("familyName")
 
                 if name:
-                    name = clean(name)
+                    name = clean_str(name)
                 if not any((name, given_name, surname)):
                     continue
                 if not name:
@@ -895,8 +895,8 @@ class DataciteImporter(EntityImporter):
                     name = index_form_to_display_name(name)
 
                 if given_name:
-                    given_name = clean(given_name)
-                surname = clean(surname)
+                    given_name = clean_str(given_name)
+                surname = clean_str(surname)
 
                 # Perform a final assertion that name does not reduce to zero
                 # (e.g. whitespace only name).
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 7c595787..9db499a0 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
 
-from .common import EntityImporter, clean, make_rel_url
+from .common import EntityImporter, make_rel_url
 
 MAX_ABSTRACT_BYTES = 4096
 
@@ -86,7 +86,7 @@ class GrobidMetadataImporter(EntityImporter):
         abstract = obj.get("abstract")
         if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
             abobj = fatcat_openapi_client.ReleaseAbstract(
-                mimetype="text/plain", content=clean(obj.get("abstract"))
+                mimetype="text/plain", content=clean_str(obj.get("abstract"))
             )
             abstracts = [abobj]
         else:
@@ -97,9 +97,9 @@ class GrobidMetadataImporter(EntityImporter):
             contribs.append(
                 fatcat_openapi_client.ReleaseContrib(
                     index=i,
-                    raw_name=clean(a["name"]),
-                    given_name=clean(a.get("given_name")),
-                    surname=clean(a.get("surname")),
+                    raw_name=clean_str(a["name"]),
+                    given_name=clean_str(a.get("given_name")),
+                    surname=clean_str(a.get("surname")),
                     role="author",
                     extra=None,
                 )
@@ -116,15 +116,15 @@ class GrobidMetadataImporter(EntityImporter):
                     pass
             for key in ("volume", "url", "issue", "publisher"):
                 if raw.get(key):
-                    cite_extra[key] = clean(raw[key])
+                    cite_extra[key] = clean_str(raw[key])
             if raw.get("authors"):
-                cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
+                cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]]
 
             refs.append(
                 fatcat_openapi_client.ReleaseRef(
-                    key=clean(raw.get("id")),
+                    key=clean_str(raw.get("id")),
                     year=year,
-                    title=clean(raw["title"]),
+                    title=clean_str(raw["title"]),
                     extra=cite_extra or None,
                 )
             )
@@ -140,7 +140,7 @@ class GrobidMetadataImporter(EntityImporter):
         if doi:
             extra["doi"] = doi
         if obj["journal"] and obj["journal"].get("name"):
-            extra["container_name"] = clean(obj["journal"]["name"])
+            extra["container_name"] = clean_str(obj["journal"]["name"])
 
         # TODO: ISSN/eISSN handling? or just journal name lookup?
 
@@ -149,7 +149,7 @@ class GrobidMetadataImporter(EntityImporter):
         if self.longtail_oa:
             extra["longtail_oa"] = True
 
-        clean_title = clean(obj["title"], force_xml=True)
+        clean_title = clean_str(obj["title"], force_xml=True)
         if not clean_title or len(clean_title) < 2:
             return None
         title = clean_title
@@ -161,9 +161,9 @@ class GrobidMetadataImporter(EntityImporter):
             release_year=release_year,
             contribs=contribs,
             refs=refs,
-            publisher=clean(obj["journal"].get("publisher")),
-            volume=clean(obj["journal"].get("volume")),
-            issue=clean(obj["journal"].get("issue")),
+            publisher=clean_str(obj["journal"].get("publisher")),
+            volume=clean_str(obj["journal"].get("volume")),
+            issue=clean_str(obj["journal"].get("issue")),
             abstracts=abstracts or None,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(),
             extra=extra or None,
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index a737ac9f..9916a55f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -6,9 +6,9 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str, is_cjk
 
-from .common import DATE_FMT, EntityImporter, clean, is_cjk
+from .common import DATE_FMT, EntityImporter
 
 
 # TODO: should be List[Tag] not List[Any] for full type annotations
@@ -36,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
     for raw in raw_persons:
         name = raw.find("name") or None
         if name:
-            name = clean(name.get_text().replace("\n", " "))
+            name = clean_str(name.get_text().replace("\n", " "))
         surname = raw.find("familyName") or None
         if surname:
-            surname = clean(surname.get_text().replace("\n", " "))
+            surname = clean_str(surname.get_text().replace("\n", " "))
         given_name = raw.find("givenName") or None
         if given_name:
-            given_name = clean(given_name.get_text().replace("\n", " "))
+            given_name = clean_str(given_name.get_text().replace("\n", " "))
         lang = "en"
         if is_cjk(name):
             lang = "ja"
@@ -230,16 +230,16 @@ class JalcImporter(EntityImporter):
                 for p in record.find_all("publicationName")
                 if p.get_text()
             ]
-            pubs = [clean(p) for p in pubs if p]
+            pubs = [clean_str(p) for p in pubs if p]
             assert pubs
             if len(pubs) > 1 and pubs[0] == pubs[1]:
                 pubs = [pubs[0]]
             if len(pubs) > 1 and is_cjk(pubs[0]):
                 # eng/jpn ordering is not reliable
                 pubs = [pubs[1], pubs[0]]
-            container_name = clean(pubs[0])
+            container_name = clean_str(pubs[0])
             if len(pubs) > 1:
-                container_extra["original_name"] = clean(pubs[1])
+                container_extra["original_name"] = clean_str(pubs[1])
 
         if record.publisher:
             pubs = [
@@ -254,7 +254,7 @@ class JalcImporter(EntityImporter):
                 # ordering is not reliable
                 pubs = [pubs[1], pubs[0]]
             if pubs:
-                publisher = clean(pubs[0])
+                publisher = clean_str(pubs[0])
                 if len(pubs) > 1:
                     container_extra["publisher_aliases"] = pubs[1:]
 
@@ -296,14 +296,14 @@ class JalcImporter(EntityImporter):
         # (informally)
         extra["jalc"] = extra_jalc
 
-        title = clean(title)
+        title = clean_str(title)
         if not title:
             return None
 
         re = ReleaseEntity(
             work_id=None,
             title=title,
-            original_title=clean(original_title),
+            original_title=clean_str(original_title),
             release_type=release_type,
             release_stage="published",
             release_date=release_date,
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index a45e49f3..fc1dfcbd 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ContainerEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
 
 
 def or_none(s: Optional[str]) -> Optional[str]:
@@ -105,7 +107,7 @@ class JournalMetadataImporter(EntityImporter):
         if extra_ia:
             extra["ia"] = extra_ia
 
-        name = clean(row.get("name"))
+        name = clean_str(row.get("name"))
         if not name:
             return None
 
@@ -115,7 +117,7 @@ class JournalMetadataImporter(EntityImporter):
             issnp=row.get("issnp"),
             container_type=None,  # TODO
             name=name,
-            publisher=clean(row.get("publisher")),
+            publisher=clean_str(row.get("publisher")),
             wikidata_qid=None,  # TODO
             extra=extra,
         )
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index ca1f2466..c2f650b0 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,9 +8,9 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import LANG_MAP_MARC, clean_doi, clean_str
 
-from .common import LANG_MAP_MARC, EntityImporter, clean
+from .common import EntityImporter
 from .crossref import CONTAINER_TYPE_MAP
 
 # TODO: more entries?
@@ -140,7 +140,7 @@ class JstorImporter(EntityImporter):
                 issnl=issnl,
                 publisher=publisher,
                 container_type=self.map_container_type(release_type),
-                name=clean(journal_title, force_xml=True),
+                name=clean_str(journal_title, force_xml=True),
             )
             ce_edit = self.create_container(ce)
             container_id = ce_edit.ident
@@ -166,13 +166,13 @@ class JstorImporter(EntityImporter):
             for c in cgroup.find_all("contrib"):
                 given = c.find("given-names")
                 if given:
-                    given = clean(given.get_text().replace("\n", " "))
+                    given = clean_str(given.get_text().replace("\n", " "))
                 surname = c.find("surname")
                 if surname:
-                    surname = clean(surname.get_text().replace("\n", " "))
+                    surname = clean_str(surname.get_text().replace("\n", " "))
                 raw_name = c.find("string-name")
                 if raw_name:
-                    raw_name = clean(raw_name.get_text().replace("\n", " "))
+                    raw_name = clean_str(raw_name.get_text().replace("\n", " "))
 
                 if not raw_name:
                     if given and surname:
@@ -234,7 +234,7 @@ class JstorImporter(EntityImporter):
 
         # JSTOR issue-id
         if article_meta.find("issue-id"):
-            issue_id = clean(article_meta.find("issue-id").string)
+            issue_id = clean_str(article_meta.find("issue-id").string)
             if issue_id:
                 extra_jstor["issue_id"] = issue_id
 
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 430cdd0f..f3d82a86 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -4,7 +4,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, CreatorEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
 
 
 def value_or_none(e: Any) -> Any:
@@ -65,14 +67,14 @@ class OrcidImporter(EntityImporter):
         if not self.is_orcid(orcid):
             sys.stderr.write("Bad ORCID: {}\n".format(orcid))
             return None
-        display = clean(display)
+        display = clean_str(display)
         if not display:
             # must have *some* name
             return None
         ce = CreatorEntity(
             orcid=orcid,
-            given_name=clean(given),
-            surname=clean(sur),
+            given_name=clean_str(given),
+            surname=clean_str(sur),
             display_name=display,
             extra=extra,
         )
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index d32fcefa..3274234f 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,9 +8,16 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid
+from fatcat_tools.normal import (
+    LANG_MAP_MARC,
+    clean_doi,
+    clean_issn,
+    clean_pmcid,
+    clean_pmid,
+    clean_str,
+)
 
-from .common import LANG_MAP_MARC, EntityImporter, clean
+from .common import EntityImporter
 
 # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
 PUBMED_RELEASE_TYPE_MAP = {
@@ -704,14 +711,14 @@ class PubmedImporter(EntityImporter):
         if extra_pubmed:
             extra["pubmed"] = extra_pubmed
 
-        title = clean(title)
+        title = clean_str(title)
         if not title:
             return None
 
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
             title=title,
-            original_title=clean(original_title),
+            original_title=clean_str(original_title),
             release_type=release_type,
             release_stage=release_stage,
             release_date=release_date,
-- 
cgit v1.2.3


From ddc757bc1d5c610f42e9f5f10a4f060f517b66ca Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 10 Nov 2021 13:52:39 -0800
Subject: refactor importer metadata tables into separate file; move some
 helpers around

- MAX_ABSTRACT_LENGTH set in a single place (importer common)
- merge datacite license slug table into common table, removing some
  TDM-specific licenses (which do not apply in the context of preserving
  the full work)
---
 python/fatcat_tools/biblio_lookup_tables.py      | 623 +++++++++++++++++++++++
 python/fatcat_tools/importers/__init__.py        |   3 +-
 python/fatcat_tools/importers/common.py          |  61 +--
 python/fatcat_tools/importers/crossref.py        |  94 +---
 python/fatcat_tools/importers/datacite.py        | 155 +-----
 python/fatcat_tools/importers/doaj_article.py    |   5 +-
 python/fatcat_tools/importers/grobid_metadata.py |   6 +-
 python/fatcat_tools/importers/jstor.py           |   3 +-
 python/fatcat_tools/importers/pubmed.py          | 319 +-----------
 python/fatcat_tools/normal.py                    | 115 ++---
 10 files changed, 682 insertions(+), 702 deletions(-)
 create mode 100644 python/fatcat_tools/biblio_lookup_tables.py

(limited to 'python/fatcat_tools/importers/__init__.py')

diff --git a/python/fatcat_tools/biblio_lookup_tables.py b/python/fatcat_tools/biblio_lookup_tables.py
new file mode 100644
index 00000000..a9a097ae
--- /dev/null
+++ b/python/fatcat_tools/biblio_lookup_tables.py
@@ -0,0 +1,623 @@
+"""
+This file contains lookup tables and other static data structures used in
+bibliographic metadata munging.
+"""
+
+from typing import Dict, Optional
+
+# These are very close to, but maybe not exactly 1-to-1 with, ISO 639-2 codes
+# (possibly some mix of the ISO 639-2/T and ISO 639-2/B variants).
+# PubMed/MEDLINE and JSTOR use these MARC codes
+# https://www.loc.gov/marc/languages/language_name.html
+LANG_MAP_MARC: Dict[str, Optional[str]] = {
+    "afr": "af",
+    "alb": "sq",
+    "amh": "am",
+    "ara": "ar",
+    "arm": "hy",
+    "aze": "az",
+    "ben": "bn",
+    "bos": "bs",
+    "bul": "bg",
+    "cat": "ca",
+    "chi": "zh",
+    "cze": "cs",
+    "dan": "da",
+    "dut": "nl",
+    "eng": "en",
+    "epo": "eo",
+    "est": "et",
+    "fin": "fi",
+    "fre": "fr",
+    "geo": "ka",
+    "ger": "de",
+    "gla": "gd",
+    "gre": "el",
+    "heb": "he",
+    "hin": "hi",
+    "hrv": "hr",
+    "hun": "hu",
+    "ice": "is",
+    "ind": "id",
+    "ita": "it",
+    "jpn": "ja",
+    "kin": "rw",
+    "kor": "ko",
+    "lat": "la",
+    "lav": "lv",
+    "lit": "lt",
+    "mac": "mk",
+    "mal": "ml",
+    "mao": "mi",
+    "may": "ms",
+    "nor": "no",
+    "per": "fa",
+    "per": "fa",
+    "pol": "pl",
+    "por": "pt",
+    "pus": "ps",
+    "rum": "ro",
+    "rus": "ru",
+    "san": "sa",
+    "slo": "sk",
+    "slv": "sl",
+    "spa": "es",
+    "srp": "sr",
+    "swe": "sv",
+    "tha": "th",
+    "tur": "tr",
+    "ukr": "uk",
+    "urd": "ur",
+    "vie": "vi",
+    "wel": "cy",
+    # additions
+    "gle": "ga",  # "Irish" (Gaelic)
+    "jav": "jv",  # Javanese
+    "welsh": "cy",  # Welsh
+    "oci": "oc",  # Occitan
+    # Don't have ISO 639-1 codes
+    "grc": "el",  # Ancient Greek; map to modern greek
+    "map": None,  # Austronesian (collection)
+    "syr": None,  # Syriac, Modern
+    "gem": None,  # Old Saxon
+    "non": None,  # Old Norse
+    "emg": None,  # Eastern Meohang
+    "neg": None,  # Negidal
+    "mul": None,  # Multiple languages
+    "und": None,  # Undetermined
+}
+
+# these are mappings from web domains to URL 'rel' for things like file entity
+# URL notation
+DOMAIN_REL_MAP: Dict[str, str] = {
+    "archive.org": "archive",
+    # LOCKSS, Portico, DuraSpace, etc would also be "archive"
+    "arxiv.org": "repository",
+    "babel.hathitrust.org": "repository",
+    "cds.cern.ch": "repository",
+    "deepblue.lib.umich.edu": "repository",
+    "europepmc.org": "repository",
+    "hal.inria.fr": "repository",
+    "scielo.isciii.es": "repository",
+    "www.dtic.mil": "repository",
+    "www.jstage.jst.go.jp": "repository",
+    "www.jstor.org": "repository",
+    "www.ncbi.nlm.nih.gov": "repository",
+    "ftp.ncbi.nlm.nih.gov": "repository",
+    "www.scielo.br": "repository",
+    "www.scielo.cl": "repository",
+    "www.scielo.org.mx": "repository",
+    "zenodo.org": "repository",
+    "www.biorxiv.org": "repository",
+    "www.medrxiv.org": "repository",
+    "citeseerx.ist.psu.edu": "aggregator",
+    "publisher-connector.core.ac.uk": "aggregator",
+    "core.ac.uk": "aggregator",
+    "static.aminer.org": "aggregator",
+    "aminer.org": "aggregator",
+    "pdfs.semanticscholar.org": "aggregator",
+    "semanticscholar.org": "aggregator",
+    "www.semanticscholar.org": "aggregator",
+    "academic.oup.com": "publisher",
+    "cdn.elifesciences.org": "publisher",
+    "cell.com": "publisher",
+    "dl.acm.org": "publisher",
+    "downloads.hindawi.com": "publisher",
+    "elifesciences.org": "publisher",
+    "iopscience.iop.org": "publisher",
+    "journals.plos.org": "publisher",
+    "link.springer.com": "publisher",
+    "onlinelibrary.wiley.com": "publisher",
+    "works.bepress.com": "publisher",
+    "www.biomedcentral.com": "publisher",
+    "www.cell.com": "publisher",
+    "www.nature.com": "publisher",
+    "www.pnas.org": "publisher",
+    "www.tandfonline.com": "publisher",
+    "www.frontiersin.org": "publisher",
+    "www.degruyter.com": "publisher",
+    "www.mdpi.com": "publisher",
+    "www.ahajournals.org": "publisher",
+    "ehp.niehs.nih.gov": "publisher",
+    "journals.tsu.ru": "publisher",
+    "www.cogentoa.com": "publisher",
+    "www.researchgate.net": "academicsocial",
+    "academia.edu": "academicsocial",
+    "wayback.archive-it.org": "webarchive",
+    "web.archive.org": "webarchive",
+    "archive.is": "webarchive",
+}
+
+# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
+PUBMED_RELEASE_TYPE_MAP = {
+    # Adaptive Clinical Trial
+    "Address": "speech",
+    "Autobiography": "book",
+    # Bibliography
+    "Biography": "book",
+    # Case Reports
+    "Classical Article": "article-journal",
+    # Clinical Conference
+    # Clinical Study
+    # Clinical Trial
+    # Clinical Trial, Phase I
+    # Clinical Trial, Phase II
+    # Clinical Trial, Phase III
+    # Clinical Trial, Phase IV
+    # Clinical Trial Protocol
+    # Clinical Trial, Veterinary
+    # Collected Works
+    # Comparative Study
+    # Congress
+    # Consensus Development Conference
+    # Consensus Development Conference, NIH
+    # Controlled Clinical Trial
+    "Dataset": "dataset",
+    # Dictionary
+    # Directory
+    # Duplicate Publication
+    "Editorial": "editorial",
+    # English Abstract   # doesn't indicate that this is abstract-only
+    # Equivalence Trial
+    # Evaluation Studies
+    # Expression of Concern
+    # Festschrift
+    # Government Document
+    # Guideline
+    "Historical Article": "article-journal",
+    # Interactive Tutorial
+    "Interview": "interview",
+    "Introductory Journal Article": "article-journal",
+    "Journal Article": "article-journal",
+    "Lecture": "speech",
+    "Legal Case": "legal_case",
+    "Legislation": "legislation",
+    "Letter": "letter",
+    # Meta-Analysis
+    # Multicenter Study
+    # News
+    "Newspaper Article": "article-newspaper",
+    # Observational Study
+    # Observational Study, Veterinary
+    # Overall
+    # Patient Education Handout
+    # Periodical Index
+    # Personal Narrative
+    # Portrait
+    # Practice Guideline
+    # Pragmatic Clinical Trial
+    # Publication Components
+    # Publication Formats
+    # Publication Type Category
+    # Randomized Controlled Trial
+    # Research Support, American Recovery and Reinvestment Act
+    # Research Support, N.I.H., Extramural
+    # Research Support, N.I.H., Intramural
+    # Research Support, Non-U.S. Gov't / Research Support, U.S. Gov't, Non-P.H.S.
+    # Research Support, U.S. Gov't, P.H.S.
+    # Review     # in the "literature review" sense, not "product review"
+    # Scientific Integrity Review
+    # Study Characteristics
+    # Support of Research
+    # Systematic Review
+    "Technical Report": "report",
+    # Twin Study
+    # Validation Studies
+    # Video-Audio Media
+    # Webcasts
+}
+
+MONTH_ABBR_MAP: Dict[str, int] = {
+    "Jan": 1,
+    "01": 1,
+    "Feb": 2,
+    "02": 2,
+    "Mar": 3,
+    "03": 3,
+    "Apr": 4,
+    "04": 4,
+    "May": 5,
+    "05": 5,
+    "Jun": 6,
+    "06": 6,
+    "Jul": 7,
+    "07": 7,
+    "Aug": 8,
+    "08": 8,
+    "Sep": 9,
+    "09": 9,
+    "Oct": 10,
+    "10": 10,
+    "Nov": 11,
+    "11": 11,
+    "Dec": 12,
+    "12": 12,
+}
+
+# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
+COUNTRY_NAME_MAP: Dict[str, str] = {
+    "Afghanistan": "af",
+    "Albania": "al",
+    "Algeria": "dz",
+    "Andorra": "ad",
+    "Angola": "ao",
+    "Antigua and Barbuda": "ag",
+    "Argentina": "ar",
+    "Armenia": "am",
+    "Australia": "au",
+    "Austria": "at",
+    "Azerbaijan": "az",
+    "Bahamas": "bs",
+    "Bahrain": "bh",
+    "Bangladesh": "bd",
+    "Barbados": "bb",
+    "Belarus": "by",
+    "Belgium": "be",
+    "Belize": "bz",
+    "Benin": "bj",
+    "Bhutan": "bt",
+    "Bolivia": "bo",
+    "Bosnia and Herzegowina": "ba",
+    "Botswana": "bw",
+    "Brazil": "br",
+    "Brunei Darussalam": "bn",
+    "Bulgaria": "bg",
+    "Burkina Faso": "bf",
+    "Burundi": "bi",
+    "Cambodia": "kh",
+    "Cameroon": "cm",
+    "Canada": "ca",
+    "Cape Verde": "cv",
+    "Central African Republic": "cf",
+    "Chad": "td",
+    "Chile": "cl",
+    "China": "cn",
+    "Colombia": "co",
+    "Comoros": "km",
+    "Congo, Democratic Republic": "cd",
+    "Congo, People’s Republic": "cg",
+    "Costa Rica": "cr",
+    "Cote d'Ivoire": "ci",
+    "Croatia (Local Name: Hrvatska)": "hr",
+    "Cuba": "cu",
+    "Cyprus": "cy",
+    "Czech Republic": "cz",
+    "Denmark": "dk",
+    "Djibouti": "dj",
+    "Dominica": "dm",
+    "Dominican Republic": "do",
+    "East Timor": "tl",
+    "Ecuador": "ec",
+    "El Salvador": "sv",
+    "Equatorial Guinea": "gq",
+    "Eritrea": "er",
+    "Estonia": "ee",
+    "Ethiopia": "et",
+    "Fiji": "fj",
+    "Finland": "fi",
+    "France": "fr",
+    "Gabon": "ga",
+    "Gambia": "gm",
+    "Georgia": "ge",
+    "Germany": "de",
+    "Ghana": "gh",
+    "Greece": "gr",
+    "Greenland": "gl",
+    "Grenada": "gd",
+    "Guatemala": "gt",
+    "Guinea": "gn",
+    "Guinea-Bissau": "gw",
+    "Guyana": "gy",
+    "Haiti": "ht",
+    "Honduras": "hn",
+    "Hong Kong": "hk",
+    "Hungary": "hu",
+    "Iceland": "is",
+    "India": "in",
+    "Indonesia": "id",
+    "Iran": "ir",
+    "Iraq": "iq",
+    "Ireland": "ie",
+    "Israel": "il",
+    "Italy": "it",
+    "Jamaica": "jm",
+    "Japan": "jp",
+    "Jordan": "jo",
+    "Kazakhstan": "kz",
+    "Kenya": "ke",
+    "Kiribati": "ki",
+    "Korea, Democratic People's Republic": "kp",
+    "Korea, Republic": "kr",
+    "Kuwait": "kw",
+    "Kyrgyzstan": "kg",
+    "Laos": "la",
+    "Latvia": "lv",
+    "Lebanon": "lb",
+    "Lesotho": "ls",
+    "Liberia": "lr",
+    "Libya": "ly",
+    "Liechtenstein": "li",
+    "Lithuania": "lt",
+    "Luxembourg": "lu",
+    "Macedonia": "mk",
+    "Madagascar": "mg",
+    "Malawi": "mw",
+    "Malaysia": "my",
+    "Maldives": "mv",
+    "Mali": "ml",
+    "Malta": "mt",
+    "Marshall Islands": "mh",
+    "Mauritania": "mr",
+    "Mauritius": "mu",
+    "Mexico": "mx",
+    "Micronesia": "fm",
+    "Moldova": "md",
+    "Monaco": "mc",
+    "Mongolia": "mn",
+    "Morocco": "ma",
+    "Mozambique": "mz",
+    "Myanmar": "mm",
+    "Namibia": "na",
+    "Nauru": "nr",
+    "Nepal": "np",
+    "Netherlands": "nl",
+    "New Zealand": "nz",
+    "Nicaragua": "ni",
+    "Niger": "ne",
+    "Nigeria": "ng",
+    "Norway": "no",
+    "Oman": "om",
+    "Pakistan": "pk",
+    "Palau": "pw",
+    "Panama": "pa",
+    "Papua New Guinea": "pg",
+    "Paraguay": "py",
+    "Peru": "pe",
+    "Philippines": "ph",
+    "Poland": "pl",
+    "Portugal": "pt",
+    "Puerto Rico": "pr",
+    "Qatar": "qa",
+    "Romania": "ro",
+    "Russian Federation": "ru",
+    "Rwanda": "rw",
+    "Saint Kitts and Nevis": "kn",
+    "Saint Lucia": "lc",
+    "Saint Vincent and the Grenadines": "vc",
+    "Samoa": "ws",
+    "San Marino": "sm",
+    "Sao Tome and Príncipe": "st",
+    "Saudi Arabia": "sa",
+    "Senegal": "sn",
+    "Serbia and Montenegro": "cs",
+    "Seychelles": "sc",
+    "Sierra Leone": "sl",
+    "Singapore": "sg",
+    "Slovakia (Slovak Republic)": "sk",
+    "Slovenia": "si",
+    "Solomon Islands": "sb",
+    "Somalia": "so",
+    "South Africa": "za",
+    "Spain": "es",
+    "Sri Lanka": "lk",
+    "Sudan": "sd",
+    "Suriname": "sr",
+    "Swaziland": "sz",
+    "Sweden": "se",
+    "Switzerland": "ch",
+    "Syrian Arab Republic": "sy",
+    "Taiwan": "tw",
+    "Tajikistan": "tj",
+    "Tanzania": "tz",
+    "Tanzania": "tz",
+    "Thailand": "th",
+    "Togo": "tg",
+    "Tonga": "to",
+    "Trinidad and Tobago": "tt",
+    "Tunisia": "tn",
+    "Turkey": "tr",
+    "Turkmenistan": "tm",
+    "Tuvalu": "tv",
+    "Uganda": "ug",
+    "Ukraine": "ua",
+    "United Arab Emirates": "ae",
+    "United Kingdom": "gb",
+    "United States": "us",
+    "Uruguay": "uy",
+    # Additions from running over large files
+    "Bosnia and Herzegovina": "ba",
+    # "International"
+    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn
+    "Russia (Federation)": "ru",
+    "Scotland": "gb",
+    "England": "gb",
+    "Korea (South)": "kr",
+    "Georgia (Republic)": "ge",
+    "Egypt": "eg",
+}
+
+CONTAINER_TYPE_MAP: Dict[str, str] = {
+    "article-journal": "journal",
+    "paper-conference": "conference",
+    "book": "book-series",
+}
+
+# These are based, informally, on sorting the most popular licenses found in
+# Crossref metadata. There were over 500 unique strings and only a few most
+# popular are here; many were variants of the CC URLs. Would be useful to
+# normalize CC licenses better.
+# The current norm is to only add license slugs that are at least partially OA.
+LICENSE_SLUG_MAP: Dict[str, str] = {
+    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
+    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
+    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
+    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
+    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
+    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
+    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
+    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
+    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
+    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
+    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
+    "//spdx.org/licenses/MIT.json": "MIT",
+    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
+    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+    "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
+    "//www.karger.com/Services/SiteLicenses": "KARGER",
+    "//www.karger.com/Services/SiteLicenses/": "KARGER",
+    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
+    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
+    "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
+    "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
+    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
+    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
+    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
+    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
+    "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
+    "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
+    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
+    "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
+    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
+    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
+    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
+    "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
+    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
+    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+    # //www.springer.com/tdm doesn't seem like a license
+    # //iopscience.iop.org/page/copyright is closed
+    # //www.acm.org/publications/policies/copyright_policy#Background is closed
+    # //rsc.li/journals-terms-of-use is closed for vor (am open)
+    # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
+    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
+    # skip these TDM licenses; they don't apply to content
+    # "//www.springer.com/tdm/": "SPRINGER-TDM",
+    # "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
+    # "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
+}
+
+# Map the various DataCite type vocabularies to CSL-ish types. None means TODO or remove.
+DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
+    "ris": {
+        "THES": "thesis",
+        "SOUND": "song",  # 99.9% maps to citeproc song, so use that (exception: report)
+        "CHAP": "chapter",
+        "FIGURE": "figure",
+        "RPRT": "report",
+        "JOUR": "article-journal",
+        "MPCT": "motion_picture",
+        "GEN": "article-journal",  # GEN consist of 99% article and report, post-weblog, misc - and one dataset
+        "BOOK": "book",
+        "DATA": "dataset",
+        "COMP": "software",
+    },
+    "schemaOrg": {
+        "Dataset": "dataset",
+        "Book": "book",
+        "ScholarlyArticle": "article-journal",
+        "ImageObject": "graphic",
+        "Collection": None,
+        "MediaObject": None,
+        "Event": None,
+        "SoftwareSourceCode": "software",
+        "Chapter": "chapter",
+        "CreativeWork": None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
+        "PublicationIssue": "article",
+        "AudioObject": None,
+        "Thesis": "thesis",
+    },
+    "citeproc": {
+        "article": "article",
+        "article-journal": "article-journal",
+        "article-magazine": "article-magazine",
+        "article-newspaper": "article-newspaper",
+        "bill": "bill",
+        "book": "book",
+        "broadcast": "broadcast",
+        "chapter": "chapter",
+        "dataset": "dataset",
+        "entry-dictionary": "entry-dictionary",
+        "entry-encyclopedia": "entry-encyclopedia",
+        "entry": "entry",
+        "figure": "figure",
+        "graphic": "graphic",
+        "interview": "interview",
+        "legal_case": "legal_case",
+        "legislation": "legislation",
+        "manuscript": "manuscript",
+        "map": "map",
+        "motion_picture": "motion_picture",
+        "musical_score": "musical_score",
+        "pamphlet": "pamphlet",
+        "paper-conference": "paper-conference",
+        "patent": "patent",
+        "personal_communication": "personal_communication",
+        "post": "post",
+        "post-weblog": "post-weblog",
+        "report": "report",
+        "review-book": "review-book",
+        "review": "review",
+        "song": "song",
+        "speech": "speech",
+        "thesis": "thesis",
+        "treaty": "treaty",
+        "webpage": "webpage",
+    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
+    "bibtex": {
+        "phdthesis": "thesis",
+        "inbook": "chapter",
+        "misc": None,
+        "article": "article-journal",
+        "book": "book",
+    },
+    "resourceTypeGeneral": {
+        "Image": "graphic",
+        "Dataset": "dataset",
+        "PhysicalObject": None,
+        "Collection": None,
+        "Text": None,  # "Greyliterature, labnotes, accompanyingmaterials"
+        "Sound": None,
+        "InteractiveResource": None,
+        "Event": None,
+        "Software": "software",
+        "Other": None,
+        "Workflow": None,
+        "Audiovisual": None,
+    },  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
+}
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 4d4d696b..654be2e9 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -26,9 +26,8 @@ from .common import (
     KafkaJsonPusher,
     LinePusher,
     SqlitePusher,
-    make_kafka_consumer,
 )
-from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug
+from .crossref import CrossrefImporter
 from .datacite import DataciteImporter
 from .dblp_container import DblpContainerImporter
 from .dblp_release import DblpReleaseImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 56c3d32e..7c587395 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,71 +27,14 @@ from fatcat_openapi_client import (
 from fatcat_openapi_client.rest import ApiException
 from fuzzycat.matching import match_release_fuzzy
 
+from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP
 from fatcat_tools.normal import clean_doi
 from fatcat_tools.transforms import entity_to_dict
 
 DATE_FMT: str = "%Y-%m-%d"
 SANE_MAX_RELEASES: int = 200
 SANE_MAX_URLS: int = 100
-
-DOMAIN_REL_MAP: Dict[str, str] = {
-    "archive.org": "archive",
-    # LOCKSS, Portico, DuraSpace, etc would also be "archive"
-    "arxiv.org": "repository",
-    "babel.hathitrust.org": "repository",
-    "cds.cern.ch": "repository",
-    "deepblue.lib.umich.edu": "repository",
-    "europepmc.org": "repository",
-    "hal.inria.fr": "repository",
-    "scielo.isciii.es": "repository",
-    "www.dtic.mil": "repository",
-    "www.jstage.jst.go.jp": "repository",
-    "www.jstor.org": "repository",
-    "www.ncbi.nlm.nih.gov": "repository",
-    "ftp.ncbi.nlm.nih.gov": "repository",
-    "www.scielo.br": "repository",
-    "www.scielo.cl": "repository",
-    "www.scielo.org.mx": "repository",
-    "zenodo.org": "repository",
-    "www.biorxiv.org": "repository",
-    "www.medrxiv.org": "repository",
-    "citeseerx.ist.psu.edu": "aggregator",
-    "publisher-connector.core.ac.uk": "aggregator",
-    "core.ac.uk": "aggregator",
-    "static.aminer.org": "aggregator",
-    "aminer.org": "aggregator",
-    "pdfs.semanticscholar.org": "aggregator",
-    "semanticscholar.org": "aggregator",
-    "www.semanticscholar.org": "aggregator",
-    "academic.oup.com": "publisher",
-    "cdn.elifesciences.org": "publisher",
-    "cell.com": "publisher",
-    "dl.acm.org": "publisher",
-    "downloads.hindawi.com": "publisher",
-    "elifesciences.org": "publisher",
-    "iopscience.iop.org": "publisher",
-    "journals.plos.org": "publisher",
-    "link.springer.com": "publisher",
-    "onlinelibrary.wiley.com": "publisher",
-    "works.bepress.com": "publisher",
-    "www.biomedcentral.com": "publisher",
-    "www.cell.com": "publisher",
-    "www.nature.com": "publisher",
-    "www.pnas.org": "publisher",
-    "www.tandfonline.com": "publisher",
-    "www.frontiersin.org": "publisher",
-    "www.degruyter.com": "publisher",
-    "www.mdpi.com": "publisher",
-    "www.ahajournals.org": "publisher",
-    "ehp.niehs.nih.gov": "publisher",
-    "journals.tsu.ru": "publisher",
-    "www.cogentoa.com": "publisher",
-    "www.researchgate.net": "academicsocial",
-    "academia.edu": "academicsocial",
-    "wayback.archive-it.org": "webarchive",
-    "web.archive.org": "webarchive",
-    "archive.is": "webarchive",
-}
+MAX_ABSTRACT_LENGTH: int = 2048
 
 
 def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]:
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 8f5a4265..52bd7465 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,7 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
 
 from .common import EntityImporter
 
@@ -33,97 +34,6 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
     "standard": "standard",
 }
 
-CONTAINER_TYPE_MAP: Dict[str, str] = {
-    "article-journal": "journal",
-    "paper-conference": "conference",
-    "book": "book-series",
-}
-
-# These are based, informally, on sorting the most popular licenses found in
-# Crossref metadata. There were over 500 unique strings and only a few most
-# popular are here; many were variants of the CC URLs. Would be useful to
-# normalize CC licenses better.
-# The current norm is to only add license slugs that are at least partially OA.
-LICENSE_SLUG_MAP: Dict[str, str] = {
-    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
-    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
-    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
-    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
-    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
-    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
-    "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
-    "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
-    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
-    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
-    "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
-    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
-    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
-    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
-    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
-    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
-    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/MIT.json": "MIT",
-    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.karger.com/Services/SiteLicenses": "KARGER",
-    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
-    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
-    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
-    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
-    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
-    "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
-    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
-    # //www.springer.com/tdm doesn't seem like a license
-    # //iopscience.iop.org/page/copyright is closed
-    # //www.acm.org/publications/policies/copyright_policy#Background is closed
-    # //rsc.li/journals-terms-of-use is closed for vor (am open)
-    # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
-    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-}
-
-
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
-    if not raw:
-        return None
-    raw = raw.strip().replace("http://", "//").replace("https://", "//")
-    if "creativecommons.org" in raw.lower():
-        raw = raw.lower()
-        raw = raw.replace("/legalcode", "/").replace("/uk", "")
-        if not raw.endswith("/"):
-            raw = raw + "/"
-    return LICENSE_SLUG_MAP.get(raw)
-
-
-def test_lookup_license_slug() -> None:
-
-    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
-    assert (
-        lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
-        == "CC-BY"
-    )
-    assert (
-        lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
-        == "CC-0"
-    )
-    assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
-    assert (
-        lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
-        == "CC-BY-NC-SA"
-    )
-    assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
-    assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
-    assert lookup_license_slug("") is None
-    assert lookup_license_slug(None) is None
-
 
 class CrossrefImporter(EntityImporter):
     """
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 441514b8..b310f8bc 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -21,113 +21,19 @@ import langdetect
 import pycountry
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
 from fatcat_tools.transforms import entity_to_dict
 
-from .common import EntityImporter
-
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
-CONTAINER_TYPE_MAP: Dict[str, str] = {
+DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = {
     "Journal": "journal",
     "Series": "journal",
     "Book Series": "book-series",
 }
 
-# The docs/guide should be the canonical home for these mappings; update there
-# first.  Map various datacite type types to CSL-ish types. None means TODO or
-# remove.
-DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
-    "ris": {
-        "THES": "thesis",
-        "SOUND": "song",  # 99.9% maps to citeproc song, so use that (exception: report)
-        "CHAP": "chapter",
-        "FIGURE": "figure",
-        "RPRT": "report",
-        "JOUR": "article-journal",
-        "MPCT": "motion_picture",
-        "GEN": "article-journal",  # GEN consist of 99% article and report, post-weblog, misc - and one dataset
-        "BOOK": "book",
-        "DATA": "dataset",
-        "COMP": "software",
-    },
-    "schemaOrg": {
-        "Dataset": "dataset",
-        "Book": "book",
-        "ScholarlyArticle": "article-journal",
-        "ImageObject": "graphic",
-        "Collection": None,
-        "MediaObject": None,
-        "Event": None,
-        "SoftwareSourceCode": "software",
-        "Chapter": "chapter",
-        "CreativeWork": None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
-        "PublicationIssue": "article",
-        "AudioObject": None,
-        "Thesis": "thesis",
-    },
-    "citeproc": {
-        "article": "article",
-        "article-journal": "article-journal",
-        "article-magazine": "article-magazine",
-        "article-newspaper": "article-newspaper",
-        "bill": "bill",
-        "book": "book",
-        "broadcast": "broadcast",
-        "chapter": "chapter",
-        "dataset": "dataset",
-        "entry-dictionary": "entry-dictionary",
-        "entry-encyclopedia": "entry-encyclopedia",
-        "entry": "entry",
-        "figure": "figure",
-        "graphic": "graphic",
-        "interview": "interview",
-        "legal_case": "legal_case",
-        "legislation": "legislation",
-        "manuscript": "manuscript",
-        "map": "map",
-        "motion_picture": "motion_picture",
-        "musical_score": "musical_score",
-        "pamphlet": "pamphlet",
-        "paper-conference": "paper-conference",
-        "patent": "patent",
-        "personal_communication": "personal_communication",
-        "post": "post",
-        "post-weblog": "post-weblog",
-        "report": "report",
-        "review-book": "review-book",
-        "review": "review",
-        "song": "song",
-        "speech": "speech",
-        "thesis": "thesis",
-        "treaty": "treaty",
-        "webpage": "webpage",
-    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
-    "bibtex": {
-        "phdthesis": "thesis",
-        "inbook": "chapter",
-        "misc": None,
-        "article": "article-journal",
-        "book": "book",
-    },
-    "resourceTypeGeneral": {
-        "Image": "graphic",
-        "Dataset": "dataset",
-        "PhysicalObject": None,
-        "Collection": None,
-        "Text": None,  # "Greyliterature, labnotes, accompanyingmaterials"
-        "Sound": None,
-        "InteractiveResource": None,
-        "Event": None,
-        "Software": "software",
-        "Other": None,
-        "Workflow": None,
-        "Audiovisual": None,
-    },  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
-}
-
 # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
 DATACITE_UNKNOWN_MARKERS: List[str] = [
     "(:unac)",  # temporarily inaccessible
@@ -180,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [
     }
 ]
 
-# TODO(martin): merge this with other maps and lookup functions, eventually.
-LICENSE_SLUG_MAP: Dict[str, str] = {
-    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
-    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
-    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-    "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
-    "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
-    "//onlinelibrary.wiley.com/termsandconditions/": "WILEY",
-    "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
-    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
-    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
-    "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
-    "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
-    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
-    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
-    "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
-    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
-    "//www.karger.com/Services/SiteLicenses/": "KARGER",
-    "//www.springer.com/tdm/": "SPRINGER-TDM",
-    "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
-    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
-    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
-    "//spdx.org/licenses/CC0-1.0.json": "CC-0",
-    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
-    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
-    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
-    "//spdx.org/licenses/MIT.json": "MIT",
-    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-}
-
 
 class DataciteImporter(EntityImporter):
     """
@@ -406,8 +275,8 @@ class DataciteImporter(EntityImporter):
         container_name = None
 
         container = attributes.get("container", {}) or {}
-        if container.get("type") in CONTAINER_TYPE_MAP.keys():
-            container_type = CONTAINER_TYPE_MAP.get(container["type"])
+        if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys():
+            container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"])
             if container.get("identifier") and container.get("identifierType") == "ISSN":
                 issn = container.get("identifier")
                 if issn and len(issn) == 8:
@@ -488,7 +357,7 @@ class DataciteImporter(EntityImporter):
         license_extra = []
 
         for lic in attributes.get("rightsList", []):
-            slug = lookup_license_slug(lic.get("rightsUri"))
+            slug = datacite_lookup_license_slug(lic.get("rightsUri"))
             if slug:
                 license_slug = slug
             license_extra.append(lic)
@@ -968,7 +837,7 @@ def contributor_list_contains_contributor(
     return False
 
 
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]:
     """
     Resolve a variety of strings into a some pseudo-canonical form, e.g.
     CC-BY-ND, CC-0, MIT and so on.
@@ -1063,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
             return None
         return "RS-{}".format(name.upper())
 
-    # Fallback to mapped values.
-    raw = raw.lower()
-    raw = raw.strip().replace("http://", "//").replace("https://", "//")
-    if not raw.endswith("/"):
-        raw = raw + "/"
-    return LICENSE_SLUG_MAP.get(raw)
+    # Fallback to generic license lookup (NOTE: non-CC URLs no longer lower-cased/slash-terminated)
+    return lookup_license_slug(raw)
 
 
 def find_original_language_title(
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index f5c886a2..92dbe574 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Sequence
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.importers.common import MAX_ABSTRACT_LENGTH, EntityImporter
 from fatcat_tools.normal import (
     clean_doi,
     clean_orcid,
@@ -24,9 +24,6 @@ from fatcat_tools.normal import (
     parse_month,
 )
 
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
-
 
 class DoajArticleImporter(EntityImporter):
     def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 9db499a0..3c85132c 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -9,9 +9,7 @@ from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
 
 from fatcat_tools.normal import clean_doi, clean_str
 
-from .common import EntityImporter, make_rel_url
-
-MAX_ABSTRACT_BYTES = 4096
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url
 
 
 class GrobidMetadataImporter(EntityImporter):
@@ -84,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter):
         extra_grobid: Dict[str, Any] = dict()
 
         abstract = obj.get("abstract")
-        if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
+        if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10:
             abobj = fatcat_openapi_client.ReleaseAbstract(
                 mimetype="text/plain", content=clean_str(obj.get("abstract"))
             )
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index c2f650b0..79691c9a 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,7 +8,8 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.normal import LANG_MAP_MARC, clean_doi, clean_str
+from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC
+from fatcat_tools.normal import clean_doi, clean_str
 
 from .common import EntityImporter
 from .crossref import CONTAINER_TYPE_MAP
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3274234f..5bc7a9ff 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,325 +8,16 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.normal import (
+from fatcat_tools.biblio_lookup_tables import (
+    COUNTRY_NAME_MAP,
     LANG_MAP_MARC,
-    clean_doi,
-    clean_issn,
-    clean_pmcid,
-    clean_pmid,
-    clean_str,
+    MONTH_ABBR_MAP,
+    PUBMED_RELEASE_TYPE_MAP,
 )
+from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str
 
 from .common import EntityImporter
 
-# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
-PUBMED_RELEASE_TYPE_MAP = {
-    # Adaptive Clinical Trial
-    "Address": "speech",
-    "Autobiography": "book",
-    # Bibliography
-    "Biography": "book",
-    # Case Reports
-    "Classical Article": "article-journal",
-    # Clinical Conference
-    # Clinical Study
-    # Clinical Trial
-    # Clinical Trial, Phase I
-    # Clinical Trial, Phase II
-    # Clinical Trial, Phase III
-    # Clinical Trial, Phase IV
-    # Clinical Trial Protocol
-    # Clinical Trial, Veterinary
-    # Collected Works
-    # Comparative Study
-    # Congress
-    # Consensus Development Conference
-    # Consensus Development Conference, NIH
-    # Controlled Clinical Trial
-    "Dataset": "dataset",
-    # Dictionary
-    # Directory
-    # Duplicate Publication
-    "Editorial": "editorial",
-    # English Abstract   # doesn't indicate that this is abstract-only
-    # Equivalence Trial
-    # Evaluation Studies
-    # Expression of Concern
-    # Festschrift
-    # Government Document
-    # Guideline
-    "Historical Article": "article-journal",
-    # Interactive Tutorial
-    "Interview": "interview",
-    "Introductory Journal Article": "article-journal",
-    "Journal Article": "article-journal",
-    "Lecture": "speech",
-    "Legal Case": "legal_case",
-    "Legislation": "legislation",
-    "Letter": "letter",
-    # Meta-Analysis
-    # Multicenter Study
-    # News
-    "Newspaper Article": "article-newspaper",
-    # Observational Study
-    # Observational Study, Veterinary
-    # Overall
-    # Patient Education Handout
-    # Periodical Index
-    # Personal Narrative
-    # Portrait
-    # Practice Guideline
-    # Pragmatic Clinical Trial
-    # Publication Components
-    # Publication Formats
-    # Publication Type Category
-    # Randomized Controlled Trial
-    # Research Support, American Recovery and Reinvestment Act
-    # Research Support, N.I.H., Extramural
-    # Research Support, N.I.H., Intramural
-    # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
-    # Research Support, U.S. Gov't, P.H.S.
-    # Review     # in the "literature review" sense, not "product review"
-    # Scientific Integrity Review
-    # Study Characteristics
-    # Support of Research
-    # Systematic Review
-    "Technical Report": "report",
-    # Twin Study
-    # Validation Studies
-    # Video-Audio Media
-    # Webcasts
-}
-
-MONTH_ABBR_MAP = {
-    "Jan": 1,
-    "01": 1,
-    "Feb": 2,
-    "02": 2,
-    "Mar": 3,
-    "03": 3,
-    "Apr": 4,
-    "04": 4,
-    "May": 5,
-    "05": 5,
-    "Jun": 6,
-    "06": 6,
-    "Jul": 7,
-    "07": 7,
-    "Aug": 8,
-    "08": 8,
-    "Sep": 9,
-    "09": 9,
-    "Oct": 10,
-    "10": 10,
-    "Nov": 11,
-    "11": 11,
-    "Dec": 12,
-    "12": 12,
-}
-
-# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
-COUNTRY_NAME_MAP = {
-    "Afghanistan": "af",
-    "Albania": "al",
-    "Algeria": "dz",
-    "Andorra": "ad",
-    "Angola": "ao",
-    "Antigua and Barbuda": "ag",
-    "Argentina": "ar",
-    "Armenia": "am",
-    "Australia": "au",
-    "Austria": "at",
-    "Azerbaijan": "az",
-    "Bahamas": "bs",
-    "Bahrain": "bh",
-    "Bangladesh": "bd",
-    "Barbados": "bb",
-    "Belarus": "by",
-    "Belgium": "be",
-    "Belize": "bz",
-    "Benin": "bj",
-    "Bhutan": "bt",
-    "Bolivia": "bo",
-    "Bosnia and Herzegowina": "ba",
-    "Botswana": "bw",
-    "Brazil": "br",
-    "Brunei Darussalam": "bn",
-    "Bulgaria": "bg",
-    "Burkina Faso": "bf",
-    "Burundi": "bi",
-    "Cambodia": "kh",
-    "Cameroon": "cm",
-    "Canada": "ca",
-    "Cape Verde": "cv",
-    "Central African Republic": "cf",
-    "Chad": "td",
-    "Chile": "cl",
-    "China": "cn",
-    "Colombia": "co",
-    "Comoros": "km",
-    "Congo, Democratic Republic": "cd",
-    "Congo, People’s Republic": "cg",
-    "Costa Rica": "cr",
-    "Cote d'Ivoire": "ci",
-    "Croatia (Local Name: Hrvatska)": "hr",
-    "Cuba": "cu",
-    "Cyprus": "cy",
-    "Czech Republic": "cz",
-    "Denmark": "dk",
-    "Djibouti": "dj",
-    "Dominica": "dm",
-    "Dominican Republic": "do",
-    "East Timor": "tl",
-    "Ecuador": "ec",
-    "El Salvador": "sv",
-    "Equatorial Guinea": "gq",
-    "Eritrea": "er",
-    "Estonia": "ee",
-    "Ethiopia": "et",
-    "Fiji": "fj",
-    "Finland": "fi",
-    "France": "fr",
-    "Gabon": "ga",
-    "Gambia": "gm",
-    "Georgia": "ge",
-    "Germany": "de",
-    "Ghana": "gh",
-    "Greece": "gr",
-    "Greenland": "gl",
-    "Grenada": "gd",
-    "Guatemala": "gt",
-    "Guinea": "gn",
-    "Guinea-Bissau": "gw",
-    "Guyana": "gy",
-    "Haiti": "ht",
-    "Honduras": "hn",
-    "Hong Kong": "hk",
-    "Hungary": "hu",
-    "Iceland": "is",
-    "India": "in",
-    "Indonesia": "id",
-    "Iran": "ir",
-    "Iraq": "iq",
-    "Ireland": "ie",
-    "Israel": "il",
-    "Italy": "it",
-    "Jamaica": "jm",
-    "Japan": "jp",
-    "Jordan": "jo",
-    "Kazakhstan": "kz",
-    "Kenya": "ke",
-    "Kiribati": "ki",
-    "Korea, Democratic People's Republic": "kp",
-    "Korea, Republic": "kr",
-    "Kuwait": "kw",
-    "Kyrgyzstan": "kg",
-    "Laos": "la",
-    "Latvia": "lv",
-    "Lebanon": "lb",
-    "Lesotho": "ls",
-    "Liberia": "lr",
-    "Libya": "ly",
-    "Liechtenstein": "li",
-    "Lithuania": "lt",
-    "Luxembourg": "lu",
-    "Macedonia": "mk",
-    "Madagascar": "mg",
-    "Malawi": "mw",
-    "Malaysia": "my",
-    "Maldives": "mv",
-    "Mali": "ml",
-    "Malta": "mt",
-    "Marshall Islands": "mh",
-    "Mauritania": "mr",
-    "Mauritius": "mu",
-    "Mexico": "mx",
-    "Micronesia": "fm",
-    "Moldova": "md",
-    "Monaco": "mc",
-    "Mongolia": "mn",
-    "Morocco": "ma",
-    "Mozambique": "mz",
-    "Myanmar": "mm",
-    "Namibia": "na",
-    "Nauru": "nr",
-    "Nepal": "np",
-    "Netherlands": "nl",
-    "New Zealand": "nz",
-    "Nicaragua": "ni",
-    "Niger": "ne",
-    "Nigeria": "ng",
-    "Norway": "no",
-    "Oman": "om",
-    "Pakistan": "pk",
-    "Palau": "pw",
-    "Panama": "pa",
-    "Papua New Guinea": "pg",
-    "Paraguay": "py",
-    "Peru": "pe",
-    "Philippines": "ph",
-    "Poland": "pl",
-    "Portugal": "pt",
-    "Puerto Rico": "pr",
-    "Qatar": "qa",
-    "Romania": "ro",
-    "Russian Federation": "ru",
-    "Rwanda": "rw",
-    "Saint Kitts and Nevis": "kn",
-    "Saint Lucia": "lc",
-    "Saint Vincent and the Grenadines": "vc",
-    "Samoa": "ws",
-    "San Marino": "sm",
-    "Sao Tome and Príncipe": "st",
-    "Saudi Arabia": "sa",
-    "Senegal": "sn",
-    "Serbia and Montenegro": "cs",
-    "Seychelles": "sc",
-    "Sierra Leone": "sl",
-    "Singapore": "sg",
-    "Slovakia (Slovak Republic)": "sk",
-    "Slovenia": "si",
-    "Solomon Islands": "sb",
-    "Somalia": "so",
-    "South Africa": "za",
-    "Spain": "es",
-    "Sri Lanka": "lk",
-    "Sudan": "sd",
-    "Suriname": "sr",
-    "Swaziland": "sz",
-    "Sweden": "se",
-    "Switzerland": "ch",
-    "Syrian Arab Republic": "sy",
-    "Taiwan": "tw",
-    "Tajikistan": "tj",
-    "Tanzania": "tz",
-    "Tanzania": "tz",
-    "Thailand": "th",
-    "Togo": "tg",
-    "Tonga": "to",
-    "Trinidad and Tobago": "tt",
-    "Tunisia": "tn",
-    "Turkey": "tr",
-    "Turkmenistan": "tm",
-    "Tuvalu": "tv",
-    "Uganda": "ug",
-    "Ukraine": "ua",
-    "United Arab Emirates": "ae",
-    "United Kingdom": "gb",
-    "United States": "us",
-    "Uruguay": "uy",
-    # Additions from running over large files
-    "Bosnia and Herzegovina": "ba",
-    # "International"
-    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn
-    "Russia (Federation)": "ru",
-    "Scotland": "gb",
-    "England": "gb",
-    "Korea (South)": "kr",
-    "Georgia (Republic)": "ge",
-    "Egypt": "eg",
-}
-
 
 class PubmedImporter(EntityImporter):
     """
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 0d2c84ce..fc80411c 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -12,6 +12,8 @@ import ftfy
 import langdetect
 import pycountry
 
+from .biblio_lookup_tables import LICENSE_SLUG_MAP
+
 DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
 
 
@@ -606,84 +608,35 @@ def test_parse_country_name() -> None:
     assert parse_country_name("Japan") == "jp"
 
 
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
-    "afr": "af",
-    "alb": "sq",
-    "amh": "am",
-    "ara": "ar",
-    "arm": "hy",
-    "aze": "az",
-    "ben": "bn",
-    "bos": "bs",
-    "bul": "bg",
-    "cat": "ca",
-    "chi": "zh",
-    "cze": "cs",
-    "dan": "da",
-    "dut": "nl",
-    "eng": "en",
-    "epo": "eo",
-    "est": "et",
-    "fin": "fi",
-    "fre": "fr",
-    "geo": "ka",
-    "ger": "de",
-    "gla": "gd",
-    "gre": "el",
-    "heb": "he",
-    "hin": "hi",
-    "hrv": "hr",
-    "hun": "hu",
-    "ice": "is",
-    "ind": "id",
-    "ita": "it",
-    "jpn": "ja",
-    "kin": "rw",
-    "kor": "ko",
-    "lat": "la",
-    "lav": "lv",
-    "lit": "lt",
-    "mac": "mk",
-    "mal": "ml",
-    "mao": "mi",
-    "may": "ms",
-    "nor": "no",
-    "per": "fa",
-    "per": "fa",
-    "pol": "pl",
-    "por": "pt",
-    "pus": "ps",
-    "rum": "ro",
-    "rus": "ru",
-    "san": "sa",
-    "slo": "sk",
-    "slv": "sl",
-    "spa": "es",
-    "srp": "sr",
-    "swe": "sv",
-    "tha": "th",
-    "tur": "tr",
-    "ukr": "uk",
-    "urd": "ur",
-    "vie": "vi",
-    "wel": "cy",
-    # additions
-    "gle": "ga",  # "Irish" (Gaelic)
-    "jav": "jv",  # Javanese
-    "welsh": "cy",  # Welsh
-    "oci": "oc",  # Occitan
-    # Don't have ISO 639-1 codes
-    "grc": "el",  # Ancient Greek; map to modern greek
-    "map": None,  # Austronesian (collection)
-    "syr": None,  # Syriac, Modern
-    "gem": None,  # Old Saxon
-    "non": None,  # Old Norse
-    "emg": None,  # Eastern Meohang
-    "neg": None,  # Negidal
-    "mul": None,  # Multiple languages
-    "und": None,  # Undetermined
-}
+def lookup_license_slug(raw: Optional[str]) -> Optional[str]:  # license URL -> slug, or None
+    if not raw:
+        return None
+    raw = raw.strip().replace("http://", "//").replace("https://", "//")  # http/https share a key
+    if "creativecommons.org" in raw.lower():
+        raw = raw.lower()
+        raw = raw.replace("/legalcode", "/").replace("/uk", "")  # drop legalcode and /uk segments
+        if not raw.endswith("/"):
+            raw = raw + "/"
+    return LICENSE_SLUG_MAP.get(raw)  # non-CC URLs are looked up as-is (case-sensitive)
+
+
+def test_lookup_license_slug() -> None:
+    """CC URL variants normalize to one slug; unknown, empty, and None inputs give None."""
+    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
+    assert (
+        lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
+        == "CC-BY"
+    )
+    assert (
+        lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
+        == "CC-0"
+    )
+    assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
+    assert (
+        lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
+        == "CC-BY-NC-SA"
+    )
+    assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
+    assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
+    assert lookup_license_slug("") is None
+    assert lookup_license_slug(None) is None
-- 
cgit v1.2.3