importers: refactor imports of clean() and other normalization helpers

author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:23:12 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:28:28 -0800
commit: 16e9979a6f347b49764c1141209e84083ea81057 (patch)
tree: ccc3d35607cadac4933e9b28366bedf5a605c122 /python/fatcat_tools/importers
parent: ab4e1355bf93e3755985f1b5cd2589a78601d253 (diff)
download: fatcat-16e9979a6f347b49764c1141209e84083ea81057.tar.gz fatcat-16e9979a6f347b49764c1141209e84083ea81057.zip
12 files changed, 104 insertions, 95 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 223ae526..4d4d696b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -15,7 +15,6 @@ from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
 from .arxiv import ArxivRawImporter
 from .chocula import ChoculaImporter
 from .common import (
-    LANG_MAP_MARC,
     Bs4XmlFileListPusher,
     Bs4XmlFilePusher,
     Bs4XmlLargeFilePusher,
@@ -27,8 +26,6 @@ from .common import (
     KafkaJsonPusher,
     LinePusher,
     SqlitePusher,
-    clean,
-    is_cjk,
     make_kafka_consumer,
 )
 from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index b4a4d9ed..92289bb3 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, FileEntity
 
-from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
+from fatcat_tools.normal import b32_hex
+
+from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, make_rel_url
 
 ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
 
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 842c7853..c44fec3b 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ContainerEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
 
 
 class ChoculaImporter(EntityImporter):
@@ -40,7 +42,7 @@ class ChoculaImporter(EntityImporter):
         returns a ContainerEntity (or None if invalid or couldn't parse)
         """
 
-        name = clean(row.get("name"))
+        name = clean_str(row.get("name"))
         if not name:
             # Name is required (by schema)
             return None
@@ -85,7 +87,7 @@ class ChoculaImporter(EntityImporter):
             ident=row["ident"],
             name=name,
             container_type=container_type,
-            publisher=clean(row.get("publisher")),
+            publisher=clean_str(row.get("publisher")),
             wikidata_qid=row.get("wikidata_qid"),
             extra=extra,
         )
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 425b6f13..56c3d32e 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,10 +27,7 @@ from fatcat_openapi_client import (
 from fatcat_openapi_client.rest import ApiException
 from fuzzycat.matching import match_release_fuzzy
 
-# TODO: refactor so remove need for this (re-imports for backwards compatibility)
-from fatcat_tools.normal import is_cjk  # noqa: F401
-from fatcat_tools.normal import LANG_MAP_MARC, b32_hex, clean_doi  # noqa: F401
-from fatcat_tools.normal import clean_str as clean  # noqa: F401
+from fatcat_tools.normal import clean_doi
 from fatcat_tools.transforms import entity_to_dict
 
 DATE_FMT: str = "%Y-%m-%d"
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index c9f251fc..8f5a4265 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,9 +4,9 @@ from typing import Any, Dict, List, Optional, Sequence
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
 
-from .common import EntityImporter, clean
+from .common import EntityImporter
 
 # The docs/guide should be the canonical home for these mappings; update there
 # first
@@ -232,21 +232,21 @@ class CrossrefImporter(EntityImporter):
                     if len(affiliation_list) > 1:
                         # note: affiliation => more_affiliations
                         extra["more_affiliations"] = [
-                            clean(a["name"]) for a in affiliation_list[1:]
+                            clean_str(a["name"]) for a in affiliation_list[1:]
                         ]
                 if am.get("sequence") and am.get("sequence") != "additional":
-                    extra["seq"] = clean(am.get("sequence"))
+                    extra["seq"] = clean_str(am.get("sequence"))
                 assert ctype in ("author", "editor", "translator")
-                raw_name = clean(raw_name)
+                raw_name = clean_str(raw_name)
                 # TODO: what if 'raw_name' is None?
                 contribs.append(
                     ReleaseContrib(
                         creator_id=creator_id,
                         index=index,
                         raw_name=raw_name,
-                        given_name=clean(am.get("given")),
-                        surname=clean(am.get("family")),
-                        raw_affiliation=clean(raw_affiliation),
+                        given_name=clean_str(am.get("given")),
+                        surname=clean_str(am.get("family")),
+                        raw_affiliation=clean_str(raw_affiliation),
                         role=ctype,
                         extra=extra or None,
                     )
@@ -263,11 +263,11 @@ class CrossrefImporter(EntityImporter):
         container_id = None
         if issnl:
             container_id = self.lookup_issnl(issnl)
-        publisher = clean(obj.get("publisher"))
+        publisher = clean_str(obj.get("publisher"))
 
         container_name = obj.get("container-title")
         if container_name:
-            container_name = clean(container_name[0], force_xml=True)
+            container_name = clean_str(container_name[0], force_xml=True)
         if not container_name:
             container_name = None
         if (
@@ -323,7 +323,7 @@ class CrossrefImporter(EntityImporter):
                 ref_extra["journal-title"] = rm["journal-title"]
             if rm.get("DOI"):
                 ref_extra["doi"] = rm.get("DOI").lower()
-            author = clean(rm.get("author"))
+            author = clean_str(rm.get("author"))
             if author:
                 ref_extra["authors"] = [author]
             for k in (
@@ -347,8 +347,8 @@ class CrossrefImporter(EntityImporter):
                 "series-title",
                 "volume-title",
             ):
-                if clean(rm.get(k)):
-                    ref_extra[k] = clean(rm[k])
+                if clean_str(rm.get(k)):
+                    ref_extra[k] = clean_str(rm[k])
             refs.append(
                 fatcat_openapi_client.ReleaseRef(
                     index=i,
@@ -356,9 +356,9 @@ class CrossrefImporter(EntityImporter):
                     target_release_id=None,
                     key=key,
                     year=year,
-                    container_name=clean(ref_container_name),
-                    title=clean(rm.get("article-title")),
-                    locator=clean(rm.get("first-page")),
+                    container_name=clean_str(ref_container_name),
+                    title=clean_str(rm.get("article-title")),
+                    locator=clean_str(rm.get("first-page")),
                     # TODO: just dump JSON somewhere here?
                     extra=ref_extra or None,
                 )
@@ -366,7 +366,7 @@ class CrossrefImporter(EntityImporter):
 
         # abstracts
         abstracts = []
-        abstract = clean(obj.get("abstract"))
+        abstract = clean_str(obj.get("abstract"))
         if abstract and len(abstract) > 10:
             abstracts.append(
                 fatcat_openapi_client.ReleaseAbstract(
@@ -387,9 +387,9 @@ class CrossrefImporter(EntityImporter):
                 if type(val) == list:
                     val = val[0]
                 if type(val) == str:
-                    val = clean(val)
+                    val = clean_str(val)
                     if val:
-                        extra[key] = clean(val)
+                        extra[key] = clean_str(val)
                 else:
                     extra[key] = val
         # crossref-nested extra keys
@@ -397,14 +397,14 @@ class CrossrefImporter(EntityImporter):
             val = obj.get(key)
             if val:
                 if type(val) == str:
-                    extra_crossref[key] = clean(val)
+                    extra_crossref[key] = clean_str(val)
                 else:
                     extra_crossref[key] = val
         if license_extra:
             extra_crossref["license"] = license_extra
 
         if len(obj["title"]) > 1:
-            aliases = [clean(t) for t in obj["title"][1:]]
+            aliases = [clean_str(t) for t in obj["title"][1:]]
             aliases = [t for t in aliases if t]
             if aliases:
                 extra["aliases"] = aliases
@@ -459,11 +459,11 @@ class CrossrefImporter(EntityImporter):
         if obj.get("original-title"):
             ot = obj.get("original-title")
             if ot is not None:
-                original_title = clean(ot[0], force_xml=True)
+                original_title = clean_str(ot[0], force_xml=True)
 
         title: Optional[str] = None
         if obj.get("title"):
-            title = clean(obj["title"][0], force_xml=True)
+            title = clean_str(obj["title"][0], force_xml=True)
             if not title or len(title) <= 1:
                 # title can't be just a single character
                 self.counts["skip-blank-title"] += 1
@@ -476,7 +476,7 @@ class CrossrefImporter(EntityImporter):
 
         subtitle = None
         if obj.get("subtitle"):
-            subtitle = clean(obj["subtitle"][0], force_xml=True)
+            subtitle = clean_str(obj["subtitle"][0], force_xml=True)
             if not subtitle or len(subtitle) <= 1:
                 # subtitle can't be just a single character
                 subtitle = None
@@ -499,10 +499,10 @@ class CrossrefImporter(EntityImporter):
                 doi=doi,
                 isbn13=isbn13,
             ),
-            volume=clean(obj.get("volume")),
-            issue=clean(obj.get("issue")),
-            pages=clean(obj.get("page")),
-            language=clean(obj.get("language")),
+            volume=clean_str(obj.get("volume")),
+            issue=clean_str(obj.get("issue")),
+            pages=clean_str(obj.get("page")),
+            language=clean_str(obj.get("language")),
             license_slug=license_slug,
             extra=extra or None,
             abstracts=abstracts or None,
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fe02cac4..441514b8 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -21,10 +21,10 @@ import langdetect
 import pycountry
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
 from fatcat_tools.transforms import entity_to_dict
 
-from .common import EntityImporter, clean
+from .common import EntityImporter
 
 # Cutoff length for abstracts.
 MAX_ABSTRACT_LENGTH = 2048
@@ -322,7 +322,7 @@ class DataciteImporter(EntityImporter):
             print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
             return False
 
-        title = clean(title)
+        title = clean_str(title)
         if not title:
             print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
             return False
@@ -341,7 +341,7 @@ class DataciteImporter(EntityImporter):
         if not subtitle:
             subtitle = None
         else:
-            subtitle = clean(subtitle)
+            subtitle = clean_str(subtitle)
 
         # Dates. A few internal dates (registered, created, updated) and
         # published (0..2554). We try to work with typed date list, in
@@ -399,7 +399,7 @@ class DataciteImporter(EntityImporter):
             publisher = None
 
         if publisher:
-            publisher = clean(publisher)
+            publisher = clean_str(publisher)
 
         # Container. For the moment, only ISSN as container.
         container_id = None
@@ -460,10 +460,10 @@ class DataciteImporter(EntityImporter):
         issue = container.get("issue")
 
         if volume:
-            volume = clean(volume)
+            volume = clean_str(volume)
 
         if issue:
-            issue = clean(issue)
+            issue = clean_str(issue)
 
         # Pages.
         pages = None
@@ -548,7 +548,7 @@ class DataciteImporter(EntityImporter):
                     "[{}] language detection failed with {} on {}".format(doi, err, text),
                     file=sys.stderr,
                 )
-            abstract_text = clean(text)
+            abstract_text = clean_str(text)
             if not abstract_text:
                 continue
             abstracts.append(
@@ -874,14 +874,14 @@ class DataciteImporter(EntityImporter):
                 if len(affiliations) == 0:
                     raw_affiliation = None
                 else:
-                    raw_affiliation = clean(affiliations[0])
+                    raw_affiliation = clean_str(affiliations[0])
 
                 name = c.get("name")
                 given_name = c.get("givenName")
                 surname = c.get("familyName")
 
                 if name:
-                    name = clean(name)
+                    name = clean_str(name)
                 if not any((name, given_name, surname)):
                     continue
                 if not name:
@@ -895,8 +895,8 @@ class DataciteImporter(EntityImporter):
                     name = index_form_to_display_name(name)
 
                 if given_name:
-                    given_name = clean(given_name)
-                surname = clean(surname)
+                    given_name = clean_str(given_name)
+                surname = clean_str(surname)
 
                 # Perform a final assertion that name does not reduce to zero
                 # (e.g. whitespace only name).
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 7c595787..9db499a0 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
 
-from .common import EntityImporter, clean, make_rel_url
+from .common import EntityImporter, make_rel_url
 
 MAX_ABSTRACT_BYTES = 4096
 
@@ -86,7 +86,7 @@ class GrobidMetadataImporter(EntityImporter):
         abstract = obj.get("abstract")
         if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
             abobj = fatcat_openapi_client.ReleaseAbstract(
-                mimetype="text/plain", content=clean(obj.get("abstract"))
+                mimetype="text/plain", content=clean_str(obj.get("abstract"))
             )
             abstracts = [abobj]
         else:
@@ -97,9 +97,9 @@ class GrobidMetadataImporter(EntityImporter):
             contribs.append(
                 fatcat_openapi_client.ReleaseContrib(
                     index=i,
-                    raw_name=clean(a["name"]),
-                    given_name=clean(a.get("given_name")),
-                    surname=clean(a.get("surname")),
+                    raw_name=clean_str(a["name"]),
+                    given_name=clean_str(a.get("given_name")),
+                    surname=clean_str(a.get("surname")),
                     role="author",
                     extra=None,
                 )
@@ -116,15 +116,15 @@ class GrobidMetadataImporter(EntityImporter):
                     pass
             for key in ("volume", "url", "issue", "publisher"):
                 if raw.get(key):
-                    cite_extra[key] = clean(raw[key])
+                    cite_extra[key] = clean_str(raw[key])
             if raw.get("authors"):
-                cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
+                cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]]
 
             refs.append(
                 fatcat_openapi_client.ReleaseRef(
-                    key=clean(raw.get("id")),
+                    key=clean_str(raw.get("id")),
                     year=year,
-                    title=clean(raw["title"]),
+                    title=clean_str(raw["title"]),
                     extra=cite_extra or None,
                 )
             )
@@ -140,7 +140,7 @@ class GrobidMetadataImporter(EntityImporter):
         if doi:
             extra["doi"] = doi
         if obj["journal"] and obj["journal"].get("name"):
-            extra["container_name"] = clean(obj["journal"]["name"])
+            extra["container_name"] = clean_str(obj["journal"]["name"])
 
         # TODO: ISSN/eISSN handling? or just journal name lookup?
 
@@ -149,7 +149,7 @@ class GrobidMetadataImporter(EntityImporter):
         if self.longtail_oa:
             extra["longtail_oa"] = True
 
-        clean_title = clean(obj["title"], force_xml=True)
+        clean_title = clean_str(obj["title"], force_xml=True)
         if not clean_title or len(clean_title) < 2:
             return None
         title = clean_title
@@ -161,9 +161,9 @@ class GrobidMetadataImporter(EntityImporter):
             release_year=release_year,
             contribs=contribs,
             refs=refs,
-            publisher=clean(obj["journal"].get("publisher")),
-            volume=clean(obj["journal"].get("volume")),
-            issue=clean(obj["journal"].get("issue")),
+            publisher=clean_str(obj["journal"].get("publisher")),
+            volume=clean_str(obj["journal"].get("volume")),
+            issue=clean_str(obj["journal"].get("issue")),
             abstracts=abstracts or None,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(),
             extra=extra or None,
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index a737ac9f..9916a55f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -6,9 +6,9 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str, is_cjk
 
-from .common import DATE_FMT, EntityImporter, clean, is_cjk
+from .common import DATE_FMT, EntityImporter
 
 
 # TODO: should be List[Tag] not List[Any] for full type annotations
@@ -36,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
     for raw in raw_persons:
         name = raw.find("name") or None
         if name:
-            name = clean(name.get_text().replace("\n", " "))
+            name = clean_str(name.get_text().replace("\n", " "))
         surname = raw.find("familyName") or None
         if surname:
-            surname = clean(surname.get_text().replace("\n", " "))
+            surname = clean_str(surname.get_text().replace("\n", " "))
         given_name = raw.find("givenName") or None
         if given_name:
-            given_name = clean(given_name.get_text().replace("\n", " "))
+            given_name = clean_str(given_name.get_text().replace("\n", " "))
         lang = "en"
         if is_cjk(name):
             lang = "ja"
@@ -230,16 +230,16 @@ class JalcImporter(EntityImporter):
                 for p in record.find_all("publicationName")
                 if p.get_text()
             ]
-            pubs = [clean(p) for p in pubs if p]
+            pubs = [clean_str(p) for p in pubs if p]
             assert pubs
             if len(pubs) > 1 and pubs[0] == pubs[1]:
                 pubs = [pubs[0]]
             if len(pubs) > 1 and is_cjk(pubs[0]):
                 # eng/jpn ordering is not reliable
                 pubs = [pubs[1], pubs[0]]
-            container_name = clean(pubs[0])
+            container_name = clean_str(pubs[0])
             if len(pubs) > 1:
-                container_extra["original_name"] = clean(pubs[1])
+                container_extra["original_name"] = clean_str(pubs[1])
 
         if record.publisher:
             pubs = [
@@ -254,7 +254,7 @@ class JalcImporter(EntityImporter):
                 # ordering is not reliable
                 pubs = [pubs[1], pubs[0]]
             if pubs:
-                publisher = clean(pubs[0])
+                publisher = clean_str(pubs[0])
                 if len(pubs) > 1:
                     container_extra["publisher_aliases"] = pubs[1:]
 
@@ -296,14 +296,14 @@ class JalcImporter(EntityImporter):
         # (informally)
         extra["jalc"] = extra_jalc
 
-        title = clean(title)
+        title = clean_str(title)
         if not title:
             return None
 
         re = ReleaseEntity(
             work_id=None,
             title=title,
-            original_title=clean(original_title),
+            original_title=clean_str(original_title),
             release_type=release_type,
             release_stage="published",
             release_date=release_date,
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index a45e49f3..fc1dfcbd 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ContainerEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
 
 
 def or_none(s: Optional[str]) -> Optional[str]:
@@ -105,7 +107,7 @@ class JournalMetadataImporter(EntityImporter):
         if extra_ia:
             extra["ia"] = extra_ia
 
-        name = clean(row.get("name"))
+        name = clean_str(row.get("name"))
         if not name:
             return None
 
@@ -115,7 +117,7 @@ class JournalMetadataImporter(EntityImporter):
             issnp=row.get("issnp"),
             container_type=None,  # TODO
             name=name,
-            publisher=clean(row.get("publisher")),
+            publisher=clean_str(row.get("publisher")),
             wikidata_qid=None,  # TODO
             extra=extra,
         )
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index ca1f2466..c2f650b0 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,9 +8,9 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import LANG_MAP_MARC, clean_doi, clean_str
 
-from .common import LANG_MAP_MARC, EntityImporter, clean
+from .common import EntityImporter
 from .crossref import CONTAINER_TYPE_MAP
 
 # TODO: more entries?
@@ -140,7 +140,7 @@ class JstorImporter(EntityImporter):
                 issnl=issnl,
                 publisher=publisher,
                 container_type=self.map_container_type(release_type),
-                name=clean(journal_title, force_xml=True),
+                name=clean_str(journal_title, force_xml=True),
             )
             ce_edit = self.create_container(ce)
             container_id = ce_edit.ident
@@ -166,13 +166,13 @@ class JstorImporter(EntityImporter):
             for c in cgroup.find_all("contrib"):
                 given = c.find("given-names")
                 if given:
-                    given = clean(given.get_text().replace("\n", " "))
+                    given = clean_str(given.get_text().replace("\n", " "))
                 surname = c.find("surname")
                 if surname:
-                    surname = clean(surname.get_text().replace("\n", " "))
+                    surname = clean_str(surname.get_text().replace("\n", " "))
                 raw_name = c.find("string-name")
                 if raw_name:
-                    raw_name = clean(raw_name.get_text().replace("\n", " "))
+                    raw_name = clean_str(raw_name.get_text().replace("\n", " "))
 
                 if not raw_name:
                     if given and surname:
@@ -234,7 +234,7 @@ class JstorImporter(EntityImporter):
 
         # JSTOR issue-id
         if article_meta.find("issue-id"):
-            issue_id = clean(article_meta.find("issue-id").string)
+            issue_id = clean_str(article_meta.find("issue-id").string)
             if issue_id:
                 extra_jstor["issue_id"] = issue_id
 
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 430cdd0f..f3d82a86 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -4,7 +4,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, CreatorEntity
 
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
 
 
 def value_or_none(e: Any) -> Any:
@@ -65,14 +67,14 @@ class OrcidImporter(EntityImporter):
         if not self.is_orcid(orcid):
             sys.stderr.write("Bad ORCID: {}\n".format(orcid))
             return None
-        display = clean(display)
+        display = clean_str(display)
         if not display:
             # must have *some* name
             return None
         ce = CreatorEntity(
             orcid=orcid,
-            given_name=clean(given),
-            surname=clean(sur),
+            given_name=clean_str(given),
+            surname=clean_str(sur),
             display_name=display,
             extra=extra,
         )
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index d32fcefa..3274234f 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,9 +8,16 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid
+from fatcat_tools.normal import (
+    LANG_MAP_MARC,
+    clean_doi,
+    clean_issn,
+    clean_pmcid,
+    clean_pmid,
+    clean_str,
+)
 
-from .common import LANG_MAP_MARC, EntityImporter, clean
+from .common import EntityImporter
 
 # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
 PUBMED_RELEASE_TYPE_MAP = {
@@ -704,14 +711,14 @@ class PubmedImporter(EntityImporter):
         if extra_pubmed:
             extra["pubmed"] = extra_pubmed
 
-        title = clean(title)
+        title = clean_str(title)
         if not title:
             return None
 
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
             title=title,
-            original_title=clean(original_title),
+            original_title=clean_str(original_title),
             release_type=release_type,
             release_stage=release_stage,
             release_date=release_date,
author	Bryan Newbold <bnewbold@robocracy.org>	2021-11-10 13:23:12 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2021-11-10 13:28:28 -0800
commit	16e9979a6f347b49764c1141209e84083ea81057 (patch)
tree	ccc3d35607cadac4933e9b28366bedf5a605c122 /python/fatcat_tools/importers
parent	ab4e1355bf93e3755985f1b5cd2589a78601d253 (diff)
download	fatcat-16e9979a6f347b49764c1141209e84083ea81057.tar.gz fatcat-16e9979a6f347b49764c1141209e84083ea81057.zip