about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:23:12 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:28:28 -0800
commit: 16e9979a6f347b49764c1141209e84083ea81057 (patch)
tree: ccc3d35607cadac4933e9b28366bedf5a605c122
parent: ab4e1355bf93e3755985f1b5cd2589a78601d253 (diff)
download: fatcat-16e9979a6f347b49764c1141209e84083ea81057.tar.gz
download: fatcat-16e9979a6f347b49764c1141209e84083ea81057.zip
importers: refactor imports of clean() and other normalization helpers
-rw-r--r-- python/fatcat_tools/importers/__init__.py | 3
-rw-r--r-- python/fatcat_tools/importers/arabesque.py | 4
-rw-r--r-- python/fatcat_tools/importers/chocula.py | 8
-rw-r--r-- python/fatcat_tools/importers/common.py | 5
-rw-r--r-- python/fatcat_tools/importers/crossref.py | 56
-rw-r--r-- python/fatcat_tools/importers/datacite.py | 24
-rw-r--r-- python/fatcat_tools/importers/grobid_metadata.py | 30
-rw-r--r-- python/fatcat_tools/importers/jalc.py | 22
-rw-r--r-- python/fatcat_tools/importers/journal_metadata.py | 8
-rw-r--r-- python/fatcat_tools/importers/jstor.py | 14
-rw-r--r-- python/fatcat_tools/importers/orcid.py | 10
-rw-r--r-- python/fatcat_tools/importers/pubmed.py | 15
12 files changed, 104 insertions, 95 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 223ae526..4d4d696b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -15,7 +15,6 @@ from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
from .arxiv import ArxivRawImporter
from .chocula import ChoculaImporter
from .common import (
- LANG_MAP_MARC,
Bs4XmlFileListPusher,
Bs4XmlFilePusher,
Bs4XmlLargeFilePusher,
@@ -27,8 +26,6 @@ from .common import (
KafkaJsonPusher,
LinePusher,
SqlitePusher,
- clean,
- is_cjk,
make_kafka_consumer,
)
from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index b4a4d9ed..92289bb3 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity
-from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
+from fatcat_tools.normal import b32_hex
+
+from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, make_rel_url
ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 842c7853..c44fec3b 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ContainerEntity
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
class ChoculaImporter(EntityImporter):
@@ -40,7 +42,7 @@ class ChoculaImporter(EntityImporter):
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
- name = clean(row.get("name"))
+ name = clean_str(row.get("name"))
if not name:
# Name is required (by schema)
return None
@@ -85,7 +87,7 @@ class ChoculaImporter(EntityImporter):
ident=row["ident"],
name=name,
container_type=container_type,
- publisher=clean(row.get("publisher")),
+ publisher=clean_str(row.get("publisher")),
wikidata_qid=row.get("wikidata_qid"),
extra=extra,
)
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 425b6f13..56c3d32e 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,10 +27,7 @@ from fatcat_openapi_client import (
from fatcat_openapi_client.rest import ApiException
from fuzzycat.matching import match_release_fuzzy
-# TODO: refactor so remove need for this (re-imports for backwards compatibility)
-from fatcat_tools.normal import is_cjk # noqa: F401
-from fatcat_tools.normal import LANG_MAP_MARC, b32_hex, clean_doi # noqa: F401
-from fatcat_tools.normal import clean_str as clean # noqa: F401
+from fatcat_tools.normal import clean_doi
from fatcat_tools.transforms import entity_to_dict
DATE_FMT: str = "%Y-%m-%d"
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index c9f251fc..8f5a4265 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,9 +4,9 @@ from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
-from .common import EntityImporter, clean
+from .common import EntityImporter
# The docs/guide should be the canonical home for these mappings; update there
# first
@@ -232,21 +232,21 @@ class CrossrefImporter(EntityImporter):
if len(affiliation_list) > 1:
# note: affiliation => more_affiliations
extra["more_affiliations"] = [
- clean(a["name"]) for a in affiliation_list[1:]
+ clean_str(a["name"]) for a in affiliation_list[1:]
]
if am.get("sequence") and am.get("sequence") != "additional":
- extra["seq"] = clean(am.get("sequence"))
+ extra["seq"] = clean_str(am.get("sequence"))
assert ctype in ("author", "editor", "translator")
- raw_name = clean(raw_name)
+ raw_name = clean_str(raw_name)
# TODO: what if 'raw_name' is None?
contribs.append(
ReleaseContrib(
creator_id=creator_id,
index=index,
raw_name=raw_name,
- given_name=clean(am.get("given")),
- surname=clean(am.get("family")),
- raw_affiliation=clean(raw_affiliation),
+ given_name=clean_str(am.get("given")),
+ surname=clean_str(am.get("family")),
+ raw_affiliation=clean_str(raw_affiliation),
role=ctype,
extra=extra or None,
)
@@ -263,11 +263,11 @@ class CrossrefImporter(EntityImporter):
container_id = None
if issnl:
container_id = self.lookup_issnl(issnl)
- publisher = clean(obj.get("publisher"))
+ publisher = clean_str(obj.get("publisher"))
container_name = obj.get("container-title")
if container_name:
- container_name = clean(container_name[0], force_xml=True)
+ container_name = clean_str(container_name[0], force_xml=True)
if not container_name:
container_name = None
if (
@@ -323,7 +323,7 @@ class CrossrefImporter(EntityImporter):
ref_extra["journal-title"] = rm["journal-title"]
if rm.get("DOI"):
ref_extra["doi"] = rm.get("DOI").lower()
- author = clean(rm.get("author"))
+ author = clean_str(rm.get("author"))
if author:
ref_extra["authors"] = [author]
for k in (
@@ -347,8 +347,8 @@ class CrossrefImporter(EntityImporter):
"series-title",
"volume-title",
):
- if clean(rm.get(k)):
- ref_extra[k] = clean(rm[k])
+ if clean_str(rm.get(k)):
+ ref_extra[k] = clean_str(rm[k])
refs.append(
fatcat_openapi_client.ReleaseRef(
index=i,
@@ -356,9 +356,9 @@ class CrossrefImporter(EntityImporter):
target_release_id=None,
key=key,
year=year,
- container_name=clean(ref_container_name),
- title=clean(rm.get("article-title")),
- locator=clean(rm.get("first-page")),
+ container_name=clean_str(ref_container_name),
+ title=clean_str(rm.get("article-title")),
+ locator=clean_str(rm.get("first-page")),
# TODO: just dump JSON somewhere here?
extra=ref_extra or None,
)
@@ -366,7 +366,7 @@ class CrossrefImporter(EntityImporter):
# abstracts
abstracts = []
- abstract = clean(obj.get("abstract"))
+ abstract = clean_str(obj.get("abstract"))
if abstract and len(abstract) > 10:
abstracts.append(
fatcat_openapi_client.ReleaseAbstract(
@@ -387,9 +387,9 @@ class CrossrefImporter(EntityImporter):
if type(val) == list:
val = val[0]
if type(val) == str:
- val = clean(val)
+ val = clean_str(val)
if val:
- extra[key] = clean(val)
+ extra[key] = clean_str(val)
else:
extra[key] = val
# crossref-nested extra keys
@@ -397,14 +397,14 @@ class CrossrefImporter(EntityImporter):
val = obj.get(key)
if val:
if type(val) == str:
- extra_crossref[key] = clean(val)
+ extra_crossref[key] = clean_str(val)
else:
extra_crossref[key] = val
if license_extra:
extra_crossref["license"] = license_extra
if len(obj["title"]) > 1:
- aliases = [clean(t) for t in obj["title"][1:]]
+ aliases = [clean_str(t) for t in obj["title"][1:]]
aliases = [t for t in aliases if t]
if aliases:
extra["aliases"] = aliases
@@ -459,11 +459,11 @@ class CrossrefImporter(EntityImporter):
if obj.get("original-title"):
ot = obj.get("original-title")
if ot is not None:
- original_title = clean(ot[0], force_xml=True)
+ original_title = clean_str(ot[0], force_xml=True)
title: Optional[str] = None
if obj.get("title"):
- title = clean(obj["title"][0], force_xml=True)
+ title = clean_str(obj["title"][0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
self.counts["skip-blank-title"] += 1
@@ -476,7 +476,7 @@ class CrossrefImporter(EntityImporter):
subtitle = None
if obj.get("subtitle"):
- subtitle = clean(obj["subtitle"][0], force_xml=True)
+ subtitle = clean_str(obj["subtitle"][0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
subtitle = None
@@ -499,10 +499,10 @@ class CrossrefImporter(EntityImporter):
doi=doi,
isbn13=isbn13,
),
- volume=clean(obj.get("volume")),
- issue=clean(obj.get("issue")),
- pages=clean(obj.get("page")),
- language=clean(obj.get("language")),
+ volume=clean_str(obj.get("volume")),
+ issue=clean_str(obj.get("issue")),
+ pages=clean_str(obj.get("page")),
+ language=clean_str(obj.get("language")),
license_slug=license_slug,
extra=extra or None,
abstracts=abstracts or None,
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fe02cac4..441514b8 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -21,10 +21,10 @@ import langdetect
import pycountry
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
from fatcat_tools.transforms import entity_to_dict
-from .common import EntityImporter, clean
+from .common import EntityImporter
# Cutoff length for abstracts.
MAX_ABSTRACT_LENGTH = 2048
@@ -322,7 +322,7 @@ class DataciteImporter(EntityImporter):
print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
return False
- title = clean(title)
+ title = clean_str(title)
if not title:
print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
return False
@@ -341,7 +341,7 @@ class DataciteImporter(EntityImporter):
if not subtitle:
subtitle = None
else:
- subtitle = clean(subtitle)
+ subtitle = clean_str(subtitle)
# Dates. A few internal dates (registered, created, updated) and
# published (0..2554). We try to work with typed date list, in
@@ -399,7 +399,7 @@ class DataciteImporter(EntityImporter):
publisher = None
if publisher:
- publisher = clean(publisher)
+ publisher = clean_str(publisher)
# Container. For the moment, only ISSN as container.
container_id = None
@@ -460,10 +460,10 @@ class DataciteImporter(EntityImporter):
issue = container.get("issue")
if volume:
- volume = clean(volume)
+ volume = clean_str(volume)
if issue:
- issue = clean(issue)
+ issue = clean_str(issue)
# Pages.
pages = None
@@ -548,7 +548,7 @@ class DataciteImporter(EntityImporter):
"[{}] language detection failed with {} on {}".format(doi, err, text),
file=sys.stderr,
)
- abstract_text = clean(text)
+ abstract_text = clean_str(text)
if not abstract_text:
continue
abstracts.append(
@@ -874,14 +874,14 @@ class DataciteImporter(EntityImporter):
if len(affiliations) == 0:
raw_affiliation = None
else:
- raw_affiliation = clean(affiliations[0])
+ raw_affiliation = clean_str(affiliations[0])
name = c.get("name")
given_name = c.get("givenName")
surname = c.get("familyName")
if name:
- name = clean(name)
+ name = clean_str(name)
if not any((name, given_name, surname)):
continue
if not name:
@@ -895,8 +895,8 @@ class DataciteImporter(EntityImporter):
name = index_form_to_display_name(name)
if given_name:
- given_name = clean(given_name)
- surname = clean(surname)
+ given_name = clean_str(given_name)
+ surname = clean_str(surname)
# Perform a final assertion that name does not reduce to zero
# (e.g. whitespace only name).
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 7c595787..9db499a0 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
-from .common import EntityImporter, clean, make_rel_url
+from .common import EntityImporter, make_rel_url
MAX_ABSTRACT_BYTES = 4096
@@ -86,7 +86,7 @@ class GrobidMetadataImporter(EntityImporter):
abstract = obj.get("abstract")
if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
abobj = fatcat_openapi_client.ReleaseAbstract(
- mimetype="text/plain", content=clean(obj.get("abstract"))
+ mimetype="text/plain", content=clean_str(obj.get("abstract"))
)
abstracts = [abobj]
else:
@@ -97,9 +97,9 @@ class GrobidMetadataImporter(EntityImporter):
contribs.append(
fatcat_openapi_client.ReleaseContrib(
index=i,
- raw_name=clean(a["name"]),
- given_name=clean(a.get("given_name")),
- surname=clean(a.get("surname")),
+ raw_name=clean_str(a["name"]),
+ given_name=clean_str(a.get("given_name")),
+ surname=clean_str(a.get("surname")),
role="author",
extra=None,
)
@@ -116,15 +116,15 @@ class GrobidMetadataImporter(EntityImporter):
pass
for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
- cite_extra[key] = clean(raw[key])
+ cite_extra[key] = clean_str(raw[key])
if raw.get("authors"):
- cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
+ cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]]
refs.append(
fatcat_openapi_client.ReleaseRef(
- key=clean(raw.get("id")),
+ key=clean_str(raw.get("id")),
year=year,
- title=clean(raw["title"]),
+ title=clean_str(raw["title"]),
extra=cite_extra or None,
)
)
@@ -140,7 +140,7 @@ class GrobidMetadataImporter(EntityImporter):
if doi:
extra["doi"] = doi
if obj["journal"] and obj["journal"].get("name"):
- extra["container_name"] = clean(obj["journal"]["name"])
+ extra["container_name"] = clean_str(obj["journal"]["name"])
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -149,7 +149,7 @@ class GrobidMetadataImporter(EntityImporter):
if self.longtail_oa:
extra["longtail_oa"] = True
- clean_title = clean(obj["title"], force_xml=True)
+ clean_title = clean_str(obj["title"], force_xml=True)
if not clean_title or len(clean_title) < 2:
return None
title = clean_title
@@ -161,9 +161,9 @@ class GrobidMetadataImporter(EntityImporter):
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=clean(obj["journal"].get("publisher")),
- volume=clean(obj["journal"].get("volume")),
- issue=clean(obj["journal"].get("issue")),
+ publisher=clean_str(obj["journal"].get("publisher")),
+ volume=clean_str(obj["journal"].get("volume")),
+ issue=clean_str(obj["journal"].get("issue")),
abstracts=abstracts or None,
ext_ids=fatcat_openapi_client.ReleaseExtIds(),
extra=extra or None,
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index a737ac9f..9916a55f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -6,9 +6,9 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str, is_cjk
-from .common import DATE_FMT, EntityImporter, clean, is_cjk
+from .common import DATE_FMT, EntityImporter
# TODO: should be List[Tag] not List[Any] for full type annotations
@@ -36,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
for raw in raw_persons:
name = raw.find("name") or None
if name:
- name = clean(name.get_text().replace("\n", " "))
+ name = clean_str(name.get_text().replace("\n", " "))
surname = raw.find("familyName") or None
if surname:
- surname = clean(surname.get_text().replace("\n", " "))
+ surname = clean_str(surname.get_text().replace("\n", " "))
given_name = raw.find("givenName") or None
if given_name:
- given_name = clean(given_name.get_text().replace("\n", " "))
+ given_name = clean_str(given_name.get_text().replace("\n", " "))
lang = "en"
if is_cjk(name):
lang = "ja"
@@ -230,16 +230,16 @@ class JalcImporter(EntityImporter):
for p in record.find_all("publicationName")
if p.get_text()
]
- pubs = [clean(p) for p in pubs if p]
+ pubs = [clean_str(p) for p in pubs if p]
assert pubs
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
if len(pubs) > 1 and is_cjk(pubs[0]):
# eng/jpn ordering is not reliable
pubs = [pubs[1], pubs[0]]
- container_name = clean(pubs[0])
+ container_name = clean_str(pubs[0])
if len(pubs) > 1:
- container_extra["original_name"] = clean(pubs[1])
+ container_extra["original_name"] = clean_str(pubs[1])
if record.publisher:
pubs = [
@@ -254,7 +254,7 @@ class JalcImporter(EntityImporter):
# ordering is not reliable
pubs = [pubs[1], pubs[0]]
if pubs:
- publisher = clean(pubs[0])
+ publisher = clean_str(pubs[0])
if len(pubs) > 1:
container_extra["publisher_aliases"] = pubs[1:]
@@ -296,14 +296,14 @@ class JalcImporter(EntityImporter):
# (informally)
extra["jalc"] = extra_jalc
- title = clean(title)
+ title = clean_str(title)
if not title:
return None
re = ReleaseEntity(
work_id=None,
title=title,
- original_title=clean(original_title),
+ original_title=clean_str(original_title),
release_type=release_type,
release_stage="published",
release_date=release_date,
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index a45e49f3..fc1dfcbd 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ContainerEntity
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
def or_none(s: Optional[str]) -> Optional[str]:
@@ -105,7 +107,7 @@ class JournalMetadataImporter(EntityImporter):
if extra_ia:
extra["ia"] = extra_ia
- name = clean(row.get("name"))
+ name = clean_str(row.get("name"))
if not name:
return None
@@ -115,7 +117,7 @@ class JournalMetadataImporter(EntityImporter):
issnp=row.get("issnp"),
container_type=None, # TODO
name=name,
- publisher=clean(row.get("publisher")),
+ publisher=clean_str(row.get("publisher")),
wikidata_qid=None, # TODO
extra=extra,
)
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index ca1f2466..c2f650b0 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,9 +8,9 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import LANG_MAP_MARC, clean_doi, clean_str
-from .common import LANG_MAP_MARC, EntityImporter, clean
+from .common import EntityImporter
from .crossref import CONTAINER_TYPE_MAP
# TODO: more entries?
@@ -140,7 +140,7 @@ class JstorImporter(EntityImporter):
issnl=issnl,
publisher=publisher,
container_type=self.map_container_type(release_type),
- name=clean(journal_title, force_xml=True),
+ name=clean_str(journal_title, force_xml=True),
)
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
@@ -166,13 +166,13 @@ class JstorImporter(EntityImporter):
for c in cgroup.find_all("contrib"):
given = c.find("given-names")
if given:
- given = clean(given.get_text().replace("\n", " "))
+ given = clean_str(given.get_text().replace("\n", " "))
surname = c.find("surname")
if surname:
- surname = clean(surname.get_text().replace("\n", " "))
+ surname = clean_str(surname.get_text().replace("\n", " "))
raw_name = c.find("string-name")
if raw_name:
- raw_name = clean(raw_name.get_text().replace("\n", " "))
+ raw_name = clean_str(raw_name.get_text().replace("\n", " "))
if not raw_name:
if given and surname:
@@ -234,7 +234,7 @@ class JstorImporter(EntityImporter):
# JSTOR issue-id
if article_meta.find("issue-id"):
- issue_id = clean(article_meta.find("issue-id").string)
+ issue_id = clean_str(article_meta.find("issue-id").string)
if issue_id:
extra_jstor["issue_id"] = issue_id
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 430cdd0f..f3d82a86 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -4,7 +4,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, CreatorEntity
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
def value_or_none(e: Any) -> Any:
@@ -65,14 +67,14 @@ class OrcidImporter(EntityImporter):
if not self.is_orcid(orcid):
sys.stderr.write("Bad ORCID: {}\n".format(orcid))
return None
- display = clean(display)
+ display = clean_str(display)
if not display:
# must have *some* name
return None
ce = CreatorEntity(
orcid=orcid,
- given_name=clean(given),
- surname=clean(sur),
+ given_name=clean_str(given),
+ surname=clean_str(sur),
display_name=display,
extra=extra,
)
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index d32fcefa..3274234f 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,9 +8,16 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid
+from fatcat_tools.normal import (
+ LANG_MAP_MARC,
+ clean_doi,
+ clean_issn,
+ clean_pmcid,
+ clean_pmid,
+ clean_str,
+)
-from .common import LANG_MAP_MARC, EntityImporter, clean
+from .common import EntityImporter
# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
PUBMED_RELEASE_TYPE_MAP = {
@@ -704,14 +711,14 @@ class PubmedImporter(EntityImporter):
if extra_pubmed:
extra["pubmed"] = extra_pubmed
- title = clean(title)
+ title = clean_str(title)
if not title:
return None
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
title=title,
- original_title=clean(original_title),
+ original_title=clean_str(original_title),
release_type=release_type,
release_stage=release_stage,
release_date=release_date,