diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-10 13:23:12 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-10 13:28:28 -0800 |
commit | 16e9979a6f347b49764c1141209e84083ea81057 (patch) | |
tree | ccc3d35607cadac4933e9b28366bedf5a605c122 /python | |
parent | ab4e1355bf93e3755985f1b5cd2589a78601d253 (diff) | |
download | fatcat-16e9979a6f347b49764c1141209e84083ea81057.tar.gz fatcat-16e9979a6f347b49764c1141209e84083ea81057.zip |
importers: refactor imports of clean() and other normalization helpers
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/importers/chocula.py | 8 | ||||
-rw-r--r-- | python/fatcat_tools/importers/common.py | 5 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 56 | ||||
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 24 | ||||
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 30 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 22 | ||||
-rw-r--r-- | python/fatcat_tools/importers/journal_metadata.py | 8 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jstor.py | 14 | ||||
-rw-r--r-- | python/fatcat_tools/importers/orcid.py | 10 | ||||
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 15 |
12 files changed, 104 insertions, 95 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 223ae526..4d4d696b 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -15,7 +15,6 @@ from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter from .arxiv import ArxivRawImporter from .chocula import ChoculaImporter from .common import ( - LANG_MAP_MARC, Bs4XmlFileListPusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, @@ -27,8 +26,6 @@ from .common import ( KafkaJsonPusher, LinePusher, SqlitePusher, - clean, - is_cjk, make_kafka_consumer, ) from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index b4a4d9ed..92289bb3 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, FileEntity -from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url +from fatcat_tools.normal import b32_hex + +from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, make_rel_url ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL" diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 842c7853..c44fec3b 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ContainerEntity -from .common import EntityImporter, clean +from fatcat_tools.normal import clean_str + +from .common import EntityImporter class ChoculaImporter(EntityImporter): @@ -40,7 +42,7 @@ class ChoculaImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - name = clean(row.get("name")) + name = clean_str(row.get("name")) if not name: # Name is required (by schema) return None @@ -85,7 +87,7 @@ class ChoculaImporter(EntityImporter): ident=row["ident"], name=name, container_type=container_type, - publisher=clean(row.get("publisher")), + publisher=clean_str(row.get("publisher")), wikidata_qid=row.get("wikidata_qid"), extra=extra, ) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 425b6f13..56c3d32e 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -27,10 +27,7 @@ from fatcat_openapi_client import ( from fatcat_openapi_client.rest import ApiException from fuzzycat.matching import match_release_fuzzy -# TODO: refactor so remove need for this (re-imports for backwards compatibility) -from fatcat_tools.normal import is_cjk # noqa: F401 -from fatcat_tools.normal import LANG_MAP_MARC, b32_hex, clean_doi # noqa: F401 -from fatcat_tools.normal import clean_str as clean # noqa: F401 +from fatcat_tools.normal import clean_doi from fatcat_tools.transforms import entity_to_dict DATE_FMT: str = "%Y-%m-%d" diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index c9f251fc..8f5a4265 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -4,9 +4,9 @@ from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.normal import clean_doi, clean_str -from .common import EntityImporter, clean +from .common import EntityImporter # The docs/guide should be the canonical home for these mappings; update there # first @@ -232,21 +232,21 @@ class CrossrefImporter(EntityImporter): if len(affiliation_list) > 1: # note: affiliation => more_affiliations extra["more_affiliations"] = [ - clean(a["name"]) for a in affiliation_list[1:] + clean_str(a["name"]) for a in affiliation_list[1:] ] if am.get("sequence") and am.get("sequence") != "additional": - extra["seq"] = clean(am.get("sequence")) + extra["seq"] = clean_str(am.get("sequence")) assert ctype in ("author", "editor", "translator") - raw_name = clean(raw_name) + raw_name = clean_str(raw_name) # TODO: what if 'raw_name' is None? contribs.append( ReleaseContrib( creator_id=creator_id, index=index, raw_name=raw_name, - given_name=clean(am.get("given")), - surname=clean(am.get("family")), - raw_affiliation=clean(raw_affiliation), + given_name=clean_str(am.get("given")), + surname=clean_str(am.get("family")), + raw_affiliation=clean_str(raw_affiliation), role=ctype, extra=extra or None, ) @@ -263,11 +263,11 @@ class CrossrefImporter(EntityImporter): container_id = None if issnl: container_id = self.lookup_issnl(issnl) - publisher = clean(obj.get("publisher")) + publisher = clean_str(obj.get("publisher")) container_name = obj.get("container-title") if container_name: - container_name = clean(container_name[0], force_xml=True) + container_name = clean_str(container_name[0], force_xml=True) if not container_name: container_name = None if ( @@ -323,7 +323,7 @@ class CrossrefImporter(EntityImporter): ref_extra["journal-title"] = rm["journal-title"] if rm.get("DOI"): ref_extra["doi"] = rm.get("DOI").lower() - author = clean(rm.get("author")) + author = clean_str(rm.get("author")) if author: ref_extra["authors"] = [author] for k in ( @@ -347,8 +347,8 @@ class CrossrefImporter(EntityImporter): "series-title", "volume-title", ): - if clean(rm.get(k)): - ref_extra[k] = clean(rm[k]) + if clean_str(rm.get(k)): + ref_extra[k] = clean_str(rm[k]) refs.append( fatcat_openapi_client.ReleaseRef( index=i, @@ -356,9 +356,9 @@ class CrossrefImporter(EntityImporter): target_release_id=None, key=key, year=year, - container_name=clean(ref_container_name), - title=clean(rm.get("article-title")), - locator=clean(rm.get("first-page")), + container_name=clean_str(ref_container_name), + title=clean_str(rm.get("article-title")), + locator=clean_str(rm.get("first-page")), # TODO: just dump JSON somewhere here? extra=ref_extra or None, ) @@ -366,7 +366,7 @@ class CrossrefImporter(EntityImporter): # abstracts abstracts = [] - abstract = clean(obj.get("abstract")) + abstract = clean_str(obj.get("abstract")) if abstract and len(abstract) > 10: abstracts.append( fatcat_openapi_client.ReleaseAbstract( @@ -387,9 +387,9 @@ class CrossrefImporter(EntityImporter): if type(val) == list: val = val[0] if type(val) == str: - val = clean(val) + val = clean_str(val) if val: - extra[key] = clean(val) + extra[key] = clean_str(val) else: extra[key] = val # crossref-nested extra keys @@ -397,14 +397,14 @@ class CrossrefImporter(EntityImporter): val = obj.get(key) if val: if type(val) == str: - extra_crossref[key] = clean(val) + extra_crossref[key] = clean_str(val) else: extra_crossref[key] = val if license_extra: extra_crossref["license"] = license_extra if len(obj["title"]) > 1: - aliases = [clean(t) for t in obj["title"][1:]] + aliases = [clean_str(t) for t in obj["title"][1:]] aliases = [t for t in aliases if t] if aliases: extra["aliases"] = aliases @@ -459,11 +459,11 @@ class CrossrefImporter(EntityImporter): if obj.get("original-title"): ot = obj.get("original-title") if ot is not None: - original_title = clean(ot[0], force_xml=True) + original_title = clean_str(ot[0], force_xml=True) title: Optional[str] = None if obj.get("title"): - title = clean(obj["title"][0], force_xml=True) + title = clean_str(obj["title"][0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character self.counts["skip-blank-title"] += 1 @@ -476,7 +476,7 @@ class CrossrefImporter(EntityImporter): subtitle = None if obj.get("subtitle"): - subtitle = clean(obj["subtitle"][0], force_xml=True) + subtitle = clean_str(obj["subtitle"][0], force_xml=True) if not subtitle or len(subtitle) <= 1: # subtitle can't be just a single character subtitle = None @@ -499,10 +499,10 @@ class CrossrefImporter(EntityImporter): doi=doi, isbn13=isbn13, ), - volume=clean(obj.get("volume")), - issue=clean(obj.get("issue")), - pages=clean(obj.get("page")), - language=clean(obj.get("language")), + volume=clean_str(obj.get("volume")), + issue=clean_str(obj.get("issue")), + pages=clean_str(obj.get("page")), + language=clean_str(obj.get("language")), license_slug=license_slug, extra=extra or None, abstracts=abstracts or None, diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index fe02cac4..441514b8 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -21,10 +21,10 @@ import langdetect import pycountry from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.normal import clean_doi, clean_str from fatcat_tools.transforms import entity_to_dict -from .common import EntityImporter, clean +from .common import EntityImporter # Cutoff length for abstracts. MAX_ABSTRACT_LENGTH = 2048 @@ -322,7 +322,7 @@ class DataciteImporter(EntityImporter): print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False - title = clean(title) + title = clean_str(title) if not title: print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False @@ -341,7 +341,7 @@ class DataciteImporter(EntityImporter): if not subtitle: subtitle = None else: - subtitle = clean(subtitle) + subtitle = clean_str(subtitle) # Dates. A few internal dates (registered, created, updated) and # published (0..2554). We try to work with typed date list, in @@ -399,7 +399,7 @@ class DataciteImporter(EntityImporter): publisher = None if publisher: - publisher = clean(publisher) + publisher = clean_str(publisher) # Container. For the moment, only ISSN as container. container_id = None @@ -460,10 +460,10 @@ class DataciteImporter(EntityImporter): issue = container.get("issue") if volume: - volume = clean(volume) + volume = clean_str(volume) if issue: - issue = clean(issue) + issue = clean_str(issue) # Pages. pages = None @@ -548,7 +548,7 @@ class DataciteImporter(EntityImporter): "[{}] language detection failed with {} on {}".format(doi, err, text), file=sys.stderr, ) - abstract_text = clean(text) + abstract_text = clean_str(text) if not abstract_text: continue abstracts.append( @@ -874,14 +874,14 @@ class DataciteImporter(EntityImporter): if len(affiliations) == 0: raw_affiliation = None else: - raw_affiliation = clean(affiliations[0]) + raw_affiliation = clean_str(affiliations[0]) name = c.get("name") given_name = c.get("givenName") surname = c.get("familyName") if name: - name = clean(name) + name = clean_str(name) if not any((name, given_name, surname)): continue if not name: @@ -895,8 +895,8 @@ class DataciteImporter(EntityImporter): name = index_form_to_display_name(name) if given_name: - given_name = clean(given_name) - surname = clean(surname) + given_name = clean_str(given_name) + surname = clean_str(surname) # Perform a final assertion that name does not reduce to zero # (e.g. whitespace only name). diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 7c595787..9db499a0 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.normal import clean_doi, clean_str -from .common import EntityImporter, clean, make_rel_url +from .common import EntityImporter, make_rel_url MAX_ABSTRACT_BYTES = 4096 @@ -86,7 +86,7 @@ class GrobidMetadataImporter(EntityImporter): abstract = obj.get("abstract") if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: abobj = fatcat_openapi_client.ReleaseAbstract( - mimetype="text/plain", content=clean(obj.get("abstract")) + mimetype="text/plain", content=clean_str(obj.get("abstract")) ) abstracts = [abobj] else: @@ -97,9 +97,9 @@ class GrobidMetadataImporter(EntityImporter): contribs.append( fatcat_openapi_client.ReleaseContrib( index=i, - raw_name=clean(a["name"]), - given_name=clean(a.get("given_name")), - surname=clean(a.get("surname")), + raw_name=clean_str(a["name"]), + given_name=clean_str(a.get("given_name")), + surname=clean_str(a.get("surname")), role="author", extra=None, ) @@ -116,15 +116,15 @@ class GrobidMetadataImporter(EntityImporter): pass for key in ("volume", "url", "issue", "publisher"): if raw.get(key): - cite_extra[key] = clean(raw[key]) + cite_extra[key] = clean_str(raw[key]) if raw.get("authors"): - cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]] + cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]] refs.append( fatcat_openapi_client.ReleaseRef( - key=clean(raw.get("id")), + key=clean_str(raw.get("id")), year=year, - title=clean(raw["title"]), + title=clean_str(raw["title"]), extra=cite_extra or None, ) ) @@ -140,7 +140,7 @@ class GrobidMetadataImporter(EntityImporter): if doi: extra["doi"] = doi if obj["journal"] and obj["journal"].get("name"): - extra["container_name"] = clean(obj["journal"]["name"]) + extra["container_name"] = clean_str(obj["journal"]["name"]) # TODO: ISSN/eISSN handling? or just journal name lookup? @@ -149,7 +149,7 @@ class GrobidMetadataImporter(EntityImporter): if self.longtail_oa: extra["longtail_oa"] = True - clean_title = clean(obj["title"], force_xml=True) + clean_title = clean_str(obj["title"], force_xml=True) if not clean_title or len(clean_title) < 2: return None title = clean_title @@ -161,9 +161,9 @@ class GrobidMetadataImporter(EntityImporter): release_year=release_year, contribs=contribs, refs=refs, - publisher=clean(obj["journal"].get("publisher")), - volume=clean(obj["journal"].get("volume")), - issue=clean(obj["journal"].get("issue")), + publisher=clean_str(obj["journal"].get("publisher")), + volume=clean_str(obj["journal"].get("volume")), + issue=clean_str(obj["journal"].get("issue")), abstracts=abstracts or None, ext_ids=fatcat_openapi_client.ReleaseExtIds(), extra=extra or None, diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index a737ac9f..9916a55f 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -6,9 +6,9 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.normal import clean_doi, clean_str, is_cjk -from .common import DATE_FMT, EntityImporter, clean, is_cjk +from .common import DATE_FMT, EntityImporter # TODO: should be List[Tag] not List[Any] for full type annotations @@ -36,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]: for raw in raw_persons: name = raw.find("name") or None if name: - name = clean(name.get_text().replace("\n", " ")) + name = clean_str(name.get_text().replace("\n", " ")) surname = raw.find("familyName") or None if surname: - surname = clean(surname.get_text().replace("\n", " ")) + surname = clean_str(surname.get_text().replace("\n", " ")) given_name = raw.find("givenName") or None if given_name: - given_name = clean(given_name.get_text().replace("\n", " ")) + given_name = clean_str(given_name.get_text().replace("\n", " ")) lang = "en" if is_cjk(name): lang = "ja" @@ -230,16 +230,16 @@ class JalcImporter(EntityImporter): for p in record.find_all("publicationName") if p.get_text() ] - pubs = [clean(p) for p in pubs if p] + pubs = [clean_str(p) for p in pubs if p] assert pubs if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] if len(pubs) > 1 and is_cjk(pubs[0]): # eng/jpn ordering is not reliable pubs = [pubs[1], pubs[0]] - container_name = clean(pubs[0]) + container_name = clean_str(pubs[0]) if len(pubs) > 1: - container_extra["original_name"] = clean(pubs[1]) + container_extra["original_name"] = clean_str(pubs[1]) if record.publisher: pubs = [ @@ -254,7 +254,7 @@ class JalcImporter(EntityImporter): # ordering is not reliable pubs = [pubs[1], pubs[0]] if pubs: - publisher = clean(pubs[0]) + publisher = clean_str(pubs[0]) if len(pubs) > 1: container_extra["publisher_aliases"] = pubs[1:] @@ -296,14 +296,14 @@ class JalcImporter(EntityImporter): # (informally) extra["jalc"] = extra_jalc - title = clean(title) + title = clean_str(title) if not title: return None re = ReleaseEntity( work_id=None, title=title, - original_title=clean(original_title), + original_title=clean_str(original_title), release_type=release_type, release_stage="published", release_date=release_date, diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index a45e49f3..fc1dfcbd 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ContainerEntity -from .common import EntityImporter, clean +from fatcat_tools.normal import clean_str + +from .common import EntityImporter def or_none(s: Optional[str]) -> Optional[str]: @@ -105,7 +107,7 @@ class JournalMetadataImporter(EntityImporter): if extra_ia: extra["ia"] = extra_ia - name = clean(row.get("name")) + name = clean_str(row.get("name")) if not name: return None @@ -115,7 +117,7 @@ class JournalMetadataImporter(EntityImporter): issnp=row.get("issnp"), container_type=None, # TODO name=name, - publisher=clean(row.get("publisher")), + publisher=clean_str(row.get("publisher")), wikidata_qid=None, # TODO extra=extra, ) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index ca1f2466..c2f650b0 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -8,9 +8,9 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.normal import LANG_MAP_MARC, clean_doi, clean_str -from .common import LANG_MAP_MARC, EntityImporter, clean +from .common import EntityImporter from .crossref import CONTAINER_TYPE_MAP # TODO: more entries? @@ -140,7 +140,7 @@ class JstorImporter(EntityImporter): issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=clean(journal_title, force_xml=True), + name=clean_str(journal_title, force_xml=True), ) ce_edit = self.create_container(ce) container_id = ce_edit.ident @@ -166,13 +166,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.get_text().replace("\n", " ")) + given = clean_str(given.get_text().replace("\n", " ")) surname = c.find("surname") if surname: - surname = clean(surname.get_text().replace("\n", " ")) + surname = clean_str(surname.get_text().replace("\n", " ")) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.get_text().replace("\n", " ")) + raw_name = clean_str(raw_name.get_text().replace("\n", " ")) if not raw_name: if given and surname: @@ -234,7 +234,7 @@ class JstorImporter(EntityImporter): # JSTOR issue-id if article_meta.find("issue-id"): - issue_id = clean(article_meta.find("issue-id").string) + issue_id = clean_str(article_meta.find("issue-id").string) if issue_id: extra_jstor["issue_id"] = issue_id diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 430cdd0f..f3d82a86 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -4,7 +4,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, CreatorEntity -from .common import EntityImporter, clean +from fatcat_tools.normal import clean_str + +from .common import EntityImporter def value_or_none(e: Any) -> Any: @@ -65,14 +67,14 @@ class OrcidImporter(EntityImporter): if not self.is_orcid(orcid): sys.stderr.write("Bad ORCID: {}\n".format(orcid)) return None - display = clean(display) + display = clean_str(display) if not display: # must have *some* name return None ce = CreatorEntity( orcid=orcid, - given_name=clean(given), - surname=clean(sur), + given_name=clean_str(given), + surname=clean_str(sur), display_name=display, extra=extra, ) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index d32fcefa..3274234f 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -8,9 +8,16 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid +from fatcat_tools.normal import ( + LANG_MAP_MARC, + clean_doi, + clean_issn, + clean_pmcid, + clean_pmid, + clean_str, +) -from .common import LANG_MAP_MARC, EntityImporter, clean +from .common import EntityImporter # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly PUBMED_RELEASE_TYPE_MAP = { @@ -704,14 +711,14 @@ class PubmedImporter(EntityImporter): if extra_pubmed: extra["pubmed"] = extra_pubmed - title = clean(title) + title = clean_str(title) if not title: return None re = fatcat_openapi_client.ReleaseEntity( work_id=None, title=title, - original_title=clean(original_title), + original_title=clean_str(original_title), release_type=release_type, release_stage=release_stage, release_date=release_date, |