diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-10 13:23:12 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-10 13:28:28 -0800 |
commit | 16e9979a6f347b49764c1141209e84083ea81057 (patch) | |
tree | ccc3d35607cadac4933e9b28366bedf5a605c122 /python/fatcat_tools/importers/jalc.py | |
parent | ab4e1355bf93e3755985f1b5cd2589a78601d253 (diff) | |
download | fatcat-16e9979a6f347b49764c1141209e84083ea81057.tar.gz fatcat-16e9979a6f347b49764c1141209e84083ea81057.zip |
importers: refactor imports of clean() and other normalization helpers
Diffstat (limited to 'python/fatcat_tools/importers/jalc.py')
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 22 |
1 files changed, 11 insertions, 11 deletions
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index a737ac9f..9916a55f 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -6,9 +6,9 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.normal import clean_doi, clean_str, is_cjk -from .common import DATE_FMT, EntityImporter, clean, is_cjk +from .common import DATE_FMT, EntityImporter # TODO: should be List[Tag] not List[Any] for full type annotations @@ -36,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]: for raw in raw_persons: name = raw.find("name") or None if name: - name = clean(name.get_text().replace("\n", " ")) + name = clean_str(name.get_text().replace("\n", " ")) surname = raw.find("familyName") or None if surname: - surname = clean(surname.get_text().replace("\n", " ")) + surname = clean_str(surname.get_text().replace("\n", " ")) given_name = raw.find("givenName") or None if given_name: - given_name = clean(given_name.get_text().replace("\n", " ")) + given_name = clean_str(given_name.get_text().replace("\n", " ")) lang = "en" if is_cjk(name): lang = "ja" @@ -230,16 +230,16 @@ class JalcImporter(EntityImporter): for p in record.find_all("publicationName") if p.get_text() ] - pubs = [clean(p) for p in pubs if p] + pubs = [clean_str(p) for p in pubs if p] assert pubs if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] if len(pubs) > 1 and is_cjk(pubs[0]): # eng/jpn ordering is not reliable pubs = [pubs[1], pubs[0]] - container_name = clean(pubs[0]) + container_name = clean_str(pubs[0]) if len(pubs) > 1: - container_extra["original_name"] = clean(pubs[1]) + container_extra["original_name"] = clean_str(pubs[1]) if record.publisher: pubs = [ @@ -254,7 +254,7 @@ class JalcImporter(EntityImporter): # ordering is not reliable pubs = [pubs[1], pubs[0]] if pubs: - publisher = clean(pubs[0]) + publisher = clean_str(pubs[0]) if len(pubs) > 1: container_extra["publisher_aliases"] = pubs[1:] @@ -296,14 +296,14 @@ class JalcImporter(EntityImporter): # (informally) extra["jalc"] = extra_jalc - title = clean(title) + title = clean_str(title) if not title: return None re = ReleaseEntity( work_id=None, title=title, - original_title=clean(original_title), + original_title=clean_str(original_title), release_type=release_type, release_stage="published", release_date=release_date, |