summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers/jalc.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:23:12 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-10 13:28:28 -0800
commit: 16e9979a6f347b49764c1141209e84083ea81057 (patch)
tree: ccc3d35607cadac4933e9b28366bedf5a605c122 /python/fatcat_tools/importers/jalc.py
parent: ab4e1355bf93e3755985f1b5cd2589a78601d253 (diff)
download: fatcat-16e9979a6f347b49764c1141209e84083ea81057.tar.gz
download: fatcat-16e9979a6f347b49764c1141209e84083ea81057.zip
importers: refactor imports of clean() and other normalization helpers
Diffstat (limited to 'python/fatcat_tools/importers/jalc.py')
-rw-r--r-- python/fatcat_tools/importers/jalc.py | 22
1 file changed, 11 insertions, 11 deletions
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index a737ac9f..9916a55f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -6,9 +6,9 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str, is_cjk
-from .common import DATE_FMT, EntityImporter, clean, is_cjk
+from .common import DATE_FMT, EntityImporter
# TODO: should be List[Tag] not List[Any] for full type annotations
@@ -36,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
for raw in raw_persons:
name = raw.find("name") or None
if name:
- name = clean(name.get_text().replace("\n", " "))
+ name = clean_str(name.get_text().replace("\n", " "))
surname = raw.find("familyName") or None
if surname:
- surname = clean(surname.get_text().replace("\n", " "))
+ surname = clean_str(surname.get_text().replace("\n", " "))
given_name = raw.find("givenName") or None
if given_name:
- given_name = clean(given_name.get_text().replace("\n", " "))
+ given_name = clean_str(given_name.get_text().replace("\n", " "))
lang = "en"
if is_cjk(name):
lang = "ja"
@@ -230,16 +230,16 @@ class JalcImporter(EntityImporter):
for p in record.find_all("publicationName")
if p.get_text()
]
- pubs = [clean(p) for p in pubs if p]
+ pubs = [clean_str(p) for p in pubs if p]
assert pubs
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
if len(pubs) > 1 and is_cjk(pubs[0]):
# eng/jpn ordering is not reliable
pubs = [pubs[1], pubs[0]]
- container_name = clean(pubs[0])
+ container_name = clean_str(pubs[0])
if len(pubs) > 1:
- container_extra["original_name"] = clean(pubs[1])
+ container_extra["original_name"] = clean_str(pubs[1])
if record.publisher:
pubs = [
@@ -254,7 +254,7 @@ class JalcImporter(EntityImporter):
# ordering is not reliable
pubs = [pubs[1], pubs[0]]
if pubs:
- publisher = clean(pubs[0])
+ publisher = clean_str(pubs[0])
if len(pubs) > 1:
container_extra["publisher_aliases"] = pubs[1:]
@@ -296,14 +296,14 @@ class JalcImporter(EntityImporter):
# (informally)
extra["jalc"] = extra_jalc
- title = clean(title)
+ title = clean_str(title)
if not title:
return None
re = ReleaseEntity(
work_id=None,
title=title,
- original_title=clean(original_title),
+ original_title=clean_str(original_title),
release_type=release_type,
release_stage="published",
release_date=release_date,