diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-04-01 12:02:20 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-04-01 12:02:20 -0700 |
commit | f77a553350238c8ccc9c3bc0edcf47fb9dd067b3 (patch) | |
tree | 9cd3fac4da944cd859491d3593a149923948133b /python/fatcat_tools/importers/jalc.py | |
parent | 6681500eeffe39b7d029a0e0d6b2ed83729f555f (diff) | |
download | fatcat-f77a553350238c8ccc9c3bc0edcf47fb9dd067b3.tar.gz fatcat-f77a553350238c8ccc9c3bc0edcf47fb9dd067b3.zip |
importers: replace newlines in get_text() strings
Diffstat (limited to 'python/fatcat_tools/importers/jalc.py')
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 14 |
1 files changed, 7 insertions, 7 deletions
```diff
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 51760f8a..c2adc0d6 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -35,13 +35,13 @@ def parse_jalc_persons(raw_persons):
     for raw in raw_persons:
         name = raw.find('name') or None
         if name:
-            name = clean(name.get_text())
+            name = clean(name.get_text().replace('\n', ' '))
         surname = raw.find('familyName') or None
         if surname:
-            surname = clean(surname.get_text())
+            surname = clean(surname.get_text().replace('\n', ' '))
         given_name = raw.find('givenName') or None
         if given_name:
-            given_name = clean(given_name.get_text())
+            given_name = clean(given_name.get_text().replace('\n', ' '))
         lang = 'en'
         if is_cjk(name):
             lang = 'ja'
@@ -163,12 +163,12 @@ class JalcImporter(EntityImporter):
         titles = record.find_all("title")
         if not titles:
             return None
-        title = titles[0].get_text().strip()
+        title = titles[0].get_text().replace('\n', ' ').strip()
         original_title = None
         if title.endswith('.'):
             title = title[:-1]
         if len(titles) > 1:
-            original_title = titles[1].get_text().strip()
+            original_title = titles[1].get_text().replace('\n', ' ').strip()
             if original_title.endswith('.'):
                 original_title = original_title[:-1]
 
@@ -242,7 +242,7 @@ class JalcImporter(EntityImporter):
         container_extra = dict()
 
         if record.publicationName:
-            pubs = [p.get_text().strip() for p in record.find_all("publicationName") if p.get_text()]
+            pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()]
             pubs = [clean(p) for p in pubs if p]
             assert(pubs)
             if len(pubs) > 1 and pubs[0] == pubs[1]:
@@ -255,7 +255,7 @@ class JalcImporter(EntityImporter):
                 container_extra['original_name'] = clean(pubs[1])
 
         if record.publisher:
-            pubs = [p.get_text().strip() for p in record.find_all("publisher") if p.get_text()]
+            pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()]
             pubs = [p for p in pubs if p]
             if len(pubs) > 1 and pubs[0] == pubs[1]:
                 pubs = [pubs[0]]
```