diff options
author | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
commit | 32f195cec41459045f3d3453dad7a97b38d4e288 (patch) | |
tree | ab166daf8686472ed9641d96ab055f37ee89d71c /python/fatcat_tools/importers/jalc.py | |
parent | 0e2025091d0c974a888a5bc741495951c952ccda (diff) | |
parent | 938d2c5366d80618b839c83baadc9b5c62d10dce (diff) | |
download | fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.tar.gz fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.zip |
Merge branch 'bnewbold-pubmed-get_text' into 'master'
beautifulsoup XML parsing: .string vs. .get_text()
See merge request webgroup/fatcat!40
Diffstat (limited to 'python/fatcat_tools/importers/jalc.py')
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 14 |
1 files changed, 7 insertions, 7 deletions
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 351a20a3..c2adc0d6 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -35,13 +35,13 @@ def parse_jalc_persons(raw_persons):
     for raw in raw_persons:
         name = raw.find('name') or None
         if name:
-            name = clean(name.string)
+            name = clean(name.get_text().replace('\n', ' '))
         surname = raw.find('familyName') or None
         if surname:
-            surname = clean(surname.string)
+            surname = clean(surname.get_text().replace('\n', ' '))
         given_name = raw.find('givenName') or None
         if given_name:
-            given_name = clean(given_name.string)
+            given_name = clean(given_name.get_text().replace('\n', ' '))
         lang = 'en'
         if is_cjk(name):
             lang = 'ja'
@@ -163,12 +163,12 @@ class JalcImporter(EntityImporter):
         titles = record.find_all("title")
         if not titles:
             return None
-        title = titles[0].string.strip()
+        title = titles[0].get_text().replace('\n', ' ').strip()
         original_title = None
         if title.endswith('.'):
             title = title[:-1]
         if len(titles) > 1:
-            original_title = titles[1].string.strip()
+            original_title = titles[1].get_text().replace('\n', ' ').strip()
             if original_title.endswith('.'):
                 original_title = original_title[:-1]
 
@@ -242,7 +242,7 @@ class JalcImporter(EntityImporter):
         container_extra = dict()
 
         if record.publicationName:
-            pubs = [p.string.strip() for p in record.find_all("publicationName") if p.string]
+            pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()]
             pubs = [clean(p) for p in pubs if p]
             assert(pubs)
             if len(pubs) > 1 and pubs[0] == pubs[1]:
@@ -255,7 +255,7 @@ class JalcImporter(EntityImporter):
                 container_extra['original_name'] = clean(pubs[1])
 
         if record.publisher:
-            pubs = [p.string.strip() for p in record.find_all("publisher") if p.string]
+            pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()]
             pubs = [p for p in pubs if p]
             if len(pubs) > 1 and pubs[0] == pubs[1]:
                 pubs = [pubs[0]]