about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers/jalc.py
diff options
context:
space:
mode:
author: bnewbold <bnewbold@archive.org> 2020-04-01 22:03:19 +0000
committer: bnewbold <bnewbold@archive.org> 2020-04-01 22:03:19 +0000
commit: 32f195cec41459045f3d3453dad7a97b38d4e288 (patch)
tree: ab166daf8686472ed9641d96ab055f37ee89d71c /python/fatcat_tools/importers/jalc.py
parent: 0e2025091d0c974a888a5bc741495951c952ccda (diff)
parent: 938d2c5366d80618b839c83baadc9b5c62d10dce (diff)
download: fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.tar.gz
download: fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.zip
Merge branch 'bnewbold-pubmed-get_text' into 'master'
beautifulsoup XML parsing: .string vs. .get_text()

See merge request webgroup/fatcat!40
Diffstat (limited to 'python/fatcat_tools/importers/jalc.py')
-rw-r--r-- python/fatcat_tools/importers/jalc.py 14
1 file changed, 7 insertions, 7 deletions
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 351a20a3..c2adc0d6 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -35,13 +35,13 @@ def parse_jalc_persons(raw_persons):
for raw in raw_persons:
name = raw.find('name') or None
if name:
- name = clean(name.string)
+ name = clean(name.get_text().replace('\n', ' '))
surname = raw.find('familyName') or None
if surname:
- surname = clean(surname.string)
+ surname = clean(surname.get_text().replace('\n', ' '))
given_name = raw.find('givenName') or None
if given_name:
- given_name = clean(given_name.string)
+ given_name = clean(given_name.get_text().replace('\n', ' '))
lang = 'en'
if is_cjk(name):
lang = 'ja'
@@ -163,12 +163,12 @@ class JalcImporter(EntityImporter):
titles = record.find_all("title")
if not titles:
return None
- title = titles[0].string.strip()
+ title = titles[0].get_text().replace('\n', ' ').strip()
original_title = None
if title.endswith('.'):
title = title[:-1]
if len(titles) > 1:
- original_title = titles[1].string.strip()
+ original_title = titles[1].get_text().replace('\n', ' ').strip()
if original_title.endswith('.'):
original_title = original_title[:-1]
@@ -242,7 +242,7 @@ class JalcImporter(EntityImporter):
container_extra = dict()
if record.publicationName:
- pubs = [p.string.strip() for p in record.find_all("publicationName") if p.string]
+ pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()]
pubs = [clean(p) for p in pubs if p]
assert(pubs)
if len(pubs) > 1 and pubs[0] == pubs[1]:
@@ -255,7 +255,7 @@ class JalcImporter(EntityImporter):
container_extra['original_name'] = clean(pubs[1])
if record.publisher:
- pubs = [p.string.strip() for p in record.find_all("publisher") if p.string]
+ pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()]
pubs = [p for p in pubs if p]
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]