summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/jstor.py
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2020-04-01 22:03:19 +0000
committerbnewbold <bnewbold@archive.org>2020-04-01 22:03:19 +0000
commit32f195cec41459045f3d3453dad7a97b38d4e288 (patch)
treeab166daf8686472ed9641d96ab055f37ee89d71c /python/fatcat_tools/importers/jstor.py
parent0e2025091d0c974a888a5bc741495951c952ccda (diff)
parent938d2c5366d80618b839c83baadc9b5c62d10dce (diff)
downloadfatcat-32f195cec41459045f3d3453dad7a97b38d4e288.tar.gz
fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.zip
Merge branch 'bnewbold-pubmed-get_text' into 'master'
beautifulsoup XML parsing: .string vs. .get_text(). See merge request webgroup/fatcat!40
Diffstat (limited to 'python/fatcat_tools/importers/jstor.py')
-rw-r--r--python/fatcat_tools/importers/jstor.py18
1 files changed, 9 insertions, 9 deletions
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 5ff1ecd9..96dbf947 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -63,13 +63,13 @@ class JstorImporter(EntityImporter):
release_type = JSTOR_TYPE_MAP.get(article['article-type'])
title = article_meta.find("article-title")
- if title and title.string:
- title = title.string.strip()
- elif title and not title.string:
+ if title and title.get_text():
+ title = title.get_text().replace('\n', ' ').strip()
+ elif title and not title.get_text():
title = None
if not title and release_type.startswith('review') and article_meta.product.source:
- title = "Review: {}".format(article_meta.product.source.string)
+ title = "Review: {}".format(article_meta.product.source.get_text().replace('\n', ' '))
if not title:
return None
@@ -96,8 +96,8 @@ class JstorImporter(EntityImporter):
if journal_ids:
extra_jstor['journal_ids'] = journal_ids
- journal_title = journal_meta.find("journal-title").string
- publisher = journal_meta.find("publisher-name").string
+ journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ')
+ publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ')
issn = journal_meta.find("issn")
if issn:
issn = issn.string
@@ -141,13 +141,13 @@ class JstorImporter(EntityImporter):
for c in cgroup.find_all("contrib"):
given = c.find("given-names")
if given:
- given = clean(given.string)
+ given = clean(given.get_text().replace('\n', ' '))
surname = c.find("surname")
if surname:
- surname = clean(surname.string)
+ surname = clean(surname.get_text().replace('\n', ' '))
raw_name = c.find("string-name")
if raw_name:
- raw_name = clean(raw_name.string)
+ raw_name = clean(raw_name.get_text().replace('\n', ' '))
if not raw_name:
if given and surname: