diff options
author | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
commit | 32f195cec41459045f3d3453dad7a97b38d4e288 (patch) | |
tree | ab166daf8686472ed9641d96ab055f37ee89d71c /python/fatcat_tools/importers/jstor.py | |
parent | 0e2025091d0c974a888a5bc741495951c952ccda (diff) | |
parent | 938d2c5366d80618b839c83baadc9b5c62d10dce (diff) | |
download | fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.tar.gz fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.zip |
Merge branch 'bnewbold-pubmed-get_text' into 'master'
beautifulsoup XML parsing: .string vs. .get_text()
See merge request webgroup/fatcat!40
Diffstat (limited to 'python/fatcat_tools/importers/jstor.py')
-rw-r--r-- | python/fatcat_tools/importers/jstor.py | 18 |
1 files changed, 9 insertions, 9 deletions
```diff
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 5ff1ecd9..96dbf947 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -63,13 +63,13 @@ class JstorImporter(EntityImporter):
         release_type = JSTOR_TYPE_MAP.get(article['article-type'])
         title = article_meta.find("article-title")
-        if title and title.string:
-            title = title.string.strip()
-        elif title and not title.string:
+        if title and title.get_text():
+            title = title.get_text().replace('\n', ' ').strip()
+        elif title and not title.get_text():
             title = None
         if not title and release_type.startswith('review') and article_meta.product.source:
-            title = "Review: {}".format(article_meta.product.source.string)
+            title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text())
         if not title:
             return None
@@ -96,8 +96,8 @@ class JstorImporter(EntityImporter):
         if journal_ids:
             extra_jstor['journal_ids'] = journal_ids
-        journal_title = journal_meta.find("journal-title").string
-        publisher = journal_meta.find("publisher-name").string
+        journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ')
+        publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ')
         issn = journal_meta.find("issn")
         if issn:
             issn = issn.string
@@ -141,13 +141,13 @@ class JstorImporter(EntityImporter):
             for c in cgroup.find_all("contrib"):
                 given = c.find("given-names")
                 if given:
-                    given = clean(given.string)
+                    given = clean(given.get_text().replace('\n', ' '))
                 surname = c.find("surname")
                 if surname:
-                    surname = clean(surname.string)
+                    surname = clean(surname.get_text().replace('\n', ' '))
                 raw_name = c.find("string-name")
                 if raw_name:
-                    raw_name = clean(raw_name.string)
+                    raw_name = clean(raw_name.get_text().replace('\n', ' '))
                 if not raw_name:
                     if given and surname:
```