diff options
author | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
commit | 32f195cec41459045f3d3453dad7a97b38d4e288 (patch) | |
tree | ab166daf8686472ed9641d96ab055f37ee89d71c /python/fatcat_tools/importers/arxiv.py | |
parent | 0e2025091d0c974a888a5bc741495951c952ccda (diff) | |
parent | 938d2c5366d80618b839c83baadc9b5c62d10dce (diff) | |
download | fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.tar.gz fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.zip |
Merge branch 'bnewbold-pubmed-get_text' into 'master'
beautifulsoup XML parsing: .string vs. .get_text()
See merge request webgroup/fatcat!40
Diffstat (limited to 'python/fatcat_tools/importers/arxiv.py')
-rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 22 |
1 files changed, 11 insertions, 11 deletions
```diff
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index c69ee16a..719592fc 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -118,13 +118,13 @@ class ArxivRawImporter(EntityImporter):
             if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]):
                 sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                 doi = None
-        title = latex_to_text(metadata.title.string)
-        authors = parse_arxiv_authors(metadata.authors.string)
+        title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
+        authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' '))
         contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)]
 
         lang = "en"     # the vast majority in english
-        if metadata.comments and metadata.comments.string:
-            comments = metadata.comments.string.strip()
+        if metadata.comments and metadata.comments.get_text():
+            comments = metadata.comments.get_text().replace('\n', ' ').strip()
             extra_arxiv['comments'] = comments
             if 'in french' in comments.lower():
                 lang = 'fr'
@@ -145,8 +145,8 @@ class ArxivRawImporter(EntityImporter):
             # more languages?
 
         number = None
-        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
-            journal_ref = metadata.find('journal-ref').string.strip()
+        if metadata.find('journal-ref') and metadata.find('journal-ref').get_text():
+            journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip()
             extra_arxiv['journal_ref'] = journal_ref
             if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
                 release_type = "paper-conference"
@@ -160,16 +160,16 @@ class ArxivRawImporter(EntityImporter):
                 release_type = "report"
         if metadata.find('acm-class') and metadata.find('acm-class').string:
             extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
-        if metadata.categories and metadata.categories.string:
-            extra_arxiv['categories'] = metadata.categories.string.split()
+        if metadata.categories and metadata.categories.get_text():
+            extra_arxiv['categories'] = metadata.categories.get_text().split()
         license_slug = None
-        if metadata.license and metadata.license.string:
-            license_slug = lookup_license_slug(metadata.license.string)
+        if metadata.license and metadata.license.get_text():
+            license_slug = lookup_license_slug(metadata.license.get_text())
         abstracts = None
         if metadata.abstract:
             # TODO: test for this multi-abstract code path
             abstracts = []
-            abst = metadata.abstract.string.strip()
+            abst = metadata.abstract.get_text().strip()
             orig = None
             if '-----' in abst:
                 both = abst.split('-----')
```