about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers/arxiv.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-04-01 12:02:20 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-04-01 12:02:20 -0700
commit: f77a553350238c8ccc9c3bc0edcf47fb9dd067b3 (patch)
tree: 9cd3fac4da944cd859491d3593a149923948133b /python/fatcat_tools/importers/arxiv.py
parent: 6681500eeffe39b7d029a0e0d6b2ed83729f555f (diff)
download: fatcat-f77a553350238c8ccc9c3bc0edcf47fb9dd067b3.tar.gz
download: fatcat-f77a553350238c8ccc9c3bc0edcf47fb9dd067b3.zip
importers: replace newlines in get_text() strings
Diffstat (limited to 'python/fatcat_tools/importers/arxiv.py')
-rw-r--r-- python/fatcat_tools/importers/arxiv.py | 8
1 files changed, 4 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 79b242c4..719592fc 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -118,13 +118,13 @@ class ArxivRawImporter(EntityImporter):
if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]):
sys.stderr.write("BOGUS DOI: {}\n".format(doi))
doi = None
- title = latex_to_text(metadata.title.get_text())
- authors = parse_arxiv_authors(metadata.authors.get_text())
+ title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
+ authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' '))
contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)]
lang = "en" # the vast majority in english
if metadata.comments and metadata.comments.get_text():
- comments = metadata.comments.get_text().strip()
+ comments = metadata.comments.get_text().replace('\n', ' ').strip()
extra_arxiv['comments'] = comments
if 'in french' in comments.lower():
lang = 'fr'
@@ -146,7 +146,7 @@ class ArxivRawImporter(EntityImporter):
number = None
if metadata.find('journal-ref') and metadata.find('journal-ref').get_text():
- journal_ref = metadata.find('journal-ref').get_text().strip()
+ journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip()
extra_arxiv['journal_ref'] = journal_ref
if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
release_type = "paper-conference"