diff options
Diffstat (limited to 'python/fatcat_tools/importers/pubmed.py')
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 12 |
1 file changed, 7 insertions, 5 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3e9527d4..62bb1ddb 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -394,6 +394,7 @@ class PubmedImporter(EntityImporter): title = medline.Article.ArticleTitle.get_text() # always present if title: + title = title.replace('\n', ' ') if title.endswith('.'): title = title[:-1] # this hides some "special" titles, but the vast majority are @@ -407,6 +408,7 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: original_title = original_title.get_text() or None + original_title = original_title.replace('\n', ' ') if original_title and original_title.endswith('.'): original_title = original_title[:-1] @@ -558,15 +560,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.get_text() + given_name = author.ForeName.get_text().replace('\n', ' ') if author.LastName: - surname = author.LastName.get_text() + surname = author.LastName.get_text().replace('\n', ' ') if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): - raw_name = author.CollectiveName.get_text() + raw_name = author.CollectiveName.get_text().replace('\n', ' ') contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -588,9 +590,9 @@ class PubmedImporter(EntityImporter): affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].get_text() + raw_affiliation = affiliations[0].get_text().replace('\n', ' ') if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.get_text() for ra in affiliations[1:]] + contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in 
affiliations[1:]] if author.find("EqualContrib"): # TODO: schema for this? contrib_extra['equal'] = True |