diff options
author | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
commit | 32f195cec41459045f3d3453dad7a97b38d4e288 (patch) | |
tree | ab166daf8686472ed9641d96ab055f37ee89d71c /python/fatcat_tools/importers/pubmed.py | |
parent | 0e2025091d0c974a888a5bc741495951c952ccda (diff) | |
parent | 938d2c5366d80618b839c83baadc9b5c62d10dce (diff) | |
download | fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.tar.gz fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.zip |
Merge branch 'bnewbold-pubmed-get_text' into 'master'
beautifulsoup XML parsing: .string vs. .get_text()
See merge request webgroup/fatcat!40
Diffstat (limited to 'python/fatcat_tools/importers/pubmed.py')
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 32 |
1 files changed, 20 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3ecf5ef4..abcb21d9 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -392,8 +392,9 @@ class PubmedImporter(EntityImporter): if pages: pages = pages.string - title = medline.Article.ArticleTitle.string # always present + title = medline.Article.ArticleTitle.get_text() # always present if title: + title = title.replace('\n', ' ') if title.endswith('.'): title = title[:-1] # this hides some "special" titles, but the vast majority are @@ -406,20 +407,27 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: - original_title = original_title.string or None + original_title = original_title.get_text() or None + original_title = original_title.replace('\n', ' ') if original_title and original_title.endswith('.'): original_title = original_title[:-1] + if original_title and not title: + # if we only have an "original" title, but not translated/english + # title, sub in the original title so the entity can be created + title = original_title + original_title = None + # TODO: happening in alpha order, not handling multi-language well. language = medline.Article.Language if language: - language = language.string + language = language.get_text() if language in ("und", "un"): # "undetermined" language = None else: language = LANG_MAP_MARC.get(language) - if not language and not (medline.Article.Language.string in LANG_MAP_MARC): + if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC): warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) ### Journal/Issue Metadata @@ -479,7 +487,7 @@ class PubmedImporter(EntityImporter): print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) if journal.find("Title"): - container_name = journal.Title.string + container_name = journal.Title.get_text() if (container_id is None and self.create_containers and (issnl is not None) and container_name): @@ -558,15 +566,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.string + given_name = author.ForeName.get_text().replace('\n', ' ') if author.LastName: - surname = author.LastName.string + surname = author.LastName.get_text().replace('\n', ' ') if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname - if not raw_name and author.CollectiveName and author.CollectiveName.string: - raw_name = author.CollectiveName.string + if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): + raw_name = author.CollectiveName.get_text().replace('\n', ' ') contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -588,9 +596,9 @@ class PubmedImporter(EntityImporter): affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].string + raw_affiliation = affiliations[0].get_text().replace('\n', ' ') if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]] + contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]] if author.find("EqualContrib"): # TODO: schema for this? contrib_extra['equal'] = True @@ -638,7 +646,7 @@ class PubmedImporter(EntityImporter): ref_release_id = self.lookup_pmid(ref_pmid) ref_raw = ref.Citation if ref_raw: - ref_extra['unstructured'] = ref_raw.string + ref_extra['unstructured'] = ref_raw.get_text() if not ref_extra: ref_extra = None refs.append(fatcat_openapi_client.ReleaseRef( |