diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-03-28 20:01:46 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-03-28 20:01:48 -0700 |
commit | d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125 (patch) | |
tree | 924f23ae748b94d8604a0e92f952ddf9562a5c93 /python/fatcat_tools | |
parent | 4b75a81cbd0faeefa6a0f04b97ecc6832924ee69 (diff) | |
download | fatcat-d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125.tar.gz fatcat-d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125.zip |
pubmed: bunch of .get_text() instead of .string
Yikes! Apparently when a tag has child tags, .string will return None
instead of all the strings. .get_text() returns all of it:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#string
I've left things like identifiers as .string, where we expect only a single
string inside.
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 24 |
1 files changed, 12 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3ecf5ef4..3e9527d4 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -392,7 +392,7 @@ class PubmedImporter(EntityImporter): if pages: pages = pages.string - title = medline.Article.ArticleTitle.string # always present + title = medline.Article.ArticleTitle.get_text() # always present if title: if title.endswith('.'): title = title[:-1] @@ -406,20 +406,20 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: - original_title = original_title.string or None + original_title = original_title.get_text() or None if original_title and original_title.endswith('.'): original_title = original_title[:-1] # TODO: happening in alpha order, not handling multi-language well. language = medline.Article.Language if language: - language = language.string + language = language.get_text() if language in ("und", "un"): # "undetermined" language = None else: language = LANG_MAP_MARC.get(language) - if not language and not (medline.Article.Language.string in LANG_MAP_MARC): + if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC): warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) ### Journal/Issue Metadata @@ -479,7 +479,7 @@ class PubmedImporter(EntityImporter): print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) if journal.find("Title"): - container_name = journal.Title.string + container_name = journal.Title.get_text() if (container_id is None and self.create_containers and (issnl is not None) and container_name): @@ -558,15 +558,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.string + given_name = author.ForeName.get_text() if author.LastName: - surname = author.LastName.string + surname = 
author.LastName.get_text() if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname - if not raw_name and author.CollectiveName and author.CollectiveName.string: - raw_name = author.CollectiveName.string + if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): + raw_name = author.CollectiveName.get_text() contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -588,9 +588,9 @@ class PubmedImporter(EntityImporter): affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].string + raw_affiliation = affiliations[0].get_text() if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]] + contrib_extra['more_affiliations'] = [ra.get_text() for ra in affiliations[1:]] if author.find("EqualContrib"): # TODO: schema for this? contrib_extra['equal'] = True @@ -638,7 +638,7 @@ class PubmedImporter(EntityImporter): ref_release_id = self.lookup_pmid(ref_pmid) ref_raw = ref.Citation if ref_raw: - ref_extra['unstructured'] = ref_raw.string + ref_extra['unstructured'] = ref_raw.get_text() if not ref_extra: ref_extra = None refs.append(fatcat_openapi_client.ReleaseRef( |