pubmed: bunch of .get_text() instead of .string

Yikes! Apparently when a tag has child tags, .string will return None instead of all the strings, whereas .get_text() returns all of it. Docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text and https://www.crummy.com/software/BeautifulSoup/bs4/doc/#string -- I've left things like identifiers as .string, where we expect only a single string inside.
author: Bryan Newbold <bnewbold@robocracy.org> 2020-03-28 20:01:46 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-03-28 20:01:48 -0700
commit: d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125 (patch)
tree: 924f23ae748b94d8604a0e92f952ddf9562a5c93
parent: 4b75a81cbd0faeefa6a0f04b97ecc6832924ee69 (diff)
download: fatcat-d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125.tar.gz
fatcat-d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125.zip
1 files changed, 12 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3ecf5ef4..3e9527d4 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -392,7 +392,7 @@ class PubmedImporter(EntityImporter):
         if pages:
             pages = pages.string
 
-        title = medline.Article.ArticleTitle.string # always present
+        title = medline.Article.ArticleTitle.get_text() # always present
         if title:
             if title.endswith('.'):
                 title = title[:-1]
@@ -406,20 +406,20 @@ class PubmedImporter(EntityImporter):
 
         original_title = medline.Article.find("VernacularTitle", recurse=False)
         if original_title:
-            original_title = original_title.string or None
+            original_title = original_title.get_text() or None
             if original_title and original_title.endswith('.'):
                 original_title = original_title[:-1]
 
         # TODO: happening in alpha order, not handling multi-language well.
         language = medline.Article.Language
         if language:
-            language = language.string
+            language = language.get_text()
             if language in ("und", "un"):
                 # "undetermined"
                 language = None
             else:
                 language = LANG_MAP_MARC.get(language)
-                if not language and not (medline.Article.Language.string in LANG_MAP_MARC):
+                if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC):
                     warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
 
         ### Journal/Issue Metadata
@@ -479,7 +479,7 @@ class PubmedImporter(EntityImporter):
                 print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)
 
         if journal.find("Title"):
-            container_name = journal.Title.string
+            container_name = journal.Title.get_text()
 
         if (container_id is None and self.create_containers and (issnl is not None)
                 and container_name):
@@ -558,15 +558,15 @@ class PubmedImporter(EntityImporter):
                 surname = None
                 raw_name = None
                 if author.ForeName:
-                    given_name = author.ForeName.string
+                    given_name = author.ForeName.get_text()
                 if author.LastName:
-                    surname = author.LastName.string
+                    surname = author.LastName.get_text()
                 if given_name and surname:
                     raw_name = "{} {}".format(given_name, surname)
                 elif surname:
                     raw_name = surname
-                if not raw_name and author.CollectiveName and author.CollectiveName.string:
-                    raw_name = author.CollectiveName.string
+                if not raw_name and author.CollectiveName and author.CollectiveName.get_text():
+                    raw_name = author.CollectiveName.get_text()
                 contrib_extra = dict()
                 orcid = author.find("Identifier", Source="ORCID")
                 if orcid:
@@ -588,9 +588,9 @@ class PubmedImporter(EntityImporter):
                 affiliations = author.find_all("Affiliation")
                 raw_affiliation = None
                 if affiliations:
-                    raw_affiliation = affiliations[0].string
+                    raw_affiliation = affiliations[0].get_text()
                     if len(affiliations) > 1:
-                        contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]]
+                        contrib_extra['more_affiliations'] = [ra.get_text() for ra in affiliations[1:]]
                 if author.find("EqualContrib"):
                     # TODO: schema for this?
                     contrib_extra['equal'] = True
@@ -638,7 +638,7 @@ class PubmedImporter(EntityImporter):
                         ref_release_id = self.lookup_pmid(ref_pmid)
                 ref_raw = ref.Citation
                 if ref_raw:
-                    ref_extra['unstructured'] = ref_raw.string
+                    ref_extra['unstructured'] = ref_raw.get_text()
                 if not ref_extra:
                     ref_extra = None
                 refs.append(fatcat_openapi_client.ReleaseRef(
author	Bryan Newbold <bnewbold@robocracy.org>	2020-03-28 20:01:46 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2020-03-28 20:01:48 -0700
commit	d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125 (patch)
tree	924f23ae748b94d8604a0e92f952ddf9562a5c93
parent	4b75a81cbd0faeefa6a0f04b97ecc6832924ee69 (diff)
download	fatcat-d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125.tar.gz fatcat-d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125.zip