From d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 28 Mar 2020 20:01:46 -0700 Subject: pubmed: bunch of .get_text() instead of .string Yikes! Apparently when a tag has child tags, .string will return None instead of all the strings. .get_text() returns all of it: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text https://www.crummy.com/software/BeautifulSoup/bs4/doc/#string I've things like identifiers as .string, when we expect only a single string inside. --- python/fatcat_tools/importers/pubmed.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3ecf5ef4..3e9527d4 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -392,7 +392,7 @@ class PubmedImporter(EntityImporter): if pages: pages = pages.string - title = medline.Article.ArticleTitle.string # always present + title = medline.Article.ArticleTitle.get_text() # always present if title: if title.endswith('.'): title = title[:-1] @@ -406,20 +406,20 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: - original_title = original_title.string or None + original_title = original_title.get_text() or None if original_title and original_title.endswith('.'): original_title = original_title[:-1] # TODO: happening in alpha order, not handling multi-language well. 
language = medline.Article.Language if language: - language = language.string + language = language.get_text() if language in ("und", "un"): # "undetermined" language = None else: language = LANG_MAP_MARC.get(language) - if not language and not (medline.Article.Language.string in LANG_MAP_MARC): + if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC): warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) ### Journal/Issue Metadata @@ -479,7 +479,7 @@ class PubmedImporter(EntityImporter): print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) if journal.find("Title"): - container_name = journal.Title.string + container_name = journal.Title.get_text() if (container_id is None and self.create_containers and (issnl is not None) and container_name): @@ -558,15 +558,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.string + given_name = author.ForeName.get_text() if author.LastName: - surname = author.LastName.string + surname = author.LastName.get_text() if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname - if not raw_name and author.CollectiveName and author.CollectiveName.string: - raw_name = author.CollectiveName.string + if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): + raw_name = author.CollectiveName.get_text() contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -588,9 +588,9 @@ class PubmedImporter(EntityImporter): affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].string + raw_affiliation = affiliations[0].get_text() if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]] + contrib_extra['more_affiliations'] = [ra.get_text() for ra in affiliations[1:]] if 
author.find("EqualContrib"): # TODO: schema for this? contrib_extra['equal'] = True @@ -638,7 +638,7 @@ class PubmedImporter(EntityImporter): ref_release_id = self.lookup_pmid(ref_pmid) ref_raw = ref.Citation if ref_raw: - ref_extra['unstructured'] = ref_raw.string + ref_extra['unstructured'] = ref_raw.get_text() if not ref_extra: ref_extra = None refs.append(fatcat_openapi_client.ReleaseRef( -- cgit v1.2.3 From 6681500eeffe39b7d029a0e0d6b2ed83729f555f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 28 Mar 2020 20:12:54 -0700 Subject: importers: more string/get_text swaps See previous pubmed commit for details. --- python/fatcat_tools/importers/arxiv.py | 22 +++++++++++----------- python/fatcat_tools/importers/jalc.py | 14 +++++++------- python/fatcat_tools/importers/jstor.py | 18 +++++++++--------- 3 files changed, 27 insertions(+), 27 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index c69ee16a..79b242c4 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -118,13 +118,13 @@ class ArxivRawImporter(EntityImporter): if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]): sys.stderr.write("BOGUS DOI: {}\n".format(doi)) doi = None - title = latex_to_text(metadata.title.string) - authors = parse_arxiv_authors(metadata.authors.string) + title = latex_to_text(metadata.title.get_text()) + authors = parse_arxiv_authors(metadata.authors.get_text()) contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)] lang = "en" # the vast majority in english - if metadata.comments and metadata.comments.string: - comments = metadata.comments.string.strip() + if metadata.comments and metadata.comments.get_text(): + comments = metadata.comments.get_text().strip() extra_arxiv['comments'] = comments if 'in french' in comments.lower(): lang = 'fr' @@ -145,8 +145,8 @@ class 
ArxivRawImporter(EntityImporter): # more languages? number = None - if metadata.find('journal-ref') and metadata.find('journal-ref').string: - journal_ref = metadata.find('journal-ref').string.strip() + if metadata.find('journal-ref') and metadata.find('journal-ref').get_text(): + journal_ref = metadata.find('journal-ref').get_text().strip() extra_arxiv['journal_ref'] = journal_ref if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(): release_type = "paper-conference" @@ -160,16 +160,16 @@ class ArxivRawImporter(EntityImporter): release_type = "report" if metadata.find('acm-class') and metadata.find('acm-class').string: extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip() - if metadata.categories and metadata.categories.string: - extra_arxiv['categories'] = metadata.categories.string.split() + if metadata.categories and metadata.categories.get_text(): + extra_arxiv['categories'] = metadata.categories.get_text().split() license_slug = None - if metadata.license and metadata.license.string: - license_slug = lookup_license_slug(metadata.license.string) + if metadata.license and metadata.license.get_text(): + license_slug = lookup_license_slug(metadata.license.get_text()) abstracts = None if metadata.abstract: # TODO: test for this multi-abstract code path abstracts = [] - abst = metadata.abstract.string.strip() + abst = metadata.abstract.get_text().strip() orig = None if '-----' in abst: both = abst.split('-----') diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 351a20a3..51760f8a 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -35,13 +35,13 @@ def parse_jalc_persons(raw_persons): for raw in raw_persons: name = raw.find('name') or None if name: - name = clean(name.string) + name = clean(name.get_text()) surname = raw.find('familyName') or None if surname: - surname = clean(surname.string) + surname = clean(surname.get_text()) given_name = 
raw.find('givenName') or None if given_name: - given_name = clean(given_name.string) + given_name = clean(given_name.get_text()) lang = 'en' if is_cjk(name): lang = 'ja' @@ -163,12 +163,12 @@ class JalcImporter(EntityImporter): titles = record.find_all("title") if not titles: return None - title = titles[0].string.strip() + title = titles[0].get_text().strip() original_title = None if title.endswith('.'): title = title[:-1] if len(titles) > 1: - original_title = titles[1].string.strip() + original_title = titles[1].get_text().strip() if original_title.endswith('.'): original_title = original_title[:-1] @@ -242,7 +242,7 @@ class JalcImporter(EntityImporter): container_extra = dict() if record.publicationName: - pubs = [p.string.strip() for p in record.find_all("publicationName") if p.string] + pubs = [p.get_text().strip() for p in record.find_all("publicationName") if p.get_text()] pubs = [clean(p) for p in pubs if p] assert(pubs) if len(pubs) > 1 and pubs[0] == pubs[1]: @@ -255,7 +255,7 @@ class JalcImporter(EntityImporter): container_extra['original_name'] = clean(pubs[1]) if record.publisher: - pubs = [p.string.strip() for p in record.find_all("publisher") if p.string] + pubs = [p.get_text().strip() for p in record.find_all("publisher") if p.get_text()] pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 5ff1ecd9..184a0bb1 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -63,13 +63,13 @@ class JstorImporter(EntityImporter): release_type = JSTOR_TYPE_MAP.get(article['article-type']) title = article_meta.find("article-title") - if title and title.string: - title = title.string.strip() - elif title and not title.string: + if title and title.get_text(): + title = title.get_text().strip() + elif title and not title.get_text(): title = None if not title and 
release_type.startswith('review') and article_meta.product.source: - title = "Review: {}".format(article_meta.product.source.string) + title = "Review: {}".format(article_meta.product.source.get_text()) if not title: return None @@ -96,8 +96,8 @@ class JstorImporter(EntityImporter): if journal_ids: extra_jstor['journal_ids'] = journal_ids - journal_title = journal_meta.find("journal-title").string - publisher = journal_meta.find("publisher-name").string + journal_title = journal_meta.find("journal-title").get_text() + publisher = journal_meta.find("publisher-name").get_text() issn = journal_meta.find("issn") if issn: issn = issn.string @@ -141,13 +141,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.string) + given = clean(given.get_text()) surname = c.find("surname") if surname: - surname = clean(surname.string) + surname = clean(surname.get_text()) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.string) + raw_name = clean(raw_name.get_text()) if not raw_name: if given and surname: -- cgit v1.2.3 From f77a553350238c8ccc9c3bc0edcf47fb9dd067b3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Apr 2020 12:02:20 -0700 Subject: importers: replace newlines in get_text() strings --- python/fatcat_tools/importers/arxiv.py | 8 ++++---- python/fatcat_tools/importers/jalc.py | 14 +++++++------- python/fatcat_tools/importers/jstor.py | 14 +++++++------- python/fatcat_tools/importers/pubmed.py | 12 +++++++----- 4 files changed, 25 insertions(+), 23 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 79b242c4..719592fc 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -118,13 +118,13 @@ class ArxivRawImporter(EntityImporter): if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]): sys.stderr.write("BOGUS DOI: 
{}\n".format(doi)) doi = None - title = latex_to_text(metadata.title.get_text()) - authors = parse_arxiv_authors(metadata.authors.get_text()) + title = latex_to_text(metadata.title.get_text().replace('\n', ' ')) + authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' ')) contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)] lang = "en" # the vast majority in english if metadata.comments and metadata.comments.get_text(): - comments = metadata.comments.get_text().strip() + comments = metadata.comments.get_text().replace('\n', ' ').strip() extra_arxiv['comments'] = comments if 'in french' in comments.lower(): lang = 'fr' @@ -146,7 +146,7 @@ class ArxivRawImporter(EntityImporter): number = None if metadata.find('journal-ref') and metadata.find('journal-ref').get_text(): - journal_ref = metadata.find('journal-ref').get_text().strip() + journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip() extra_arxiv['journal_ref'] = journal_ref if "conf." in journal_ref.lower() or "proc." 
in journal_ref.lower(): release_type = "paper-conference" diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 51760f8a..c2adc0d6 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -35,13 +35,13 @@ def parse_jalc_persons(raw_persons): for raw in raw_persons: name = raw.find('name') or None if name: - name = clean(name.get_text()) + name = clean(name.get_text().replace('\n', ' ')) surname = raw.find('familyName') or None if surname: - surname = clean(surname.get_text()) + surname = clean(surname.get_text().replace('\n', ' ')) given_name = raw.find('givenName') or None if given_name: - given_name = clean(given_name.get_text()) + given_name = clean(given_name.get_text().replace('\n', ' ')) lang = 'en' if is_cjk(name): lang = 'ja' @@ -163,12 +163,12 @@ class JalcImporter(EntityImporter): titles = record.find_all("title") if not titles: return None - title = titles[0].get_text().strip() + title = titles[0].get_text().replace('\n', ' ').strip() original_title = None if title.endswith('.'): title = title[:-1] if len(titles) > 1: - original_title = titles[1].get_text().strip() + original_title = titles[1].get_text().replace('\n', ' ').strip() if original_title.endswith('.'): original_title = original_title[:-1] @@ -242,7 +242,7 @@ class JalcImporter(EntityImporter): container_extra = dict() if record.publicationName: - pubs = [p.get_text().strip() for p in record.find_all("publicationName") if p.get_text()] + pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()] pubs = [clean(p) for p in pubs if p] assert(pubs) if len(pubs) > 1 and pubs[0] == pubs[1]: @@ -255,7 +255,7 @@ class JalcImporter(EntityImporter): container_extra['original_name'] = clean(pubs[1]) if record.publisher: - pubs = [p.get_text().strip() for p in record.find_all("publisher") if p.get_text()] + pubs = [p.get_text().replace('\n', ' ').strip() for p in 
record.find_all("publisher") if p.get_text()] pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 184a0bb1..96dbf947 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -64,12 +64,12 @@ class JstorImporter(EntityImporter): release_type = JSTOR_TYPE_MAP.get(article['article-type']) title = article_meta.find("article-title") if title and title.get_text(): - title = title.get_text().strip() + title = title.get_text().replace('\n', ' ').strip() elif title and not title.get_text(): title = None if not title and release_type.startswith('review') and article_meta.product.source: - title = "Review: {}".format(article_meta.product.source.get_text()) + title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text()) if not title: return None @@ -96,8 +96,8 @@ class JstorImporter(EntityImporter): if journal_ids: extra_jstor['journal_ids'] = journal_ids - journal_title = journal_meta.find("journal-title").get_text() - publisher = journal_meta.find("publisher-name").get_text() + journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ') + publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ') issn = journal_meta.find("issn") if issn: issn = issn.string @@ -141,13 +141,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.get_text()) + given = clean(given.get_text().replace('\n', ' ')) surname = c.find("surname") if surname: - surname = clean(surname.get_text()) + surname = clean(surname.get_text().replace('\n', ' ')) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.get_text()) + raw_name = clean(raw_name.get_text().replace('\n', ' ')) if not raw_name: if given and surname: diff --git 
a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3e9527d4..62bb1ddb 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -394,6 +394,7 @@ class PubmedImporter(EntityImporter): title = medline.Article.ArticleTitle.get_text() # always present if title: + title = title.replace('\n', ' ') if title.endswith('.'): title = title[:-1] # this hides some "special" titles, but the vast majority are @@ -407,6 +408,7 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: original_title = original_title.get_text() or None + original_title = original_title.replace('\n', ' ') if original_title and original_title.endswith('.'): original_title = original_title[:-1] @@ -558,15 +560,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.get_text() + given_name = author.ForeName.get_text().replace('\n', ' ') if author.LastName: - surname = author.LastName.get_text() + surname = author.LastName.get_text().replace('\n', ' ') if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): - raw_name = author.CollectiveName.get_text() + raw_name = author.CollectiveName.get_text().replace('\n', ' ') contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -588,9 +590,9 @@ class PubmedImporter(EntityImporter): affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].get_text() + raw_affiliation = affiliations[0].get_text().replace('\n', ' ') if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.get_text() for ra in affiliations[1:]] + contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in 
affiliations[1:]] if author.find("EqualContrib"): # TODO: schema for this? contrib_extra['equal'] = True -- cgit v1.2.3 From 938d2c5366d80618b839c83baadc9b5c62d10dce Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Apr 2020 12:02:43 -0700 Subject: pubmed: use untranslated title if translated not available The primary motivation for this change is that fatcat *requires* a non-empty title for each release entity. Pubmed/Medline occasionally indexes just a VernacularTitle with no ArticleTitle for foreign publications, and currently those records don't end up in fatcat at all. --- python/fatcat_tools/importers/pubmed.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'python') diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 62bb1ddb..abcb21d9 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -412,6 +412,12 @@ class PubmedImporter(EntityImporter): if original_title and original_title.endswith('.'): original_title = original_title[:-1] + if original_title and not title: + # if we only have an "original" title, but not translated/english + # title, sub in the original title so the entity can be created + title = original_title + original_title = None + # TODO: happening in alpha order, not handling multi-language well. language = medline.Article.Language if language: -- cgit v1.2.3