From d6af7b7544ddb3b5e7b1f4a0fd76bd9cd5ed9125 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 28 Mar 2020 20:01:46 -0700 Subject: pubmed: bunch of .get_text() instead of .string Yikes! Apparently when a tag has child tags, .string will return None instead of all the strings. .get_text() returns all of it: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text https://www.crummy.com/software/BeautifulSoup/bs4/doc/#string I've left things like identifiers as .string, when we expect only a single string inside. --- python/fatcat_tools/importers/pubmed.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3ecf5ef4..3e9527d4 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -392,7 +392,7 @@ class PubmedImporter(EntityImporter): if pages: pages = pages.string - title = medline.Article.ArticleTitle.string # always present + title = medline.Article.ArticleTitle.get_text() # always present if title: if title.endswith('.'): title = title[:-1] @@ -406,20 +406,20 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: - original_title = original_title.string or None + original_title = original_title.get_text() or None if original_title and original_title.endswith('.'): original_title = original_title[:-1] # TODO: happening in alpha order, not handling multi-language well. 
language = medline.Article.Language if language: - language = language.string + language = language.get_text() if language in ("und", "un"): # "undetermined" language = None else: language = LANG_MAP_MARC.get(language) - if not language and not (medline.Article.Language.string in LANG_MAP_MARC): + if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC): warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) ### Journal/Issue Metadata @@ -479,7 +479,7 @@ class PubmedImporter(EntityImporter): print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) if journal.find("Title"): - container_name = journal.Title.string + container_name = journal.Title.get_text() if (container_id is None and self.create_containers and (issnl is not None) and container_name): @@ -558,15 +558,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.string + given_name = author.ForeName.get_text() if author.LastName: - surname = author.LastName.string + surname = author.LastName.get_text() if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname - if not raw_name and author.CollectiveName and author.CollectiveName.string: - raw_name = author.CollectiveName.string + if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): + raw_name = author.CollectiveName.get_text() contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -588,9 +588,9 @@ class PubmedImporter(EntityImporter): affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].string + raw_affiliation = affiliations[0].get_text() if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]] + contrib_extra['more_affiliations'] = [ra.get_text() for ra in affiliations[1:]] if 
author.find("EqualContrib"): # TODO: schema for this? contrib_extra['equal'] = True @@ -638,7 +638,7 @@ class PubmedImporter(EntityImporter): ref_release_id = self.lookup_pmid(ref_pmid) ref_raw = ref.Citation if ref_raw: - ref_extra['unstructured'] = ref_raw.string + ref_extra['unstructured'] = ref_raw.get_text() if not ref_extra: ref_extra = None refs.append(fatcat_openapi_client.ReleaseRef( -- cgit v1.2.3 From 6681500eeffe39b7d029a0e0d6b2ed83729f555f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 28 Mar 2020 20:12:54 -0700 Subject: importers: more string/get_text swaps See previous pubmed commit for details. --- python/fatcat_tools/importers/arxiv.py | 22 +++++++++++----------- python/fatcat_tools/importers/jalc.py | 14 +++++++------- python/fatcat_tools/importers/jstor.py | 18 +++++++++--------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index c69ee16a..79b242c4 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -118,13 +118,13 @@ class ArxivRawImporter(EntityImporter): if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]): sys.stderr.write("BOGUS DOI: {}\n".format(doi)) doi = None - title = latex_to_text(metadata.title.string) - authors = parse_arxiv_authors(metadata.authors.string) + title = latex_to_text(metadata.title.get_text()) + authors = parse_arxiv_authors(metadata.authors.get_text()) contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)] lang = "en" # the vast majority in english - if metadata.comments and metadata.comments.string: - comments = metadata.comments.string.strip() + if metadata.comments and metadata.comments.get_text(): + comments = metadata.comments.get_text().strip() extra_arxiv['comments'] = comments if 'in french' in comments.lower(): lang = 'fr' @@ -145,8 +145,8 @@ class ArxivRawImporter(EntityImporter): 
# more languages? number = None - if metadata.find('journal-ref') and metadata.find('journal-ref').string: - journal_ref = metadata.find('journal-ref').string.strip() + if metadata.find('journal-ref') and metadata.find('journal-ref').get_text(): + journal_ref = metadata.find('journal-ref').get_text().strip() extra_arxiv['journal_ref'] = journal_ref if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(): release_type = "paper-conference" @@ -160,16 +160,16 @@ class ArxivRawImporter(EntityImporter): release_type = "report" if metadata.find('acm-class') and metadata.find('acm-class').string: extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip() - if metadata.categories and metadata.categories.string: - extra_arxiv['categories'] = metadata.categories.string.split() + if metadata.categories and metadata.categories.get_text(): + extra_arxiv['categories'] = metadata.categories.get_text().split() license_slug = None - if metadata.license and metadata.license.string: - license_slug = lookup_license_slug(metadata.license.string) + if metadata.license and metadata.license.get_text(): + license_slug = lookup_license_slug(metadata.license.get_text()) abstracts = None if metadata.abstract: # TODO: test for this multi-abstract code path abstracts = [] - abst = metadata.abstract.string.strip() + abst = metadata.abstract.get_text().strip() orig = None if '-----' in abst: both = abst.split('-----') diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 351a20a3..51760f8a 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -35,13 +35,13 @@ def parse_jalc_persons(raw_persons): for raw in raw_persons: name = raw.find('name') or None if name: - name = clean(name.string) + name = clean(name.get_text()) surname = raw.find('familyName') or None if surname: - surname = clean(surname.string) + surname = clean(surname.get_text()) given_name = raw.find('givenName') or None if 
given_name: - given_name = clean(given_name.string) + given_name = clean(given_name.get_text()) lang = 'en' if is_cjk(name): lang = 'ja' @@ -163,12 +163,12 @@ class JalcImporter(EntityImporter): titles = record.find_all("title") if not titles: return None - title = titles[0].string.strip() + title = titles[0].get_text().strip() original_title = None if title.endswith('.'): title = title[:-1] if len(titles) > 1: - original_title = titles[1].string.strip() + original_title = titles[1].get_text().strip() if original_title.endswith('.'): original_title = original_title[:-1] @@ -242,7 +242,7 @@ class JalcImporter(EntityImporter): container_extra = dict() if record.publicationName: - pubs = [p.string.strip() for p in record.find_all("publicationName") if p.string] + pubs = [p.get_text().strip() for p in record.find_all("publicationName") if p.get_text()] pubs = [clean(p) for p in pubs if p] assert(pubs) if len(pubs) > 1 and pubs[0] == pubs[1]: @@ -255,7 +255,7 @@ class JalcImporter(EntityImporter): container_extra['original_name'] = clean(pubs[1]) if record.publisher: - pubs = [p.string.strip() for p in record.find_all("publisher") if p.string] + pubs = [p.get_text().strip() for p in record.find_all("publisher") if p.get_text()] pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 5ff1ecd9..184a0bb1 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -63,13 +63,13 @@ class JstorImporter(EntityImporter): release_type = JSTOR_TYPE_MAP.get(article['article-type']) title = article_meta.find("article-title") - if title and title.string: - title = title.string.strip() - elif title and not title.string: + if title and title.get_text(): + title = title.get_text().strip() + elif title and not title.get_text(): title = None if not title and release_type.startswith('review') and 
article_meta.product.source: - title = "Review: {}".format(article_meta.product.source.string) + title = "Review: {}".format(article_meta.product.source.get_text()) if not title: return None @@ -96,8 +96,8 @@ class JstorImporter(EntityImporter): if journal_ids: extra_jstor['journal_ids'] = journal_ids - journal_title = journal_meta.find("journal-title").string - publisher = journal_meta.find("publisher-name").string + journal_title = journal_meta.find("journal-title").get_text() + publisher = journal_meta.find("publisher-name").get_text() issn = journal_meta.find("issn") if issn: issn = issn.string @@ -141,13 +141,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.string) + given = clean(given.get_text()) surname = c.find("surname") if surname: - surname = clean(surname.string) + surname = clean(surname.get_text()) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.string) + raw_name = clean(raw_name.get_text()) if not raw_name: if given and surname: -- cgit v1.2.3 From f77a553350238c8ccc9c3bc0edcf47fb9dd067b3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Apr 2020 12:02:20 -0700 Subject: importers: replace newlines in get_text() strings --- python/fatcat_tools/importers/arxiv.py | 8 ++++---- python/fatcat_tools/importers/jalc.py | 14 +++++++------- python/fatcat_tools/importers/jstor.py | 14 +++++++------- python/fatcat_tools/importers/pubmed.py | 12 +++++++----- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 79b242c4..719592fc 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -118,13 +118,13 @@ class ArxivRawImporter(EntityImporter): if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]): sys.stderr.write("BOGUS DOI: {}\n".format(doi)) doi = None - title = 
latex_to_text(metadata.title.get_text()) - authors = parse_arxiv_authors(metadata.authors.get_text()) + title = latex_to_text(metadata.title.get_text().replace('\n', ' ')) + authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' ')) contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)] lang = "en" # the vast majority in english if metadata.comments and metadata.comments.get_text(): - comments = metadata.comments.get_text().strip() + comments = metadata.comments.get_text().replace('\n', ' ').strip() extra_arxiv['comments'] = comments if 'in french' in comments.lower(): lang = 'fr' @@ -146,7 +146,7 @@ class ArxivRawImporter(EntityImporter): number = None if metadata.find('journal-ref') and metadata.find('journal-ref').get_text(): - journal_ref = metadata.find('journal-ref').get_text().strip() + journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip() extra_arxiv['journal_ref'] = journal_ref if "conf." in journal_ref.lower() or "proc." 
in journal_ref.lower(): release_type = "paper-conference" diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 51760f8a..c2adc0d6 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -35,13 +35,13 @@ def parse_jalc_persons(raw_persons): for raw in raw_persons: name = raw.find('name') or None if name: - name = clean(name.get_text()) + name = clean(name.get_text().replace('\n', ' ')) surname = raw.find('familyName') or None if surname: - surname = clean(surname.get_text()) + surname = clean(surname.get_text().replace('\n', ' ')) given_name = raw.find('givenName') or None if given_name: - given_name = clean(given_name.get_text()) + given_name = clean(given_name.get_text().replace('\n', ' ')) lang = 'en' if is_cjk(name): lang = 'ja' @@ -163,12 +163,12 @@ class JalcImporter(EntityImporter): titles = record.find_all("title") if not titles: return None - title = titles[0].get_text().strip() + title = titles[0].get_text().replace('\n', ' ').strip() original_title = None if title.endswith('.'): title = title[:-1] if len(titles) > 1: - original_title = titles[1].get_text().strip() + original_title = titles[1].get_text().replace('\n', ' ').strip() if original_title.endswith('.'): original_title = original_title[:-1] @@ -242,7 +242,7 @@ class JalcImporter(EntityImporter): container_extra = dict() if record.publicationName: - pubs = [p.get_text().strip() for p in record.find_all("publicationName") if p.get_text()] + pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()] pubs = [clean(p) for p in pubs if p] assert(pubs) if len(pubs) > 1 and pubs[0] == pubs[1]: @@ -255,7 +255,7 @@ class JalcImporter(EntityImporter): container_extra['original_name'] = clean(pubs[1]) if record.publisher: - pubs = [p.get_text().strip() for p in record.find_all("publisher") if p.get_text()] + pubs = [p.get_text().replace('\n', ' ').strip() for p in 
record.find_all("publisher") if p.get_text()] pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 184a0bb1..96dbf947 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -64,12 +64,12 @@ class JstorImporter(EntityImporter): release_type = JSTOR_TYPE_MAP.get(article['article-type']) title = article_meta.find("article-title") if title and title.get_text(): - title = title.get_text().strip() + title = title.get_text().replace('\n', ' ').strip() elif title and not title.get_text(): title = None if not title and release_type.startswith('review') and article_meta.product.source: - title = "Review: {}".format(article_meta.product.source.get_text()) + title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text()) if not title: return None @@ -96,8 +96,8 @@ class JstorImporter(EntityImporter): if journal_ids: extra_jstor['journal_ids'] = journal_ids - journal_title = journal_meta.find("journal-title").get_text() - publisher = journal_meta.find("publisher-name").get_text() + journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ') + publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ') issn = journal_meta.find("issn") if issn: issn = issn.string @@ -141,13 +141,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.get_text()) + given = clean(given.get_text().replace('\n', ' ')) surname = c.find("surname") if surname: - surname = clean(surname.get_text()) + surname = clean(surname.get_text().replace('\n', ' ')) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.get_text()) + raw_name = clean(raw_name.get_text().replace('\n', ' ')) if not raw_name: if given and surname: diff --git 
a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3e9527d4..62bb1ddb 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -394,6 +394,7 @@ class PubmedImporter(EntityImporter): title = medline.Article.ArticleTitle.get_text() # always present if title: + title = title.replace('\n', ' ') if title.endswith('.'): title = title[:-1] # this hides some "special" titles, but the vast majority are @@ -407,6 +408,7 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: original_title = original_title.get_text() or None + original_title = original_title.replace('\n', ' ') if original_title and original_title.endswith('.'): original_title = original_title[:-1] @@ -558,15 +560,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.get_text() + given_name = author.ForeName.get_text().replace('\n', ' ') if author.LastName: - surname = author.LastName.get_text() + surname = author.LastName.get_text().replace('\n', ' ') if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): - raw_name = author.CollectiveName.get_text() + raw_name = author.CollectiveName.get_text().replace('\n', ' ') contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -588,9 +590,9 @@ class PubmedImporter(EntityImporter): affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].get_text() + raw_affiliation = affiliations[0].get_text().replace('\n', ' ') if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.get_text() for ra in affiliations[1:]] + contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in 
affiliations[1:]] if author.find("EqualContrib"): # TODO: schema for this? contrib_extra['equal'] = True -- cgit v1.2.3 From 938d2c5366d80618b839c83baadc9b5c62d10dce Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Apr 2020 12:02:43 -0700 Subject: pubmed: use untranslated title if translated not available The primary motivation for this change is that fatcat *requires* a non-empty title for each release entity. Pubmed/Medline occasionally indexes just a VernacularTitle with no ArticleTitle for foreign publications, and currently those records don't end up in fatcat at all. --- python/fatcat_tools/importers/pubmed.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 62bb1ddb..abcb21d9 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -412,6 +412,12 @@ class PubmedImporter(EntityImporter): if original_title and original_title.endswith('.'): original_title = original_title[:-1] + if original_title and not title: + # if we only have an "original" title, but not translated/english + # title, sub in the original title so the entity can be created + title = original_title + original_title = None + # TODO: happening in alpha order, not handling multi-language well. language = medline.Article.Language if language: -- cgit v1.2.3