summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/pubmed.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/pubmed.py')
-rw-r--r--python/fatcat_tools/importers/pubmed.py32
1 files changed, 20 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3ecf5ef4..abcb21d9 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -392,8 +392,9 @@ class PubmedImporter(EntityImporter):
if pages:
pages = pages.string
- title = medline.Article.ArticleTitle.string # always present
+ title = medline.Article.ArticleTitle.get_text() # always present
if title:
+ title = title.replace('\n', ' ')
if title.endswith('.'):
title = title[:-1]
# this hides some "special" titles, but the vast majority are
@@ -406,20 +407,27 @@ class PubmedImporter(EntityImporter):
original_title = medline.Article.find("VernacularTitle", recurse=False)
if original_title:
- original_title = original_title.string or None
+ original_title = original_title.get_text() or None
+ original_title = original_title.replace('\n', ' ')
if original_title and original_title.endswith('.'):
original_title = original_title[:-1]
+ if original_title and not title:
+ # if we only have an "original" title, but not translated/english
+ # title, sub in the original title so the entity can be created
+ title = original_title
+ original_title = None
+
# TODO: happening in alpha order, not handling multi-language well.
language = medline.Article.Language
if language:
- language = language.string
+ language = language.get_text()
if language in ("und", "un"):
# "undetermined"
language = None
else:
language = LANG_MAP_MARC.get(language)
- if not language and not (medline.Article.Language.string in LANG_MAP_MARC):
+ if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC):
warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
### Journal/Issue Metadata
@@ -479,7 +487,7 @@ class PubmedImporter(EntityImporter):
print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)
if journal.find("Title"):
- container_name = journal.Title.string
+ container_name = journal.Title.get_text()
if (container_id is None and self.create_containers and (issnl is not None)
and container_name):
@@ -558,15 +566,15 @@ class PubmedImporter(EntityImporter):
surname = None
raw_name = None
if author.ForeName:
- given_name = author.ForeName.string
+ given_name = author.ForeName.get_text().replace('\n', ' ')
if author.LastName:
- surname = author.LastName.string
+ surname = author.LastName.get_text().replace('\n', ' ')
if given_name and surname:
raw_name = "{} {}".format(given_name, surname)
elif surname:
raw_name = surname
- if not raw_name and author.CollectiveName and author.CollectiveName.string:
- raw_name = author.CollectiveName.string
+ if not raw_name and author.CollectiveName and author.CollectiveName.get_text():
+ raw_name = author.CollectiveName.get_text().replace('\n', ' ')
contrib_extra = dict()
orcid = author.find("Identifier", Source="ORCID")
if orcid:
@@ -588,9 +596,9 @@ class PubmedImporter(EntityImporter):
affiliations = author.find_all("Affiliation")
raw_affiliation = None
if affiliations:
- raw_affiliation = affiliations[0].string
+ raw_affiliation = affiliations[0].get_text().replace('\n', ' ')
if len(affiliations) > 1:
- contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]]
+ contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]]
if author.find("EqualContrib"):
# TODO: schema for this?
contrib_extra['equal'] = True
@@ -638,7 +646,7 @@ class PubmedImporter(EntityImporter):
ref_release_id = self.lookup_pmid(ref_pmid)
ref_raw = ref.Citation
if ref_raw:
- ref_extra['unstructured'] = ref_raw.string
+ ref_extra['unstructured'] = ref_raw.get_text()
if not ref_extra:
ref_extra = None
refs.append(fatcat_openapi_client.ReleaseRef(