author    bnewbold <bnewbold@archive.org>  2020-04-01 22:03:19 +0000
committer bnewbold <bnewbold@archive.org>  2020-04-01 22:03:19 +0000
commit    32f195cec41459045f3d3453dad7a97b38d4e288 (patch)
tree      ab166daf8686472ed9641d96ab055f37ee89d71c /python/fatcat_tools/importers
parent    0e2025091d0c974a888a5bc741495951c952ccda (diff)
parent    938d2c5366d80618b839c83baadc9b5c62d10dce (diff)
Merge branch 'bnewbold-pubmed-get_text' into 'master'
beautifulsoup XML parsing: .string vs. .get_text()

See merge request webgroup/fatcat!40
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/arxiv.py  | 22
-rw-r--r--  python/fatcat_tools/importers/jalc.py   | 14
-rw-r--r--  python/fatcat_tools/importers/jstor.py  | 18
-rw-r--r--  python/fatcat_tools/importers/pubmed.py | 32
4 files changed, 47 insertions(+), 39 deletions(-)
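
For context, a minimal sketch (not part of the commit) of the BeautifulSoup behaviour these changes work around: .string returns None whenever a tag contains anything other than a single text node, while .get_text() concatenates all descendant text.

    # Standalone illustration with a made-up tag; the importers parse full
    # records, but the .string vs .get_text() behaviour is the same.
    from bs4 import BeautifulSoup

    xml = "<ArticleTitle>Effects of <i>E. coli</i> on mice.</ArticleTitle>"
    tag = BeautifulSoup(xml, "xml").ArticleTitle

    print(tag.string)      # None -- mixed content, so .string gives up
    print(tag.get_text())  # Effects of E. coli on mice.

Unlike .string, .get_text() never returns None, only a possibly-empty string, so the truthiness guards around these calls still behave as before.
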
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index c69ee16a..719592fc 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -118,13 +118,13 @@ class ArxivRawImporter(EntityImporter):
if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]):
sys.stderr.write("BOGUS DOI: {}\n".format(doi))
doi = None
- title = latex_to_text(metadata.title.string)
- authors = parse_arxiv_authors(metadata.authors.string)
+ title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
+ authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' '))
contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)]
lang = "en" # the vast majority in english
- if metadata.comments and metadata.comments.string:
- comments = metadata.comments.string.strip()
+ if metadata.comments and metadata.comments.get_text():
+ comments = metadata.comments.get_text().replace('\n', ' ').strip()
extra_arxiv['comments'] = comments
if 'in french' in comments.lower():
lang = 'fr'
@@ -145,8 +145,8 @@ class ArxivRawImporter(EntityImporter):
# more languages?
number = None
- if metadata.find('journal-ref') and metadata.find('journal-ref').string:
- journal_ref = metadata.find('journal-ref').string.strip()
+ if metadata.find('journal-ref') and metadata.find('journal-ref').get_text():
+ journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip()
extra_arxiv['journal_ref'] = journal_ref
if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
release_type = "paper-conference"
@@ -160,16 +160,16 @@ class ArxivRawImporter(EntityImporter):
release_type = "report"
if metadata.find('acm-class') and metadata.find('acm-class').string:
extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
- if metadata.categories and metadata.categories.string:
- extra_arxiv['categories'] = metadata.categories.string.split()
+ if metadata.categories and metadata.categories.get_text():
+ extra_arxiv['categories'] = metadata.categories.get_text().split()
license_slug = None
- if metadata.license and metadata.license.string:
- license_slug = lookup_license_slug(metadata.license.string)
+ if metadata.license and metadata.license.get_text():
+ license_slug = lookup_license_slug(metadata.license.get_text())
abstracts = None
if metadata.abstract:
# TODO: test for this multi-abstract code path
abstracts = []
- abst = metadata.abstract.string.strip()
+ abst = metadata.abstract.get_text().strip()
orig = None
if '-----' in abst:
both = abst.split('-----')
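
Most of the .get_text() calls above are chained with .replace('\n', ' ') because fields in the raw arXiv XML can contain hard line breaks; a small sketch of that normalization, using a made-up record rather than real arXivRaw output:

    # Hypothetical wrapped <title>, standing in for a real arXivRaw field.
    from bs4 import BeautifulSoup

    xml = "<title>A Very Long Title That The\nFeed Wraps Onto A Second Line</title>"
    tag = BeautifulSoup(xml, "xml").title

    print(repr(tag.get_text()))
    # 'A Very Long Title That The\nFeed Wraps Onto A Second Line'
    print(tag.get_text().replace('\n', ' '))
    # A Very Long Title That The Feed Wraps Onto A Second Line

Without the replace(), the embedded newline would survive into the release title and into the string handed to parse_arxiv_authors().
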
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 351a20a3..c2adc0d6 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -35,13 +35,13 @@ def parse_jalc_persons(raw_persons):
for raw in raw_persons:
name = raw.find('name') or None
if name:
- name = clean(name.string)
+ name = clean(name.get_text().replace('\n', ' '))
surname = raw.find('familyName') or None
if surname:
- surname = clean(surname.string)
+ surname = clean(surname.get_text().replace('\n', ' '))
given_name = raw.find('givenName') or None
if given_name:
- given_name = clean(given_name.string)
+ given_name = clean(given_name.get_text().replace('\n', ' '))
lang = 'en'
if is_cjk(name):
lang = 'ja'
@@ -163,12 +163,12 @@ class JalcImporter(EntityImporter):
titles = record.find_all("title")
if not titles:
return None
- title = titles[0].string.strip()
+ title = titles[0].get_text().replace('\n', ' ').strip()
original_title = None
if title.endswith('.'):
title = title[:-1]
if len(titles) > 1:
- original_title = titles[1].string.strip()
+ original_title = titles[1].get_text().replace('\n', ' ').strip()
if original_title.endswith('.'):
original_title = original_title[:-1]
@@ -242,7 +242,7 @@ class JalcImporter(EntityImporter):
container_extra = dict()
if record.publicationName:
- pubs = [p.string.strip() for p in record.find_all("publicationName") if p.string]
+ pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()]
pubs = [clean(p) for p in pubs if p]
assert(pubs)
if len(pubs) > 1 and pubs[0] == pubs[1]:
@@ -255,7 +255,7 @@ class JalcImporter(EntityImporter):
container_extra['original_name'] = clean(pubs[1])
if record.publisher:
- pubs = [p.string.strip() for p in record.find_all("publisher") if p.string]
+ pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()]
pubs = [p for p in pubs if p]
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 5ff1ecd9..96dbf947 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -63,13 +63,13 @@ class JstorImporter(EntityImporter):
release_type = JSTOR_TYPE_MAP.get(article['article-type'])
title = article_meta.find("article-title")
- if title and title.string:
- title = title.string.strip()
- elif title and not title.string:
+ if title and title.get_text():
+ title = title.get_text().replace('\n', ' ').strip()
+ elif title and not title.get_text():
title = None
if not title and release_type.startswith('review') and article_meta.product.source:
- title = "Review: {}".format(article_meta.product.source.string)
+ title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text())
if not title:
return None
@@ -96,8 +96,8 @@ class JstorImporter(EntityImporter):
if journal_ids:
extra_jstor['journal_ids'] = journal_ids
- journal_title = journal_meta.find("journal-title").string
- publisher = journal_meta.find("publisher-name").string
+ journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ')
+ publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ')
issn = journal_meta.find("issn")
if issn:
issn = issn.string
@@ -141,13 +141,13 @@ class JstorImporter(EntityImporter):
for c in cgroup.find_all("contrib"):
given = c.find("given-names")
if given:
- given = clean(given.string)
+ given = clean(given.get_text().replace('\n', ' '))
surname = c.find("surname")
if surname:
- surname = clean(surname.string)
+ surname = clean(surname.get_text().replace('\n', ' '))
raw_name = c.find("string-name")
if raw_name:
- raw_name = clean(raw_name.string)
+ raw_name = clean(raw_name.get_text().replace('\n', ' '))
if not raw_name:
if given and surname:
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3ecf5ef4..abcb21d9 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -392,8 +392,9 @@ class PubmedImporter(EntityImporter):
if pages:
pages = pages.string
- title = medline.Article.ArticleTitle.string # always present
+ title = medline.Article.ArticleTitle.get_text() # always present
if title:
+ title = title.replace('\n', ' ')
if title.endswith('.'):
title = title[:-1]
# this hides some "special" titles, but the vast majority are
@@ -406,20 +407,27 @@ class PubmedImporter(EntityImporter):
original_title = medline.Article.find("VernacularTitle", recurse=False)
if original_title:
- original_title = original_title.string or None
+ original_title = original_title.get_text() or None
+ original_title = original_title.replace('\n', ' ') if original_title else None
if original_title and original_title.endswith('.'):
original_title = original_title[:-1]
+ if original_title and not title:
+ # if we only have an "original" title, but not translated/english
+ # title, sub in the original title so the entity can be created
+ title = original_title
+ original_title = None
+
# TODO: happening in alpha order, not handling multi-language well.
language = medline.Article.Language
if language:
- language = language.string
+ language = language.get_text()
if language in ("und", "un"):
# "undetermined"
language = None
else:
language = LANG_MAP_MARC.get(language)
- if not language and not (medline.Article.Language.string in LANG_MAP_MARC):
+ if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC):
warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
### Journal/Issue Metadata
@@ -479,7 +487,7 @@ class PubmedImporter(EntityImporter):
print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)
if journal.find("Title"):
- container_name = journal.Title.string
+ container_name = journal.Title.get_text()
if (container_id is None and self.create_containers and (issnl is not None)
and container_name):
@@ -558,15 +566,15 @@ class PubmedImporter(EntityImporter):
surname = None
raw_name = None
if author.ForeName:
- given_name = author.ForeName.string
+ given_name = author.ForeName.get_text().replace('\n', ' ')
if author.LastName:
- surname = author.LastName.string
+ surname = author.LastName.get_text().replace('\n', ' ')
if given_name and surname:
raw_name = "{} {}".format(given_name, surname)
elif surname:
raw_name = surname
- if not raw_name and author.CollectiveName and author.CollectiveName.string:
- raw_name = author.CollectiveName.string
+ if not raw_name and author.CollectiveName and author.CollectiveName.get_text():
+ raw_name = author.CollectiveName.get_text().replace('\n', ' ')
contrib_extra = dict()
orcid = author.find("Identifier", Source="ORCID")
if orcid:
@@ -588,9 +596,9 @@ class PubmedImporter(EntityImporter):
affiliations = author.find_all("Affiliation")
raw_affiliation = None
if affiliations:
- raw_affiliation = affiliations[0].string
+ raw_affiliation = affiliations[0].get_text().replace('\n', ' ')
if len(affiliations) > 1:
- contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]]
+ contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]]
if author.find("EqualContrib"):
# TODO: schema for this?
contrib_extra['equal'] = True
@@ -638,7 +646,7 @@ class PubmedImporter(EntityImporter):
ref_release_id = self.lookup_pmid(ref_pmid)
ref_raw = ref.Citation
if ref_raw:
- ref_extra['unstructured'] = ref_raw.string
+ ref_extra['unstructured'] = ref_raw.get_text()
if not ref_extra:
ref_extra = None
refs.append(fatcat_openapi_client.ReleaseRef(