summary | refs | log | tree | commit | diff | stats
path: root/python/tests/import_pubmed.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-03-20 13:00:50 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-03-20 13:00:52 -0700
commit: a6f74183dd1cf1eaa44f7edeb98dbc5dc737dabb (patch)
tree: 6fe603ef02c70ae748cafd0c407978c74bd3ae3c /python/tests/import_pubmed.py
parent: 12c0e53669fb9401b09e088217c5c103d90b9106 (diff)
download: fatcat-a6f74183dd1cf1eaa44f7edeb98dbc5dc737dabb.tar.gz
download: fatcat-a6f74183dd1cf1eaa44f7edeb98dbc5dc737dabb.zip
pubmed: handle multiple ReferenceList
This resolves a situation noticed in prod where we were only importing/updating a single reference per article. Includes a regression test.
Diffstat (limited to 'python/tests/import_pubmed.py')
-rw-r--r-- python/tests/import_pubmed.py 12
1 files changed, 12 insertions, 0 deletions
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index 49609f75..f57aa273 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -118,6 +118,7 @@ def test_pubmed_xml_parse(pubmed_importer):
assert r2.refs[0].extra['unstructured'] == "Microbiology. 2009 Jun;155(Pt 6):1840-6"
assert r2.refs[0].extra['pmid'] == "19383690"
+ assert len(r2.refs) > 1
def test_pubmed_xml_dates(pubmed_importer):
with open('tests/files/pubmed_31393839.xml', 'r') as f:
@@ -126,3 +127,14 @@ def test_pubmed_xml_dates(pubmed_importer):
assert r1.release_year == 2019
+def test_pubmed_xml_parse_refs(pubmed_importer):
+ """
+ Tests the case of multiple nested ReferenceList/Reference objects, instead
+ of a single ReferenceList with multiple Reference
+ """
+ with open('tests/files/pubmed_19129924.xml', 'r') as f:
+ soup = BeautifulSoup(f, "xml")
+ r1 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[0])
+
+ assert len(r1.refs) > 1
+