diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-03-20 13:00:50 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-03-20 13:00:52 -0700 |
commit | a6f74183dd1cf1eaa44f7edeb98dbc5dc737dabb (patch) | |
tree | 6fe603ef02c70ae748cafd0c407978c74bd3ae3c /python/tests/import_pubmed.py | |
parent | 12c0e53669fb9401b09e088217c5c103d90b9106 (diff) | |
download | fatcat-a6f74183dd1cf1eaa44f7edeb98dbc5dc737dabb.tar.gz fatcat-a6f74183dd1cf1eaa44f7edeb98dbc5dc737dabb.zip |
pubmed: handle multiple ReferenceList
This resolves a situation noticed in prod where we were only
importing/updating a single reference per article.
Includes a regression test.
Diffstat (limited to 'python/tests/import_pubmed.py')
-rw-r--r-- | python/tests/import_pubmed.py | 12 |
1 files changed, 12 insertions, 0 deletions
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index 49609f75..f57aa273 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -118,6 +118,7 @@ def test_pubmed_xml_parse(pubmed_importer):
 assert r2.refs[0].extra['unstructured'] == "Microbiology. 2009 Jun;155(Pt 6):1840-6"
 assert r2.refs[0].extra['pmid'] == "19383690"
+ assert len(r2.refs) > 1
 def test_pubmed_xml_dates(pubmed_importer):
 with open('tests/files/pubmed_31393839.xml', 'r') as f:
@@ -126,3 +127,14 @@ def test_pubmed_xml_dates(pubmed_importer):
 assert r1.release_year == 2019
+def test_pubmed_xml_parse_refs(pubmed_importer):
+ """
+ Tests the case of multiple nested ReferenceList/Reference objects, instead
+ of a single ReferenceList with multiple Reference
+ """
+ with open('tests/files/pubmed_19129924.xml', 'r') as f:
+ soup = BeautifulSoup(f, "xml")
+ r1 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[0])
+
+ assert len(r1.refs) > 1
+ |