aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-03-20 13:00:50 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2020-03-20 13:00:52 -0700
commit a6f74183dd1cf1eaa44f7edeb98dbc5dc737dabb (patch)
tree 6fe603ef02c70ae748cafd0c407978c74bd3ae3c
parent 12c0e53669fb9401b09e088217c5c103d90b9106 (diff)
download fatcat-a6f74183dd1cf1eaa44f7edeb98dbc5dc737dabb.tar.gz
fatcat-a6f74183dd1cf1eaa44f7edeb98dbc5dc737dabb.zip
pubmed: handle multiple ReferenceList
This resolves a situation noticed in prod where we were only importing/updating a single reference per article. Includes a regression test.
-rw-r--r-- python/fatcat_tools/importers/pubmed.py 5
-rw-r--r-- python/tests/files/pubmed_19129924.xml 206
-rw-r--r-- python/tests/import_pubmed.py 12
3 files changed, 222 insertions, 1 deletion
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 70a6368d..3ecf5ef4 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -616,7 +616,10 @@ class PubmedImporter(EntityImporter):
### References
refs = []
if pubmed.ReferenceList:
- for ref in pubmed.ReferenceList.find_all('Reference'):
+ # note that Reference always exists within a ReferenceList, but
+ # that there may be multiple ReferenceList (eg, sometimes one per
+ # Reference)
+ for ref in pubmed.find_all('Reference'):
ref_extra = dict()
ref_doi = ref.find("ArticleId", IdType="doi")
if ref_doi:
diff --git a/python/tests/files/pubmed_19129924.xml b/python/tests/files/pubmed_19129924.xml
new file mode 100644
index 00000000..a8ff0bcd
--- /dev/null
+++ b/python/tests/files/pubmed_19129924.xml
@@ -0,0 +1,206 @@
+<PubmedArticle>
+ <MedlineCitation Owner="NLM" Status="PubMed-not-MEDLINE">
+ <PMID Version="1">19129924</PMID>
+ <DateCompleted>
+ <Year>2011</Year>
+ <Month>07</Month>
+ <Day>14</Day>
+ </DateCompleted>
+ <DateRevised>
+ <Year>2020</Year>
+ <Month>03</Month>
+ <Day>06</Day>
+ </DateRevised>
+ <Article PubModel="Electronic-eCollection">
+ <Journal>
+ <ISSN IssnType="Electronic">1662-5196</ISSN>
+ <JournalIssue CitedMedium="Internet">
+ <Volume>2</Volume>
+ <PubDate>
+ <Year>2008</Year>
+ </PubDate>
+ </JournalIssue>
+ <Title>Frontiers in neuroinformatics</Title>
+ <ISOAbbreviation>Front Neuroinform</ISOAbbreviation>
+ </Journal>
+ <ArticleTitle>PyMOOSE: Interoperable Scripting in Python for MOOSE.</ArticleTitle>
+ <Pagination>
+ <MedlinePgn>6</MedlinePgn>
+ </Pagination>
+ <ELocationID EIdType="doi" ValidYN="Y">10.3389/neuro.11.006.2008</ELocationID>
+ <Abstract>
+ <AbstractText>Python is emerging as a common scripting language for simulators. This opens up many possibilities for interoperability in the form of analysis, interfaces, and communications between simulators. We report the integration of Python scripting with the Multi-scale Object Oriented Simulation Environment (MOOSE). MOOSE is a general-purpose simulation system for compartmental neuronal models and for models of signaling pathways based on chemical kinetics. We show how the Python-scripting version of MOOSE, PyMOOSE, combines the power of a compiled simulator with the versatility and ease of use of Python. We illustrate this by using Python numerical libraries to analyze MOOSE output online, and by developing a GUI in Python/Qt for a MOOSE simulation. Finally, we build and run a composite neuronal/signaling model that uses both the NEURON and MOOSE numerical engines, and Python as a bridge between the two. Thus PyMOOSE has a high degree of interoperability with analysis routines, with graphical toolkits, and with other simulators.</AbstractText>
+ </Abstract>
+ <AuthorList CompleteYN="Y">
+ <Author ValidYN="Y">
+ <LastName>Ray</LastName>
+ <ForeName>Subhasis</ForeName>
+ <Initials>S</Initials>
+ <AffiliationInfo>
+ <Affiliation>National Centre for Biological Sciences Bangalore, India.</Affiliation>
+ </AffiliationInfo>
+ </Author>
+ <Author ValidYN="Y">
+ <LastName>Bhalla</LastName>
+ <ForeName>Upinder S</ForeName>
+ <Initials>US</Initials>
+ </Author>
+ </AuthorList>
+ <Language>eng</Language>
+ <PublicationTypeList>
+ <PublicationType UI="D016428">Journal Article</PublicationType>
+ </PublicationTypeList>
+ <ArticleDate DateType="Electronic">
+ <Year>2008</Year>
+ <Month>12</Month>
+ <Day>19</Day>
+ </ArticleDate>
+ </Article>
+ <MedlineJournalInfo>
+ <Country>Switzerland</Country>
+ <MedlineTA>Front Neuroinform</MedlineTA>
+ <NlmUniqueID>101477957</NlmUniqueID>
+ <ISSNLinking>1662-5196</ISSNLinking>
+ </MedlineJournalInfo>
+ <KeywordList Owner="NOTNLM">
+ <Keyword MajorTopicYN="N">GENESIS</Keyword>
+ <Keyword MajorTopicYN="N">MOOSE</Keyword>
+ <Keyword MajorTopicYN="N">NEURON</Keyword>
+ <Keyword MajorTopicYN="N">Python</Keyword>
+ <Keyword MajorTopicYN="N">compartmental models</Keyword>
+ <Keyword MajorTopicYN="N">multi-scale models</Keyword>
+ <Keyword MajorTopicYN="N">simulators</Keyword>
+ <Keyword MajorTopicYN="N">systems biology</Keyword>
+ </KeywordList>
+ </MedlineCitation>
+ <PubmedData>
+ <History>
+ <PubMedPubDate PubStatus="received">
+ <Year>2008</Year>
+ <Month>09</Month>
+ <Day>15</Day>
+ </PubMedPubDate>
+ <PubMedPubDate PubStatus="accepted">
+ <Year>2008</Year>
+ <Month>11</Month>
+ <Day>01</Day>
+ </PubMedPubDate>
+ <PubMedPubDate PubStatus="entrez">
+ <Year>2009</Year>
+ <Month>1</Month>
+ <Day>9</Day>
+ <Hour>9</Hour>
+ <Minute>0</Minute>
+ </PubMedPubDate>
+ <PubMedPubDate PubStatus="pubmed">
+ <Year>2009</Year>
+ <Month>1</Month>
+ <Day>9</Day>
+ <Hour>9</Hour>
+ <Minute>0</Minute>
+ </PubMedPubDate>
+ <PubMedPubDate PubStatus="medline">
+ <Year>2009</Year>
+ <Month>1</Month>
+ <Day>9</Day>
+ <Hour>9</Hour>
+ <Minute>1</Minute>
+ </PubMedPubDate>
+ </History>
+ <PublicationStatus>epublish</PublicationStatus>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">19129924</ArticleId>
+ <ArticleId IdType="doi">10.3389/neuro.11.006.2008</ArticleId>
+ <ArticleId IdType="pmc">PMC2614320</ArticleId>
+ </ArticleIdList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Eur J Neurosci. 2004 Nov;20(10):2671-80</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">15548210</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Science. 2002 Aug 9;297(5583):1018-23</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">12169734</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Philos Trans R Soc Lond B Biol Sci. 2001 Aug 29;356(1412):1209-28</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">11545699</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Biol Cybern. 1985;53(1):41-56</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">3841014</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Neuroinformatics. 2007 Summer;5(2):96-104</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">17873371</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Science. 1999 Jan 15;283(5400):381-7</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">9888852</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Neuroinformatics. 2007 Summer;5(2):127-38</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">17873374</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Nat Biotechnol. 2005 Dec;23(12):1509-15</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">16333295</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Biol Cybern. 1985;53(1):27-40</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">3841013</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>J Neurophysiol. 1995 Mar;73(3):1157-68</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">7608762</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Bioinformatics. 2003 Mar 1;19(4):524-31</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">12611808</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ </PubmedData>
+ </PubmedArticle>
+
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index 49609f75..f57aa273 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -118,6 +118,7 @@ def test_pubmed_xml_parse(pubmed_importer):
assert r2.refs[0].extra['unstructured'] == "Microbiology. 2009 Jun;155(Pt 6):1840-6"
assert r2.refs[0].extra['pmid'] == "19383690"
+ assert len(r2.refs) > 1
def test_pubmed_xml_dates(pubmed_importer):
with open('tests/files/pubmed_31393839.xml', 'r') as f:
@@ -126,3 +127,14 @@ def test_pubmed_xml_dates(pubmed_importer):
assert r1.release_year == 2019
+def test_pubmed_xml_parse_refs(pubmed_importer):
+ """
+ Tests the case of multiple nested ReferenceList/Reference objects, instead
+ of a single ReferenceList with multiple Reference
+ """
+ with open('tests/files/pubmed_19129924.xml', 'r') as f:
+ soup = BeautifulSoup(f, "xml")
+ r1 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[0])
+
+ assert len(r1.refs) > 1
+