8 files changed, 353 insertions, 13 deletions
diff --git a/python/tests/files/pubmed_19129924.xml b/python/tests/files/pubmed_19129924.xml
new file mode 100644
index 00000000..a8ff0bcd
--- /dev/null
+++ b/python/tests/files/pubmed_19129924.xml
@@ -0,0 +1,206 @@
+<PubmedArticle>
+    <MedlineCitation Owner="NLM" Status="PubMed-not-MEDLINE">
+      <PMID Version="1">19129924</PMID>
+      <DateCompleted>
+        <Year>2011</Year>
+        <Month>07</Month>
+        <Day>14</Day>
+      </DateCompleted>
+      <DateRevised>
+        <Year>2020</Year>
+        <Month>03</Month>
+        <Day>06</Day>
+      </DateRevised>
+      <Article PubModel="Electronic-eCollection">
+        <Journal>
+          <ISSN IssnType="Electronic">1662-5196</ISSN>
+          <JournalIssue CitedMedium="Internet">
+            <Volume>2</Volume>
+            <PubDate>
+              <Year>2008</Year>
+            </PubDate>
+          </JournalIssue>
+          <Title>Frontiers in neuroinformatics</Title>
+          <ISOAbbreviation>Front Neuroinform</ISOAbbreviation>
+        </Journal>
+        <ArticleTitle>PyMOOSE: Interoperable Scripting in Python for MOOSE.</ArticleTitle>
+        <Pagination>
+          <MedlinePgn>6</MedlinePgn>
+        </Pagination>
+        <ELocationID EIdType="doi" ValidYN="Y">10.3389/neuro.11.006.2008</ELocationID>
+        <Abstract>
+          <AbstractText>Python is emerging as a common scripting language for simulators. This opens up many possibilities for interoperability in the form of analysis, interfaces, and communications between simulators. We report the integration of Python scripting with the Multi-scale Object Oriented Simulation Environment (MOOSE). MOOSE is a general-purpose simulation system for compartmental neuronal models and for models of signaling pathways based on chemical kinetics. We show how the Python-scripting version of MOOSE, PyMOOSE, combines the power of a compiled simulator with the versatility and ease of use of Python. We illustrate this by using Python numerical libraries to analyze MOOSE output online, and by developing a GUI in Python/Qt for a MOOSE simulation. Finally, we build and run a composite neuronal/signaling model that uses both the NEURON and MOOSE numerical engines, and Python as a bridge between the two. Thus PyMOOSE has a high degree of interoperability with analysis routines, with graphical toolkits, and with other simulators.</AbstractText>
+        </Abstract>
+        <AuthorList CompleteYN="Y">
+          <Author ValidYN="Y">
+            <LastName>Ray</LastName>
+            <ForeName>Subhasis</ForeName>
+            <Initials>S</Initials>
+            <AffiliationInfo>
+              <Affiliation>National Centre for Biological Sciences Bangalore, India.</Affiliation>
+            </AffiliationInfo>
+          </Author>
+          <Author ValidYN="Y">
+            <LastName>Bhalla</LastName>
+            <ForeName>Upinder S</ForeName>
+            <Initials>US</Initials>
+          </Author>
+        </AuthorList>
+        <Language>eng</Language>
+        <PublicationTypeList>
+          <PublicationType UI="D016428">Journal Article</PublicationType>
+        </PublicationTypeList>
+        <ArticleDate DateType="Electronic">
+          <Year>2008</Year>
+          <Month>12</Month>
+          <Day>19</Day>
+        </ArticleDate>
+      </Article>
+      <MedlineJournalInfo>
+        <Country>Switzerland</Country>
+        <MedlineTA>Front Neuroinform</MedlineTA>
+        <NlmUniqueID>101477957</NlmUniqueID>
+        <ISSNLinking>1662-5196</ISSNLinking>
+      </MedlineJournalInfo>
+      <KeywordList Owner="NOTNLM">
+        <Keyword MajorTopicYN="N">GENESIS</Keyword>
+        <Keyword MajorTopicYN="N">MOOSE</Keyword>
+        <Keyword MajorTopicYN="N">NEURON</Keyword>
+        <Keyword MajorTopicYN="N">Python</Keyword>
+        <Keyword MajorTopicYN="N">compartmental models</Keyword>
+        <Keyword MajorTopicYN="N">multi-scale models</Keyword>
+        <Keyword MajorTopicYN="N">simulators</Keyword>
+        <Keyword MajorTopicYN="N">systems biology</Keyword>
+      </KeywordList>
+    </MedlineCitation>
+    <PubmedData>
+      <History>
+        <PubMedPubDate PubStatus="received">
+          <Year>2008</Year>
+          <Month>09</Month>
+          <Day>15</Day>
+        </PubMedPubDate>
+        <PubMedPubDate PubStatus="accepted">
+          <Year>2008</Year>
+          <Month>11</Month>
+          <Day>01</Day>
+        </PubMedPubDate>
+        <PubMedPubDate PubStatus="entrez">
+          <Year>2009</Year>
+          <Month>1</Month>
+          <Day>9</Day>
+          <Hour>9</Hour>
+          <Minute>0</Minute>
+        </PubMedPubDate>
+        <PubMedPubDate PubStatus="pubmed">
+          <Year>2009</Year>
+          <Month>1</Month>
+          <Day>9</Day>
+          <Hour>9</Hour>
+          <Minute>0</Minute>
+        </PubMedPubDate>
+        <PubMedPubDate PubStatus="medline">
+          <Year>2009</Year>
+          <Month>1</Month>
+          <Day>9</Day>
+          <Hour>9</Hour>
+          <Minute>1</Minute>
+        </PubMedPubDate>
+      </History>
+      <PublicationStatus>epublish</PublicationStatus>
+      <ArticleIdList>
+        <ArticleId IdType="pubmed">19129924</ArticleId>
+        <ArticleId IdType="doi">10.3389/neuro.11.006.2008</ArticleId>
+        <ArticleId IdType="pmc">PMC2614320</ArticleId>
+      </ArticleIdList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Eur J Neurosci. 2004 Nov;20(10):2671-80</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">15548210</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Science. 2002 Aug 9;297(5583):1018-23</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">12169734</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Philos Trans R Soc Lond B Biol Sci. 2001 Aug 29;356(1412):1209-28</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">11545699</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Biol Cybern. 1985;53(1):41-56</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">3841014</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Neuroinformatics. 2007 Summer;5(2):96-104</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">17873371</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Science. 1999 Jan 15;283(5400):381-7</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">9888852</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Neuroinformatics. 2007 Summer;5(2):127-38</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">17873374</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Nat Biotechnol. 2005 Dec;23(12):1509-15</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">16333295</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Biol Cybern. 1985;53(1):27-40</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">3841013</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>J Neurophysiol. 1995 Mar;73(3):1157-68</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">7608762</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+      <ReferenceList>
+        <Reference>
+          <Citation>Bioinformatics. 2003 Mar 1;19(4):524-31</Citation>
+          <ArticleIdList>
+            <ArticleId IdType="pubmed">12611808</ArticleId>
+          </ArticleIdList>
+        </Reference>
+      </ReferenceList>
+    </PubmedData>
+  </PubmedArticle>
+  
diff --git a/python/tests/files/pubmedsample_2019.xml.gz b/python/tests/files/pubmedsample_2019.xml.gz
new file mode 100644
index 00000000..bafad833
--- /dev/null
+++ b/python/tests/files/pubmedsample_2019.xml.gz
diff --git a/python/tests/files/pubmedsample_no_pmid_2019.xml.gz b/python/tests/files/pubmedsample_no_pmid_2019.xml.gz
new file mode 100644
index 00000000..8785a06d
--- /dev/null
+++ b/python/tests/files/pubmedsample_no_pmid_2019.xml.gz
diff --git a/python/tests/harvest_pubmed.py b/python/tests/harvest_pubmed.py
new file mode 100644
index 00000000..f8db46b6
--- /dev/null
+++ b/python/tests/harvest_pubmed.py
@@ -0,0 +1,80 @@
+"""
+Test pubmed FTP harvest.
+"""
+
+import datetime
+import json
+import os
+
+import pytest
+
+from fatcat_tools.harvest import *
+from fatcat_tools.harvest.pubmed import generate_date_file_map
+
+
+def test_pubmed_harvest_date(mocker):
+    """Fetch one date with the FTP download mocked; expect 176 produced records."""
+    # mock out the harvest state object so it doesn't try to actually connect
+    # to Kafka
+    mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')
+
+    # Mocking a file fetched from FTP, should contain some 'PubmedArticle' elements.
+    # $ zcat tests/files/pubmedsample_2019.xml.gz | grep -c '<PubmedArticle>'
+    # 176
+    file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_2019.xml.gz')
+    ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr')
+    ftpretr.return_value = file_to_retrieve
+
+    test_date = '2020-02-20'
+
+    # We'll need one entry in the date_file_map.
+    date_file_map_mock = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map')
+    date_file_map_mock.return_value = {test_date: {'dummy'}}
+
+    # For cleanup; mocker.patch() restores the real os.remove after the test.
+    remove_mock = mocker.patch('os.remove')
+
+    harvester = PubmedFTPWorker(
+        kafka_hosts="dummy",
+        produce_topic="dummy-produce-topic",
+        state_topic="dummy-state-topic",
+    )
+
+    harvester.producer = mocker.Mock()
+    harvester.date_file_map = date_file_map_mock()
+    # Since we mock out the FTP fetch, the concrete date does not matter here.
+    harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d'))
+
+    # check that the expected number of records were published to the (mock)
+    # kafka topic, and that cleanup went through the patched os.remove
+    assert harvester.producer.produce.call_count == 176
+    assert harvester.producer.flush.call_count == 1
+    assert remove_mock.call_count == 2
+
+def test_pubmed_harvest_date_no_pmid(mocker):
+    """Fetching the 'no_pmid' sample file must raise ValueError."""
+    # mock out the harvest state object so it doesn't try to connect to Kafka
+    mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')
+    # Guard the fixture against cleanup via os.remove; undone at test teardown.
+    mocker.patch('os.remove')
+
+    file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_no_pmid_2019.xml.gz')
+    ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr')
+    ftpretr.return_value = file_to_retrieve
+
+    test_date = '2020-02-20'
+    # We'll need one entry in the date_file_map.
+    date_file_map_mock = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map')
+    date_file_map_mock.return_value = {test_date: {'dummy'}}
+
+    harvester = PubmedFTPWorker(
+        kafka_hosts="dummy",
+        produce_topic="dummy-produce-topic",
+        state_topic="dummy-state-topic",
+    )
+    harvester.producer = mocker.Mock()
+
+    # The file has no PMID, so it is not importable.
+    with pytest.raises(ValueError):
+        harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d'))
+
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index 49609f75..f57aa273 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -118,6 +118,7 @@ def test_pubmed_xml_parse(pubmed_importer):
 
     assert r2.refs[0].extra['unstructured'] == "Microbiology. 2009 Jun;155(Pt 6):1840-6"
     assert r2.refs[0].extra['pmid'] == "19383690"
+    assert len(r2.refs) > 1
 
 def test_pubmed_xml_dates(pubmed_importer):
     with open('tests/files/pubmed_31393839.xml', 'r') as f:
@@ -126,3 +127,14 @@ def test_pubmed_xml_dates(pubmed_importer):
 
     assert r1.release_year == 2019
 
+def test_pubmed_xml_parse_refs(pubmed_importer):
+    """
+    Checks that references are collected when each Reference sits in its own
+    ReferenceList element, rather than all sharing one ReferenceList
+    """
+    with open('tests/files/pubmed_19129924.xml', 'r') as xml_file:
+        articles = BeautifulSoup(xml_file, "xml").find_all("PubmedArticle")
+    release = pubmed_importer.parse_record(articles[0])
+
+    assert len(release.refs) > 1
+
diff --git a/python/tests/transform_csl.py b/python/tests/transform_csl.py
index 6f29cba7..15c64ce5 100644
--- a/python/tests/transform_csl.py
+++ b/python/tests/transform_csl.py
@@ -12,22 +12,22 @@ def test_csl_crossref(crossref_importer):
         # not a single line
         raw = json.loads(f.read())
         r = crossref_importer.parse_record(raw)
-    # this work has some null contrib names; these should cause errors
-    with pytest.raises(ValueError):
-        release_to_csl(r)
-    with pytest.raises(ValueError):
-        csl = release_to_csl(r)
-        citeproc_csl(csl, 'csl-json')
-    # set with dummy so we can run other tests
-    for c in r.contribs:
-        if not c.raw_name:
-            c.raw_name = "dummy"
     csl = release_to_csl(r)
     citeproc_csl(csl, 'csl-json')
     citeproc_csl(csl, 'bibtex')
     citeproc_csl(csl, 'harvard1')
     citeproc_csl(csl, 'harvard1', html=True)
 
+    # check that with no author surnames, can't run
+    for c in r.contribs:
+        c.raw_name = None
+        c.surname = None
+    with pytest.raises(ValueError):
+        release_to_csl(r)
+    with pytest.raises(ValueError):
+        csl = release_to_csl(r)
+        citeproc_csl(csl, 'csl-json')
+
 def test_csl_pubmed(crossref_importer):
     with open('tests/files/example_releases_pubmed19n0972.json', 'r') as f:
         # multiple single lines
diff --git a/python/tests/web_citation_csl.py b/python/tests/web_citation_csl.py
index 3279ebea..e016b2d9 100644
--- a/python/tests/web_citation_csl.py
+++ b/python/tests/web_citation_csl.py
@@ -6,7 +6,7 @@ from fatcat_openapi_client.rest import ApiException
 from fixtures import *
 
 
-def test_release_bibtex(app):
+def test_release_bibtex(app, api):
 
     # "realistic" demo entity
     rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam')
@@ -17,6 +17,8 @@ def test_release_bibtex(app):
     assert b'@article{' in rv.data
     rv = app.get('/release/ccccccccccccccccccccccccca.bib')
     assert rv.status_code == 404
+    rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam/citeproc?style=bibtex')
+    assert rv.status_code == 200
     rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam/citeproc?style=csl-json')
     assert rv.status_code == 200
     # could also rv.get_json() here
@@ -25,10 +27,48 @@ def test_release_bibtex(app):
     assert rv.status_code == 200
     assert rv.data.decode('utf-8').startswith('Ioannidis, John. “Why Most Published Research Findings Are False”. 2.8 (2005)')
 
-    # "dummy" demo entity
+    # "dummy" demo entity; very minimal metadata
     rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai')
     assert rv.status_code == 200
+    assert b'BibTeX' in rv.data
+    rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai.bib')
+    assert rv.status_code == 200
+    rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/citeproc?style=modern-language-association')
+    assert rv.status_code == 200
+    rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/citeproc?style=csl-json')
+    assert rv.status_code == 200
+
+    # create release which can not have citeproc run on it (no authors)
+    eg = quick_eg(api)
+    r1 = ReleaseEntity(
+        title="some title",
+        ext_ids=ReleaseExtIds(),
+    )
+    r1edit = api.create_release(eg.editgroup_id, r1)
+    api.accept_editgroup(eg.editgroup_id)
+
+    rv = app.get('/release/{}'.format(r1edit.ident))
+    assert rv.status_code == 200
     assert not b'BibTeX' in rv.data
     with pytest.raises(ValueError):
-        rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai.bib')
+        rv = app.get('/release/{}.bib'.format(r1edit.ident))
+
+    # create release which can have citeproc run on it (has an author with a surname)
+    eg = quick_eg(api)
+    r2 = ReleaseEntity(
+        title="some title again",
+        contribs=[
+            ReleaseContrib(
+                given_name="Paul",
+                surname="Otlet"),
+        ],
+        ext_ids=ReleaseExtIds(),
+    )
+    r2edit = api.create_release(eg.editgroup_id, r2)
+    api.accept_editgroup(eg.editgroup_id)
 
+    rv = app.get('/release/{}'.format(r2edit.ident))
+    assert rv.status_code == 200
+    assert b'BibTeX' in rv.data
+    rv = app.get('/release/{}.bib'.format(r2edit.ident))
+    assert rv.status_code == 200
diff --git a/python/tests/web_entity_views.py b/python/tests/web_entity_views.py
index cc4c498f..a3f0f897 100644
--- a/python/tests/web_entity_views.py
+++ b/python/tests/web_entity_views.py
@@ -42,6 +42,8 @@ def test_entity_basics(app):
         assert rv.status_code == 200
         rv = app.get('/{}/rev/{}'.format(entity_type, revision))
         assert rv.status_code == 200
+        rv = app.get('/{}/rev/{}_something'.format(entity_type, revision))
+        assert rv.status_code == 400
         rv = app.get('/{}/rev/{}/metadata'.format(entity_type, revision))
         assert rv.status_code == 200
         print('/editgroup/aaaaaaaaaaaabo53aaaaaaaaaq/{}/{}'.format(entity_type, ident))