summaryrefslogtreecommitdiffstats
path: root/python/tests/harvest_pubmed.py
diff options
context:
space:
mode:
authorMartin Czygan <martin@archive.org>2020-03-10 15:33:17 +0000
committerMartin Czygan <martin@archive.org>2020-03-10 15:33:17 +0000
commit336630e1d445fb9d233447f9af4bac94473a12bf (patch)
treeb2d4baa4ea6d3afac29b9b2760101c10d18ea30a /python/tests/harvest_pubmed.py
parentf4cce5a765a9f80f9c5e9c907689c06dc9ebf102 (diff)
parentd18942d1ab4d394bdb275bcf9eb82d1cba814775 (diff)
downloadfatcat-336630e1d445fb9d233447f9af4bac94473a12bf.tar.gz
fatcat-336630e1d445fb9d233447f9af4bac94473a12bf.zip
Merge branch 'martin-kafka-bs4-import' into 'master'
pubmed and arxiv harvest preparations See merge request webgroup/fatcat!28
Diffstat (limited to 'python/tests/harvest_pubmed.py')
-rw-r--r--python/tests/harvest_pubmed.py80
1 files changed, 80 insertions, 0 deletions
diff --git a/python/tests/harvest_pubmed.py b/python/tests/harvest_pubmed.py
new file mode 100644
index 00000000..f8db46b6
--- /dev/null
+++ b/python/tests/harvest_pubmed.py
@@ -0,0 +1,80 @@
+"""
+Test pubmed FTP harvest.
+"""
+
+import datetime
+import json
+import os
+
+import pytest
+
+from fatcat_tools.harvest import *
+from fatcat_tools.harvest.pubmed import generate_date_file_map
+
+
+def test_pubmed_harvest_date(mocker):
+
+ # mock out the harvest state object so it doesn't try to actually connect
+ # to Kafka
+ mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')
+
+ # Mocking a file fetched from FTP, should contain some 'PubmedArticle' elements.
+ # $ zcat tests/files/pubmedsample_2019.xml.gz | grep -c '<PubmedArticle>'
+ # 176
+ file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_2019.xml.gz')
+ ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr')
+ ftpretr.return_value = file_to_retrieve
+
+ test_date = '2020-02-20'
+
+ # We'll need one entry in the date_file_map.
+ generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map')
+ generate_date_file_map.return_value = {test_date: set(['dummy'])}
+
+ # For cleanup.
+ os.remove = mocker.Mock()
+
+ harvester = PubmedFTPWorker(
+ kafka_hosts="dummy",
+ produce_topic="dummy-produce-topic",
+ state_topic="dummy-state-topic",
+ )
+
+ harvester.producer = mocker.Mock()
+ harvester.date_file_map = generate_date_file_map()
+ # Since we mock out the FTP fetch, the concrete date does not matter here.
+ harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d'))
+
+ # check that we published the expected number of DOI objects were published
+ # to the (mock) kafka topic
+ assert harvester.producer.produce.call_count == 176
+ assert harvester.producer.flush.call_count == 1
+ assert os.remove.call_count == 2
+
+def test_pubmed_harvest_date_no_pmid(mocker):
+ # mock out the harvest state object so it doesn't try to actually connect
+ # to Kafka
+ mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')
+
+ file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_no_pmid_2019.xml.gz')
+ ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr')
+ ftpretr.return_value = file_to_retrieve
+
+ test_date = '2020-02-20'
+
+ # We'll need one entry in the date_file_map.
+ generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map')
+ generate_date_file_map.return_value = {test_date: set(['dummy'])}
+
+ harvester = PubmedFTPWorker(
+ kafka_hosts="dummy",
+ produce_topic="dummy-produce-topic",
+ state_topic="dummy-state-topic",
+ )
+
+ harvester.producer = mocker.Mock()
+
+ # The file has not PMID, not importable.
+ with pytest.raises(ValueError):
+ harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d'))
+