From 376053a479a8d683fc5e099d0b0b3cb76c026d16 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 19 Feb 2020 02:28:11 +0100 Subject: more pubmed adjustments * regenerate map in continuous mode * add tests --- python/tests/files/pubmedsample_2019.xml.gz | Bin 0 -> 218528 bytes .../tests/files/pubmedsample_no_pmid_2019.xml.gz | Bin 0 -> 1128 bytes python/tests/harvest_pubmed.py | 78 +++++++++++++++++++++ 3 files changed, 78 insertions(+) create mode 100644 python/tests/files/pubmedsample_2019.xml.gz create mode 100644 python/tests/files/pubmedsample_no_pmid_2019.xml.gz create mode 100644 python/tests/harvest_pubmed.py (limited to 'python/tests') diff --git a/python/tests/files/pubmedsample_2019.xml.gz b/python/tests/files/pubmedsample_2019.xml.gz new file mode 100644 index 00000000..bafad833 Binary files /dev/null and b/python/tests/files/pubmedsample_2019.xml.gz differ diff --git a/python/tests/files/pubmedsample_no_pmid_2019.xml.gz b/python/tests/files/pubmedsample_no_pmid_2019.xml.gz new file mode 100644 index 00000000..8785a06d Binary files /dev/null and b/python/tests/files/pubmedsample_no_pmid_2019.xml.gz differ diff --git a/python/tests/harvest_pubmed.py b/python/tests/harvest_pubmed.py new file mode 100644 index 00000000..71832722 --- /dev/null +++ b/python/tests/harvest_pubmed.py @@ -0,0 +1,78 @@ +""" +Test pubmed FTP harvest. +""" + +import datetime +import json +import os + +import pytest + +from fatcat_tools.harvest import * + + +def test_pubmed_harvest_date(mocker): + + # mock out the harvest state object so it doesn't try to actually connect + # to Kafka + mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka') + + # Mocking a file fetched from FTP, should contain some 'PubmedArticle' elements. + # $ zcat tests/files/pubmedsample_2019.xml.gz | grep -c '' + # 176 + file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_2019.xml.gz') + ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr') + ftpretr.return_value = file_to_retrieve + + test_date = '2020-02-20' + + # We'll need one entry in the date_file_map. + generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map') + generate_date_file_map.return_value = {test_date: set(['dummy'])} + + # For cleanup. + os.remove = mocker.Mock() + + harvester = PubmedFTPWorker( + kafka_hosts="dummy", + produce_topic="dummy-produce-topic", + state_topic="dummy-state-topic", + ) + + harvester.producer = mocker.Mock() + # Since we mock out the FTP fetch, the concrete date does not matter here. + harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d')) + + # check that we published the expected number of DOI objects were published + # to the (mock) kafka topic + assert harvester.producer.produce.call_count == 176 + assert harvester.producer.flush.call_count == 1 + assert os.remove.call_count == 2 + +def test_pubmed_harvest_date_no_pmid(mocker): + # mock out the harvest state object so it doesn't try to actually connect + # to Kafka + mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka') + + file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_no_pmid_2019.xml.gz') + ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr') + ftpretr.return_value = file_to_retrieve + + test_date = '2020-02-20' + + # We'll need one entry in the date_file_map. + generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map') + generate_date_file_map.return_value = {test_date: set(['dummy'])} + + harvester = PubmedFTPWorker( + kafka_hosts="dummy", + produce_topic="dummy-produce-topic", + state_topic="dummy-state-topic", + ) + + harvester.producer = mocker.Mock() + + # The file has not PMID, not importable. + with pytest.raises(ValueError): + harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d')) + -- cgit v1.2.3