From 376053a479a8d683fc5e099d0b0b3cb76c026d16 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 19 Feb 2020 02:28:11 +0100 Subject: more pubmed adjustments * regenerate map in continuous mode * add tests --- python/tests/files/pubmedsample_2019.xml.gz | Bin 0 -> 218528 bytes .../tests/files/pubmedsample_no_pmid_2019.xml.gz | Bin 0 -> 1128 bytes python/tests/harvest_pubmed.py | 78 +++++++++++++++++++++ 3 files changed, 78 insertions(+) create mode 100644 python/tests/files/pubmedsample_2019.xml.gz create mode 100644 python/tests/files/pubmedsample_no_pmid_2019.xml.gz create mode 100644 python/tests/harvest_pubmed.py (limited to 'python/tests') diff --git a/python/tests/files/pubmedsample_2019.xml.gz b/python/tests/files/pubmedsample_2019.xml.gz new file mode 100644 index 00000000..bafad833 Binary files /dev/null and b/python/tests/files/pubmedsample_2019.xml.gz differ diff --git a/python/tests/files/pubmedsample_no_pmid_2019.xml.gz b/python/tests/files/pubmedsample_no_pmid_2019.xml.gz new file mode 100644 index 00000000..8785a06d Binary files /dev/null and b/python/tests/files/pubmedsample_no_pmid_2019.xml.gz differ diff --git a/python/tests/harvest_pubmed.py b/python/tests/harvest_pubmed.py new file mode 100644 index 00000000..71832722 --- /dev/null +++ b/python/tests/harvest_pubmed.py @@ -0,0 +1,78 @@ +""" +Test pubmed FTP harvest. +""" + +import datetime +import json +import os + +import pytest + +from fatcat_tools.harvest import * + + +def test_pubmed_harvest_date(mocker): + + # mock out the harvest state object so it doesn't try to actually connect + # to Kafka + mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka') + + # Mocking a file fetched from FTP, should contain some 'PubmedArticle' elements. 
+ # $ zcat tests/files/pubmedsample_2019.xml.gz | grep -c '<PubmedArticle>' + # 176 + file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_2019.xml.gz') + ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr') + ftpretr.return_value = file_to_retrieve + + test_date = '2020-02-20' + + # We'll need one entry in the date_file_map. + generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map') + generate_date_file_map.return_value = {test_date: set(['dummy'])} + + # For cleanup. + os.remove = mocker.Mock() + + harvester = PubmedFTPWorker( + kafka_hosts="dummy", + produce_topic="dummy-produce-topic", + state_topic="dummy-state-topic", + ) + + harvester.producer = mocker.Mock() + # Since we mock out the FTP fetch, the concrete date does not matter here. + harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d')) + + # check that the expected number of article objects were published + # to the (mock) kafka topic + assert harvester.producer.produce.call_count == 176 + assert harvester.producer.flush.call_count == 1 + assert os.remove.call_count == 2 + +def test_pubmed_harvest_date_no_pmid(mocker): + # mock out the harvest state object so it doesn't try to actually connect + # to Kafka + mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka') + + file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_no_pmid_2019.xml.gz') + ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr') + ftpretr.return_value = file_to_retrieve + + test_date = '2020-02-20' + + # We'll need one entry in the date_file_map. 
+ generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map') + generate_date_file_map.return_value = {test_date: set(['dummy'])} + + harvester = PubmedFTPWorker( + kafka_hosts="dummy", + produce_topic="dummy-produce-topic", + state_topic="dummy-state-topic", + ) + + harvester.producer = mocker.Mock() + + # The file has no PMID, so it is not importable. + with pytest.raises(ValueError): + harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d')) + -- cgit v1.2.3 From db8892f2d960379525a4182b884c1d51c0c70186 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 10 Mar 2020 12:51:11 +0100 Subject: pubmed: move mapping generation out of fetch_date * fetch_date will fail on missing mapping * adjust tests (test will require access to pubmed ftp) --- python/fatcat_tools/harvest/pubmed.py | 15 ++++++++------- python/tests/harvest_pubmed.py | 2 ++ 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'python/tests') diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index 43a671cd..34522eb3 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -94,13 +94,12 @@ class PubmedFTPWorker: def fetch_date(self, date): """ Fetch file for a given date and feed Kafka one article per message. If - the fetched XML does not contain a PMID, this method will fail. We - build up the mapping from dates to filenames on first run. + the fetched XML does not contain a PMID, this method will fail. + + If no date file mapping is found, this will fail. 
""" if self.date_file_map is None: - self.date_file_map = generate_date_file_map(host=self.host) - if len(self.date_file_map) == 0: - raise ValueError("map from dates to files should not be empty, maybe the HTML changed?") + raise ValueError("cannot fetch date without date file mapping") date_str = date.strftime('%Y-%m-%d') paths = self.date_file_map.get(date_str) @@ -141,6 +140,10 @@ class PubmedFTPWorker: def run(self, continuous=False): while True: + self.date_file_map = generate_date_file_map(host=self.host) + if len(self.date_file_map) == 0: + raise ValueError("map from dates to files should not be empty, maybe the HTML changed?") + current = self.state.next(continuous) if current: print("Fetching citations updated on {} (UTC)".format(current), file=sys.stderr) @@ -151,8 +154,6 @@ class PubmedFTPWorker: if continuous: print("Sleeping {} seconds...".format(self.loop_sleep)) time.sleep(self.loop_sleep) - # Need to keep the mapping fresh. - self.date_file_map = generate_date_file_map(host=self.host) else: break print("{} FTP ingest caught up".format(self.name)) diff --git a/python/tests/harvest_pubmed.py b/python/tests/harvest_pubmed.py index 71832722..f8db46b6 100644 --- a/python/tests/harvest_pubmed.py +++ b/python/tests/harvest_pubmed.py @@ -9,6 +9,7 @@ import os import pytest from fatcat_tools.harvest import * +from fatcat_tools.harvest.pubmed import generate_date_file_map def test_pubmed_harvest_date(mocker): @@ -40,6 +41,7 @@ def test_pubmed_harvest_date(mocker): ) harvester.producer = mocker.Mock() + harvester.date_file_map = generate_date_file_map() # Since we mock out the FTP fetch, the concrete date does not matter here. harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d')) -- cgit v1.2.3