diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-03-10 12:51:11 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-03-10 12:51:11 +0100 |
commit | db8892f2d960379525a4182b884c1d51c0c70186 (patch) | |
tree | 538e9a907425bc718fe3b0cf8e21b06a366525ea /python | |
parent | 34a18cd1821d09ac0beee8959407ec51cf397337 (diff) | |
download | fatcat-db8892f2d960379525a4182b884c1d51c0c70186.tar.gz fatcat-db8892f2d960379525a4182b884c1d51c0c70186.zip |
pubmed: move mapping generation out of fetch_date
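* fetch_date now raises a ValueError if the date-to-file mapping is missing; the mapping is generated in run() on each loop iteration instead
* adjust tests (the test now calls generate_date_file_map and therefore requires access to the PubMed FTP server)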
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 15 | ||||
-rw-r--r-- | python/tests/harvest_pubmed.py | 2 |
2 files changed, 10 insertions, 7 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index 43a671cd..34522eb3 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -94,13 +94,12 @@ class PubmedFTPWorker: def fetch_date(self, date): """ Fetch file for a given date and feed Kafka one article per message. If - the fetched XML does not contain a PMID, this method will fail. We - build up the mapping from dates to filenames on first run. + the fetched XML does not contain a PMID, this method will fail. + + If no date file mapping is found, this will fail. """ if self.date_file_map is None: - self.date_file_map = generate_date_file_map(host=self.host) - if len(self.date_file_map) == 0: - raise ValueError("map from dates to files should not be empty, maybe the HTML changed?") + raise ValueError("cannot fetch date without date file mapping") date_str = date.strftime('%Y-%m-%d') paths = self.date_file_map.get(date_str) @@ -141,6 +140,10 @@ class PubmedFTPWorker: def run(self, continuous=False): while True: + self.date_file_map = generate_date_file_map(host=self.host) + if len(self.date_file_map) == 0: + raise ValueError("map from dates to files should not be empty, maybe the HTML changed?") + current = self.state.next(continuous) if current: print("Fetching citations updated on {} (UTC)".format(current), file=sys.stderr) @@ -151,8 +154,6 @@ class PubmedFTPWorker: if continuous: print("Sleeping {} seconds...".format(self.loop_sleep)) time.sleep(self.loop_sleep) - # Need to keep the mapping fresh. 
- self.date_file_map = generate_date_file_map(host=self.host) else: break print("{} FTP ingest caught up".format(self.name)) diff --git a/python/tests/harvest_pubmed.py b/python/tests/harvest_pubmed.py index 71832722..f8db46b6 100644 --- a/python/tests/harvest_pubmed.py +++ b/python/tests/harvest_pubmed.py @@ -9,6 +9,7 @@ import os import pytest from fatcat_tools.harvest import * +from fatcat_tools.harvest.pubmed import generate_date_file_map def test_pubmed_harvest_date(mocker): @@ -40,6 +41,7 @@ def test_pubmed_harvest_date(mocker): ) harvester.producer = mocker.Mock() + harvester.date_file_map = generate_date_file_map() # Since we mock out the FTP fetch, the concrete date does not matter here. harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d')) |