summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/harvest
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-03-10 12:51:11 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-03-10 12:51:11 +0100
commit: db8892f2d960379525a4182b884c1d51c0c70186 (patch)
tree: 538e9a907425bc718fe3b0cf8e21b06a366525ea /python/fatcat_tools/harvest
parent: 34a18cd1821d09ac0beee8959407ec51cf397337 (diff)
download: fatcat-db8892f2d960379525a4182b884c1d51c0c70186.tar.gz
          fatcat-db8892f2d960379525a4182b884c1d51c0c70186.zip
pubmed: move mapping generation out of fetch_date
* fetch_date will fail on missing mapping
* adjust tests (test will require access to pubmed ftp)
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r-- python/fatcat_tools/harvest/pubmed.py | 15
1 files changed, 8 insertions, 7 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index 43a671cd..34522eb3 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -94,13 +94,12 @@ class PubmedFTPWorker:
def fetch_date(self, date):
"""
Fetch file for a given date and feed Kafka one article per message. If
- the fetched XML does not contain a PMID, this method will fail. We
- build up the mapping from dates to filenames on first run.
+ the fetched XML does not contain a PMID, this method will fail.
+
+ If no date file mapping is found, this will fail.
"""
if self.date_file_map is None:
- self.date_file_map = generate_date_file_map(host=self.host)
- if len(self.date_file_map) == 0:
- raise ValueError("map from dates to files should not be empty, maybe the HTML changed?")
+ raise ValueError("cannot fetch date without date file mapping")
date_str = date.strftime('%Y-%m-%d')
paths = self.date_file_map.get(date_str)
@@ -141,6 +140,10 @@ class PubmedFTPWorker:
def run(self, continuous=False):
while True:
+ self.date_file_map = generate_date_file_map(host=self.host)
+ if len(self.date_file_map) == 0:
+ raise ValueError("map from dates to files should not be empty, maybe the HTML changed?")
+
current = self.state.next(continuous)
if current:
print("Fetching citations updated on {} (UTC)".format(current), file=sys.stderr)
@@ -151,8 +154,6 @@ class PubmedFTPWorker:
if continuous:
print("Sleeping {} seconds...".format(self.loop_sleep))
time.sleep(self.loop_sleep)
- # Need to keep the mapping fresh.
- self.date_file_map = generate_date_file_map(host=self.host)
else:
break
print("{} FTP ingest caught up".format(self.name))