From a4352a003a9fc7085638268ff00c05e305c519f5 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 20 Aug 2021 22:32:19 +0200 Subject: pubmed harvester: add basic retry logic Related to a previous issue with seemingly random EOFError from FTP connections, this patch wraps the "ftpretr" helper function with a basic retry. Refs: fatcat-workers/issues/92151, fatcat-workers/issues/91102 --- python/fatcat_tools/harvest/pubmed.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index fd4af1f4..d336de04 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -226,7 +226,7 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'): return mapping -def ftpretr(url): +def ftpretr(url, max_retries=10, retry_delay=1): """ Note: This might move into a generic place in the future. @@ -235,16 +235,29 @@ def ftpretr(url): local temporary file. Returns the name of the local, closed temporary file. It is the reponsibility of the caller to cleanup the temporary file. + + Implements a basic retry mechanism, e.g. that became an issue in 08/2021, + when we encountered EOFError while talking to the FTP server. Retry delay in seconds. 
""" parsed = urlparse(url) server, path = parsed.netloc, parsed.path - ftp = ftplib.FTP(server) - ftp.login() - with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: - print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) - ftp.retrbinary('RETR %s' % path, f.write) - ftp.close() - return f.name + for i in range(max_retries): + try: + ftp = ftplib.FTP(server) + ftp.login() + with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: + print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) + ftp.retrbinary('RETR %s' % path, f.write) + ftp.close() + except EOFError as exc: + print("ftp retrbinary on {} failed with {} ({}) ({} retries left)".format( + path, exc, type(exc), max_retries - (i + 1)), file=sys.stderr) + if i + 1 == max_retries: + raise + else: + time.sleep(retry_delay) + else: + return f.name def xmlstream(filename, tag, encoding='utf-8'): -- cgit v1.2.3