diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-08-20 22:32:19 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-08-20 22:32:19 +0200 |
commit | a4352a003a9fc7085638268ff00c05e305c519f5 (patch) | |
tree | c3993f8ef93864cde31ca4dc13ac71d22ad16800 /python | |
parent | 178d3996e09f26e83b86cca7f70528cca42c4fbe (diff) | |
download | fatcat-a4352a003a9fc7085638268ff00c05e305c519f5.tar.gz fatcat-a4352a003a9fc7085638268ff00c05e305c519f5.zip |
pubmed harvester: add basic retry logic
Related to a previous issue with seemingly random EOFError from FTP
connections, this patch wraps the "ftpretr" helper function with a basic
retry.
Refs: fatcat-workers/issues/92151, fatcat-workers/issues/91102
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 29 |
1 files changed, 21 insertions, 8 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index fd4af1f4..d336de04 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -226,7 +226,7 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'): return mapping -def ftpretr(url): +def ftpretr(url, max_retries=10, retry_delay=1): """ Note: This might move into a generic place in the future. @@ -235,16 +235,29 @@ def ftpretr(url): local temporary file. Returns the name of the local, closed temporary file. It is the reponsibility of the caller to cleanup the temporary file. + + Implements a basic retry mechanism, e.g. that became an issue in 08/2021, + when we encountered EOFError while talking to the FTP server. Retry delay in seconds. """ parsed = urlparse(url) server, path = parsed.netloc, parsed.path - ftp = ftplib.FTP(server) - ftp.login() - with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: - print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) - ftp.retrbinary('RETR %s' % path, f.write) - ftp.close() - return f.name + for i in range(max_retries): + try: + ftp = ftplib.FTP(server) + ftp.login() + with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: + print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) + ftp.retrbinary('RETR %s' % path, f.write) + ftp.close() + except EOFError as exc: + print("ftp retrbinary on {} failed with {} ({}) ({} retries left)".format( + path, exc, type(exc), max_retries - (i + 1)), file=sys.stderr) + if i + 1 == max_retries: + raise + else: + time.sleep(retry_delay) + else: + return f.name def xmlstream(filename, tag, encoding='utf-8'): |