diff options
author | Martin Czygan <martin@archive.org> | 2021-08-21 18:46:48 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2021-08-21 18:46:48 +0000 |
commit | fddbc81f231fb59efb9c41dc460b6486c673e94c (patch) | |
tree | c3993f8ef93864cde31ca4dc13ac71d22ad16800 | |
parent | 178d3996e09f26e83b86cca7f70528cca42c4fbe (diff) | |
parent | a4352a003a9fc7085638268ff00c05e305c519f5 (diff) | |
download | fatcat-fddbc81f231fb59efb9c41dc460b6486c673e94c.tar.gz fatcat-fddbc81f231fb59efb9c41dc460b6486c673e94c.zip |
Merge branch 'martin-pubmed-eof-sentry-92151' into 'master'
pubmed harvester: add basic retry logic
See merge request webgroup/fatcat!116
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 29 |
1 files changed, 21 insertions, 8 deletions
```diff
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index fd4af1f4..d336de04 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -226,7 +226,7 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
     return mapping
 
 
-def ftpretr(url):
+def ftpretr(url, max_retries=10, retry_delay=1):
     """
     Note: This might move into a generic place in the future.
 
@@ -235,16 +235,29 @@ def ftpretr(url):
     local temporary file. Returns the name of the local, closed temporary file.
 
     It is the reponsibility of the caller to cleanup the temporary file.
+
+    Implements a basic retry mechanism, e.g. that became an issue in 08/2021,
+    when we encountered EOFError while talking to the FTP server. Retry delay in seconds.
     """
     parsed = urlparse(url)
     server, path = parsed.netloc, parsed.path
-    ftp = ftplib.FTP(server)
-    ftp.login()
-    with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
-        print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr)
-        ftp.retrbinary('RETR %s' % path, f.write)
-    ftp.close()
-    return f.name
+    for i in range(max_retries):
+        try:
+            ftp = ftplib.FTP(server)
+            ftp.login()
+            with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
+                print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr)
+                ftp.retrbinary('RETR %s' % path, f.write)
+            ftp.close()
+        except EOFError as exc:
+            print("ftp retrbinary on {} failed with {} ({}) ({} retries left)".format(
+                path, exc, type(exc), max_retries - (i + 1)), file=sys.stderr)
+            if i + 1 == max_retries:
+                raise
+            else:
+                time.sleep(retry_delay)
+        else:
+            return f.name
 
 
 def xmlstream(filename, tag, encoding='utf-8'):
```