summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/harvest
diff options
context:
space:
mode:
authorMartin Czygan <martin@archive.org>2021-08-21 18:46:48 +0000
committerMartin Czygan <martin@archive.org>2021-08-21 18:46:48 +0000
commitfddbc81f231fb59efb9c41dc460b6486c673e94c (patch)
treec3993f8ef93864cde31ca4dc13ac71d22ad16800 /python/fatcat_tools/harvest
parent178d3996e09f26e83b86cca7f70528cca42c4fbe (diff)
parenta4352a003a9fc7085638268ff00c05e305c519f5 (diff)
downloadfatcat-fddbc81f231fb59efb9c41dc460b6486c673e94c.tar.gz
fatcat-fddbc81f231fb59efb9c41dc460b6486c673e94c.zip
Merge branch 'martin-pubmed-eof-sentry-92151' into 'master'
pubmed harvester: add basic retry logic See merge request webgroup/fatcat!116
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r--python/fatcat_tools/harvest/pubmed.py29
1 files changed, 21 insertions, 8 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index fd4af1f4..d336de04 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -226,7 +226,7 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
return mapping
-def ftpretr(url):
+def ftpretr(url, max_retries=10, retry_delay=1):
"""
Note: This might move into a generic place in the future.
@@ -235,16 +235,29 @@ def ftpretr(url):
local temporary file. Returns the name of the local, closed temporary file.
It is the responsibility of the caller to clean up the temporary file.
+
+ Implements a basic retry mechanism. This became necessary in 08/2021, when we
+ encountered EOFError while talking to the FTP server. The retry delay is in seconds.
"""
parsed = urlparse(url)
server, path = parsed.netloc, parsed.path
- ftp = ftplib.FTP(server)
- ftp.login()
- with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
- print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr)
- ftp.retrbinary('RETR %s' % path, f.write)
- ftp.close()
- return f.name
+ for i in range(max_retries):
+ try:
+ ftp = ftplib.FTP(server)
+ ftp.login()
+ with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
+ print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr)
+ ftp.retrbinary('RETR %s' % path, f.write)
+ ftp.close()
+ except EOFError as exc:
+ print("ftp retrbinary on {} failed with {} ({}) ({} retries left)".format(
+ path, exc, type(exc), max_retries - (i + 1)), file=sys.stderr)
+ if i + 1 == max_retries:
+ raise
+ else:
+ time.sleep(retry_delay)
+ else:
+ return f.name
def xmlstream(filename, tag, encoding='utf-8'):