diff options
Diffstat (limited to 'python/fatcat_tools/harvest')
| -rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 29 | 
1 files changed, 21 insertions, 8 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index fd4af1f4..d336de04 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -226,7 +226,7 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):      return mapping -def ftpretr(url): +def ftpretr(url, max_retries=10, retry_delay=1):      """      Note: This might move into a generic place in the future. @@ -235,16 +235,29 @@ def ftpretr(url):      local temporary file. Returns the name of the local, closed temporary file.      It is the responsibility of the caller to cleanup the temporary file. + +    Implements a basic retry mechanism, which became necessary in 08/2021, +    when we encountered EOFError while talking to the FTP server. Retry delay in seconds.      """      parsed = urlparse(url)      server, path = parsed.netloc, parsed.path -    ftp = ftplib.FTP(server) -    ftp.login() -    with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: -        print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) -        ftp.retrbinary('RETR %s' % path, f.write) -    ftp.close() -    return f.name +    for i in range(max_retries): +        try: +            ftp = ftplib.FTP(server) +            ftp.login() +            with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: +                print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) +                ftp.retrbinary('RETR %s' % path, f.write) +            ftp.close() +        except EOFError as exc: +            print("ftp retrbinary on {} failed with {} ({}) ({} retries left)".format( +                path, exc, type(exc), max_retries - (i + 1)), file=sys.stderr) +            if i + 1 == max_retries: +                raise +            else: +                time.sleep(retry_delay) +        else: +            return f.name  def xmlstream(filename, tag, 
encoding='utf-8'):  | 
