diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-16 14:42:30 +0200 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-16 23:45:48 +0200 | 
| commit | 47c98540083da302802b54a70f77fb8abc69b4de (patch) | |
| tree | f85c88b1a4ada18408a07845ae74ba7ccb97aac4 /python/fatcat_tools/harvest | |
| parent | 01515fd6e8c3d3999ff5f441ac06a3b1c371d07a (diff) | |
| download | fatcat-47c98540083da302802b54a70f77fb8abc69b4de.tar.gz fatcat-47c98540083da302802b54a70f77fb8abc69b4de.zip | |
pubmed: reconnect on error
FTP retrieval would run but fail with an EOFError on
/pubmed/updatefiles/pubmed21n1328_stats.html; we were not able to find the root
cause. Using a fresh client, the exact same file would work just
fine. So when a request fails, we now reconnect with a new client before retrying.
Refs: sentry #91102.
Diffstat (limited to 'python/fatcat_tools/harvest')
| -rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 34 | 
1 files changed, 30 insertions, 4 deletions
| diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index 802d31d8..f97702c0 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -9,16 +9,17 @@ Assumptions:  """  import collections +import ftplib  import gzip  import io  import os  import re  import shutil +import socket  import sys  import tempfile  import time  import xml.etree.ElementTree as ET -from ftplib import FTP  from urllib.parse import urlparse  import dateparser @@ -168,15 +169,40 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):      """      mapping = collections.defaultdict(set)      pattern = re.compile(r'Filename: ([^ ]*.xml) -- Created: ([^<]*)') -    ftp = FTP(host) +    ftp = ftplib.FTP(host)      ftp.login()      filenames = ftp.nlst('/pubmed/updatefiles') +    retries, retry_delay = 10, 60      for name in filenames:          if not name.endswith('.html'):              continue          sio = io.StringIO() -        ftp.retrlines('RETR {}'.format(name), sio.write) +        for i in range(retries): +            try: +                # Previously, from 2020-12-14 to 2021-06-30 everything worked +                # fine, then a request for +                # /pubmed/updatefiles/pubmed21n1328_stats.html would always +                # fail with an EOFError, or when retried with a 32 +                # BrokenPipeError. Suspecting the server for some unknown +                # reason dropped the connection. +                # +                # Using a fresh client, the exact same file would work just +                # fine. So when we retry, we setup a new client here as well. 
+                if i > 0: +                    ftp = ftplib.FTP(host) +                    ftp.login() +                    sio.truncate(0) +                ftp.retrlines('RETR {}'.format(name), sio.write) +            except (EOFError, ftplib.error_temp, socket.gaierror, BrokenPipeError) as exc: +                print("ftp retr on {} failed with {} ({}) ({} retries left)".format( +                    name, exc, type(exc), retries - (i + 1)), file=sys.stderr) +                if i + 1 == retries: +                    raise +                else: +                    time.sleep(retry_delay) +            else: +                break          contents = sio.getvalue()          match = pattern.search(contents)          if match is None: @@ -205,7 +231,7 @@ def ftpretr(url):      """      parsed = urlparse(url)      server, path = parsed.netloc, parsed.path -    ftp = FTP(server) +    ftp = ftplib.FTP(server)      ftp.login()      with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:          print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) | 
