diff options
author | Martin Czygan <martin@archive.org> | 2021-07-16 22:55:30 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2021-07-16 22:55:30 +0000 |
commit | d1067924b750d691f64da75b1f0a2e4d454dabc8 (patch) | |
tree | f85c88b1a4ada18408a07845ae74ba7ccb97aac4 /python | |
parent | 01515fd6e8c3d3999ff5f441ac06a3b1c371d07a (diff) | |
parent | 47c98540083da302802b54a70f77fb8abc69b4de (diff) | |
download | fatcat-d1067924b750d691f64da75b1f0a2e4d454dabc8.tar.gz fatcat-d1067924b750d691f64da75b1f0a2e4d454dabc8.zip |
Merge branch 'martin-pubmed-eof-sentry-91102' into 'master'
pubmed: reconnect on error
See merge request webgroup/fatcat!110
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 34 |
1 file changed, 30 insertions, 4 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index 802d31d8..f97702c0 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -9,16 +9,17 @@ Assumptions: """ import collections +import ftplib import gzip import io import os import re import shutil +import socket import sys import tempfile import time import xml.etree.ElementTree as ET -from ftplib import FTP from urllib.parse import urlparse import dateparser @@ -168,15 +169,40 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'): """ mapping = collections.defaultdict(set) pattern = re.compile(r'Filename: ([^ ]*.xml) -- Created: ([^<]*)') - ftp = FTP(host) + ftp = ftplib.FTP(host) ftp.login() filenames = ftp.nlst('/pubmed/updatefiles') + retries, retry_delay = 10, 60 for name in filenames: if not name.endswith('.html'): continue sio = io.StringIO() - ftp.retrlines('RETR {}'.format(name), sio.write) + for i in range(retries): + try: + # Previously, from 2020-12-14 to 2021-06-30 everything worked + # fine, then a request for + # /pubmed/updatefiles/pubmed21n1328_stats.html would always + # fail with an EOFError, or when retried with a 32 + # BrokenPipeError. Suspecting the server for some unknown + # reason dropped the connection. + # + # Using a fresh client, the exact same file would work just + # fine. So when we retry, we setup a new client here as well. 
+ if i > 0: + ftp = ftplib.FTP(host) + ftp.login() + sio.truncate(0) + ftp.retrlines('RETR {}'.format(name), sio.write) + except (EOFError, ftplib.error_temp, socket.gaierror, BrokenPipeError) as exc: + print("ftp retr on {} failed with {} ({}) ({} retries left)".format( + name, exc, type(exc), retries - (i + 1)), file=sys.stderr) + if i + 1 == retries: + raise + else: + time.sleep(retry_delay) + else: + break contents = sio.getvalue() match = pattern.search(contents) if match is None: @@ -205,7 +231,7 @@ def ftpretr(url): """ parsed = urlparse(url) server, path = parsed.netloc, parsed.path - ftp = FTP(server) + ftp = ftplib.FTP(server) ftp.login() with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) |