aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/harvest
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r--python/fatcat_tools/harvest/pubmed.py34
1 files changed, 30 insertions, 4 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index 802d31d8..f97702c0 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -9,16 +9,17 @@ Assumptions:
"""
import collections
+import ftplib
import gzip
import io
import os
import re
import shutil
+import socket
import sys
import tempfile
import time
import xml.etree.ElementTree as ET
-from ftplib import FTP
from urllib.parse import urlparse
import dateparser
@@ -168,15 +169,40 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
"""
mapping = collections.defaultdict(set)
pattern = re.compile(r'Filename: ([^ ]*.xml) -- Created: ([^<]*)')
- ftp = FTP(host)
+ ftp = ftplib.FTP(host)
ftp.login()
filenames = ftp.nlst('/pubmed/updatefiles')
+ retries, retry_delay = 10, 60
for name in filenames:
if not name.endswith('.html'):
continue
sio = io.StringIO()
- ftp.retrlines('RETR {}'.format(name), sio.write)
+ for i in range(retries):
+ try:
+ # Previously, from 2020-12-14 to 2021-06-30 everything worked
+ # fine, then a request for
+ # /pubmed/updatefiles/pubmed21n1328_stats.html would always
+ # fail with an EOFError or, when retried, with a BrokenPipeError
+ # (errno 32). We suspect the server dropped the connection for
+ # some unknown reason.
+ #
+ # Using a fresh client, the exact same file would work just
+ # fine. So when we retry, we set up a new client here as well.
+ if i > 0:
+ ftp = ftplib.FTP(host)
+ ftp.login()
+ sio.truncate(0)
+ ftp.retrlines('RETR {}'.format(name), sio.write)
+ except (EOFError, ftplib.error_temp, socket.gaierror, BrokenPipeError) as exc:
+ print("ftp retr on {} failed with {} ({}) ({} retries left)".format(
+ name, exc, type(exc), retries - (i + 1)), file=sys.stderr)
+ if i + 1 == retries:
+ raise
+ else:
+ time.sleep(retry_delay)
+ else:
+ break
contents = sio.getvalue()
match = pattern.search(contents)
if match is None:
@@ -205,7 +231,7 @@ def ftpretr(url):
"""
parsed = urlparse(url)
server, path = parsed.netloc, parsed.path
- ftp = FTP(server)
+ ftp = ftplib.FTP(server)
ftp.login()
with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr)