summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/harvest
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r-- python/fatcat_tools/harvest/pubmed.py | 45
1 files changed, 21 insertions, 24 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index 48f634d0..579dd505 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -115,7 +115,7 @@ class PubmedFTPWorker:
for path in paths:
# Fetch and decompress file.
url = "ftp://{}{}".format(self.host, path)
- filename = ftpretr(url, use_lftp=True)
+ filename = ftpretr(url, proxy_hostport="159.69.240.245:15201") # TODO: proxy obsolete, when networking issue is resolved
with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp:
try:
gzf = gzip.open(filename)
@@ -227,7 +227,7 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
return mapping
-def ftpretr(url, max_retries=10, retry_delay=1, use_lftp=False):
+def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None):
"""
Note: This might move into a generic place in the future.
@@ -240,8 +240,8 @@ def ftpretr(url, max_retries=10, retry_delay=1, use_lftp=False):
Implements a basic retry mechanism, e.g. that became an issue in 08/2021,
when we encountered EOFError while talking to the FTP server. Retry delay in seconds.
"""
- if use_lftp is True:
- return ftpretr_lftp(url, max_retries=max_retries, retry_delay=retry_delay)
+ if proxy_hostport is not None:
+ return ftpretr_via_http_proxy(url, proxy_hostport, max_retries=max_retries, retry_delay=retry_delay)
parsed = urlparse(url)
server, path = parsed.netloc, parsed.path
for i in range(max_retries):
@@ -263,30 +263,27 @@ def ftpretr(url, max_retries=10, retry_delay=1, use_lftp=False):
return f.name
-def ftpretr_lftp(url, max_retries=10, retry_delay=1):
def ftpretr_via_http_proxy(url, proxy_hostport="159.69.240.245:15201", max_retries=10, retry_delay=1):
    """
    Fetch file from FTP via external HTTP proxy, e.g. ftp.host.com:/a/b/c would
    be retrievable via proxy.com/a/b/c.

    Only the path component of `url` is used; the host part is replaced by
    `proxy_hostport` and the file is requested over plain HTTP with `wget -c`.
    The payload is written to a named temporary file (prefix
    "fatcat-ftp-tmp-") which is NOT removed automatically; the caller owns it.

    Up to `max_retries` attempts are made (expected to be >= 1), sleeping
    `retry_delay` seconds between attempts. All attempts write to the same
    output file, so that `wget -c` can resume a partial download instead of
    starting over.

    Returns the path of the downloaded file.

    Raises subprocess.CalledProcessError if wget still exits non-zero on the
    last attempt, or OSError/ValueError if wget could not be started.
    """
    path = urlparse(url).path
    proxy_url = "http://{}{}".format(proxy_hostport, path)

    # Reserve the output file once, up front: a fresh file per attempt would
    # make "-c" (continue) pointless and leave one stray file per failure.
    with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
        filename = f.name
    cmd = ["wget", "-c", proxy_url, "-O", filename]

    for i in range(max_retries):
        print("retrieving file via proxy (ftpup) from {}".format(proxy_url), file=sys.stderr)
        try:
            # check=True turns a non-zero wget exit status into
            # CalledProcessError; without it a failed download would be
            # returned as if it had succeeded and no retry would happen.
            subprocess.run(cmd, check=True)
        except (subprocess.CalledProcessError, OSError, ValueError) as exc:
            print("ftp fetch {} failed with {} ({}) ({} retries left)".format(
                proxy_url, exc, type(exc), max_retries - (i + 1)), file=sys.stderr)
            if i + 1 == max_retries:
                raise
            time.sleep(retry_delay)
        else:
            return filename
def xmlstream(filename, tag, encoding='utf-8'):