| field | value | date |
|---|---|---|
| author | Martin Czygan <martin.czygan@gmail.com> | 2021-08-30 23:23:28 +0200 |
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-09-08 22:02:48 +0200 |
| commit | 6d9d67c9c4d1a0b208fc2056ab485a1c8d21e100 (patch) | |
| tree | 323e38da3515dea702a3d244b2724348aaab61bf | |
| parent | fddbc81f231fb59efb9c41dc460b6486c673e94c (diff) | |
| download | fatcat-6d9d67c9c4d1a0b208fc2056ab485a1c8d21e100.tar.gz, fatcat-6d9d67c9c4d1a0b208fc2056ab485a1c8d21e100.zip | |
pubmed: add option to ftp download with lftp
lftp is a classic command-line FTP client, and we hope that its retry
capabilities are enough of a workaround for the current networking issue.
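
For context, the retry behaviour this relies on comes from lftp's `net:max-retries` and `net:reconnect-interval-base` settings, which the patch passes to lftp via `-e`. The sketch below shows the kind of invocation the new helper ends up building; it assumes `lftp` is installed, and the update-file path is a hypothetical example, not taken from the patch.

```python
# Sketch only: illustrates the lftp call built by the new helper.
# The update-file path below is a hypothetical example.
import subprocess

path = "/pubmed/updatefiles/pubmed21n1234.xml.gz"  # hypothetical
out = "/tmp/pubmed21n1234.xml.gz"

script = "set net:max-retries {}; set net:reconnect-interval-base {}; pget -c {} -o {}; exit".format(
    10, 1, path, out
)
subprocess.run(
    ["lftp", "-u", "anonymous,anonymous", "-e", script, "ftp.ncbi.nlm.nih.gov"],
    check=True,  # fail loudly if lftp still cannot complete the transfer
)
```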
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 33 |

1 file changed, 31 insertions(+), 2 deletions(-)
```diff
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index d336de04..48f634d0 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -16,6 +16,7 @@ import os
 import re
 import shutil
 import socket
+import subprocess
 import sys
 import tempfile
 import time
@@ -114,7 +115,7 @@ class PubmedFTPWorker:
         for path in paths:
             # Fetch and decompress file.
             url = "ftp://{}{}".format(self.host, path)
-            filename = ftpretr(url)
+            filename = ftpretr(url, use_lftp=True)
             with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp:
                 try:
                     gzf = gzip.open(filename)
@@ -226,7 +227,7 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
     return mapping
 
 
-def ftpretr(url, max_retries=10, retry_delay=1):
+def ftpretr(url, max_retries=10, retry_delay=1, use_lftp=False):
     """
     Note: This might move into a generic place in the future.
 
@@ -239,6 +240,8 @@ def ftpretr(url, max_retries=10, retry_delay=1):
     Implements a basic retry mechanism, e.g. that became an issue in 08/2021,
     when we encountered EOFError while talking to the FTP server. Retry delay in seconds.
     """
+    if use_lftp is True:
+        return ftpretr_lftp(url, max_retries=max_retries, retry_delay=retry_delay)
     parsed = urlparse(url)
     server, path = parsed.netloc, parsed.path
     for i in range(max_retries):
@@ -260,6 +263,32 @@
             return f.name
 
 
+def ftpretr_lftp(url, max_retries=10, retry_delay=1):
+    """
+    Same as ftpretr, but mirrors the relevant files beforehand, then picks out
+    the requested file. Requires a few GB spare space for the mirror.
+
+    Mirrors everything from `path` on `host` to `sync_dir`, which will be under
+    the system tempdir (cf. `systemctl status systemd-tmpfiles-clean.timer`) by default.
+
+    Workaround, since networking issues (probably internet2) limit our
+    bandwith; and we cannot hold a conn longer than about 90 seconds with the
+    python ftp lib or curl. Mitigation through a hopefully more resilient
+    client like lftp.
+
+    If this does not work, check available mirrors outside nih.gov.
+    """
+    parsed = urlparse(url)
+    server, path = parsed.netloc, parsed.path
+    with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
+        print('retrieving [lftp] {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr)
+        lftp_command = """ set net:max-retries {}; set net:reconnect-interval-base {}; pget -c {} -o {}; exit """.format(max_retries, retry_delay, path, f.name)
+        cmd = ["lftp", "-u", "anonymous,anonymous", "-e", lftp_command, "ftp.ncbi.nlm.nih.gov"]
+        result = subprocess.run(cmd)
+        result.check_returncode()
+        return f.name
+
+
 def xmlstream(filename, tag, encoding='utf-8'):
     """
     Note: This might move into a generic place in the future.
```
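
A hedged usage sketch of the new flag (not part of the patch): it assumes `fatcat_tools` is on the `PYTHONPATH` and `lftp` is installed, and the update-file URL is a made-up example. The downloaded file is still gzip-compressed; the worker decompresses it into a second temporary file before parsing, as the unchanged context in the `PubmedFTPWorker` hunk shows.

```python
# Usage sketch (assumptions: fatcat_tools importable, lftp installed;
# the update-file URL is a hypothetical example).
import gzip
import shutil
import tempfile

from fatcat_tools.harvest.pubmed import ftpretr

url = "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/pubmed21n1234.xml.gz"  # hypothetical
filename = ftpretr(url, use_lftp=True)  # delegates to ftpretr_lftp()

# Mirror what PubmedFTPWorker.fetch_date() does: decompress into another temp file.
with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as decomp:
    with gzip.open(filename) as gzf:
        shutil.copyfileobj(gzf, decomp)

print("decompressed XML at", decomp.name)
```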
