diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 10 | 
1 file changed, 8 insertions, 2 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index f97702c0..98bb7f00 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -20,6 +20,7 @@ import sys
 import tempfile
 import time
 import xml.etree.ElementTree as ET
+import zlib
 from urllib.parse import urlparse
 
 import dateparser
@@ -114,8 +115,13 @@ class PubmedFTPWorker:
             url = "ftp://{}{}".format(self.host, path)
             filename = ftpretr(url)
             with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp:
-                gzf = gzip.open(filename)
-                shutil.copyfileobj(gzf, decomp)
+                try:
+                    gzf = gzip.open(filename)
+                    shutil.copyfileobj(gzf, decomp)
+                except zlib.error as exc:
+                    print('[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)'.format(
+                        url, exc), file=sys.stderr)
+                    continue
 
             # Here, blob is the unparsed XML; we peek into it to use PMID as
             # message key. We need streaming, since some updates would consume
