diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-17 01:29:16 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-17 01:29:16 +0200 |
commit | 6056dcc93fa8111a04d76a7af5bcddb12704cb96 (patch) | |
tree | d21bb01bff07dc92122b57cc7d715c326a306688 | |
parent | d1067924b750d691f64da75b1f0a2e4d454dabc8 (diff) | |
download | fatcat-6056dcc93fa8111a04d76a7af5bcddb12704cb96.tar.gz fatcat-6056dcc93fa8111a04d76a7af5bcddb12704cb96.zip |
pubmed: do not fail when accessing missing file
After a sync gap (e.g. 06/07 2021) the harvester tried to fetch a file
that was no longer on the server; do not fail in this case.
We'll need to backfill the missing records via a full data dump.
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 10 |
1 files changed, 8 insertions, 2 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index f97702c0..98bb7f00 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -20,6 +20,7 @@ import sys import tempfile import time import xml.etree.ElementTree as ET +import zlib from urllib.parse import urlparse import dateparser @@ -114,8 +115,13 @@ class PubmedFTPWorker: url = "ftp://{}{}".format(self.host, path) filename = ftpretr(url) with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp: - gzf = gzip.open(filename) - shutil.copyfileobj(gzf, decomp) + try: + gzf = gzip.open(filename) + shutil.copyfileobj(gzf, decomp) + except zlib.error as exc: + print('[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)'.format( + url, exc), file=sys.stderr) + continue # Here, blob is the unparsed XML; we peek into it to use PMID as # message key. We need streaming, since some updates would consume |