diff options
author | bnewbold <bnewbold@archive.org> | 2021-07-17 01:04:36 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2021-07-17 01:04:36 +0000 |
commit | 16157db3c47e0663a9cfaa60482204f88126e8f7 (patch) | |
tree | 6c2297a701fcba416c74b2647823c4c95d2ba661 | |
parent | d1067924b750d691f64da75b1f0a2e4d454dabc8 (diff) | |
parent | 0202f5f9d0c508e2c4cc4af6a8b22bd624bcbd0b (diff) | |
download | fatcat-16157db3c47e0663a9cfaa60482204f88126e8f7.tar.gz fatcat-16157db3c47e0663a9cfaa60482204f88126e8f7.zip |
Merge branch 'martin-pubmed-fetch-gzip-error' into 'master'
pubmed: do not fail when accessing missing file
See merge request webgroup/fatcat!111
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 15 |
1 files changed, 11 insertions, 4 deletions
```diff
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index f97702c0..fd4af1f4 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -20,6 +20,7 @@ import sys
 import tempfile
 import time
 import xml.etree.ElementTree as ET
+import zlib
 from urllib.parse import urlparse
 
 import dateparser
@@ -94,8 +95,9 @@ class PubmedFTPWorker:
 
     def fetch_date(self, date):
         """
-        Fetch file for a given date and feed Kafka one article per message. If
-        the fetched XML does not contain a PMID, this method will fail.
+        Fetch file or files for a given date and feed Kafka one article per
+        message. If the fetched XML does not contain a PMID an exception is
+        raised.
 
         If no date file mapping is found, this will fail.
         """
@@ -114,8 +116,13 @@ class PubmedFTPWorker:
             url = "ftp://{}{}".format(self.host, path)
             filename = ftpretr(url)
             with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp:
-                gzf = gzip.open(filename)
-                shutil.copyfileobj(gzf, decomp)
+                try:
+                    gzf = gzip.open(filename)
+                    shutil.copyfileobj(gzf, decomp)
+                except zlib.error as exc:
+                    print('[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)'.format(
+                        url, exc), file=sys.stderr)
+                    continue
 
             # Here, blob is the unparsed XML; we peek into it to use PMID as
             # message key. We need streaming, since some updates would consume
```