aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: bnewbold <bnewbold@archive.org> 2021-07-17 01:04:36 +0000
committer: bnewbold <bnewbold@archive.org> 2021-07-17 01:04:36 +0000
commit: 16157db3c47e0663a9cfaa60482204f88126e8f7 (patch)
tree: 6c2297a701fcba416c74b2647823c4c95d2ba661
parent: d1067924b750d691f64da75b1f0a2e4d454dabc8 (diff)
parent: 0202f5f9d0c508e2c4cc4af6a8b22bd624bcbd0b (diff)
download: fatcat-16157db3c47e0663a9cfaa60482204f88126e8f7.tar.gz
download: fatcat-16157db3c47e0663a9cfaa60482204f88126e8f7.zip
Merge branch 'martin-pubmed-fetch-gzip-error' into 'master'
pubmed: do not fail when accessing missing file

See merge request webgroup/fatcat!111
-rw-r--r-- python/fatcat_tools/harvest/pubmed.py 15
1 files changed, 11 insertions, 4 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index f97702c0..fd4af1f4 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -20,6 +20,7 @@ import sys
import tempfile
import time
import xml.etree.ElementTree as ET
+import zlib
from urllib.parse import urlparse
import dateparser
@@ -94,8 +95,9 @@ class PubmedFTPWorker:
def fetch_date(self, date):
"""
- Fetch file for a given date and feed Kafka one article per message. If
- the fetched XML does not contain a PMID, this method will fail.
+ Fetch file or files for a given date and feed Kafka one article per
+ message. If the fetched XML does not contain a PMID an exception is
+ raised.
If no date file mapping is found, this will fail.
"""
@@ -114,8 +116,13 @@ class PubmedFTPWorker:
url = "ftp://{}{}".format(self.host, path)
filename = ftpretr(url)
with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp:
- gzf = gzip.open(filename)
- shutil.copyfileobj(gzf, decomp)
+ try:
+ gzf = gzip.open(filename)
+ shutil.copyfileobj(gzf, decomp)
+ except zlib.error as exc:
+ print('[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)'.format(
+ url, exc), file=sys.stderr)
+ continue
# Here, blob is the unparsed XML; we peek into it to use PMID as
# message key. We need streaming, since some updates would consume