aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-17 01:29:16 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-17 01:29:16 +0200
commit 6056dcc93fa8111a04d76a7af5bcddb12704cb96 (patch)
tree d21bb01bff07dc92122b57cc7d715c326a306688
parent d1067924b750d691f64da75b1f0a2e4d454dabc8 (diff)
download fatcat-6056dcc93fa8111a04d76a7af5bcddb12704cb96.tar.gz
fatcat-6056dcc93fa8111a04d76a7af5bcddb12704cb96.zip
pubmed: do not fail when accessing missing file
After a sync gap (e.g. 06/07 2021), the harvester tried to fetch a file that was no longer on the server. Do not fail in this case; the missing records will need to be backfilled via a full data dump.
-rw-r--r-- python/fatcat_tools/harvest/pubmed.py 10
1 files changed, 8 insertions, 2 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index f97702c0..98bb7f00 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -20,6 +20,7 @@ import sys
import tempfile
import time
import xml.etree.ElementTree as ET
+import zlib
from urllib.parse import urlparse
import dateparser
@@ -114,8 +115,13 @@ class PubmedFTPWorker:
url = "ftp://{}{}".format(self.host, path)
filename = ftpretr(url)
with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp:
- gzf = gzip.open(filename)
- shutil.copyfileobj(gzf, decomp)
+ try:
+ gzf = gzip.open(filename)
+ shutil.copyfileobj(gzf, decomp)
+ except zlib.error as exc:
+ print('[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)'.format(
+ url, exc), file=sys.stderr)
+ continue
# Here, blob is the unparsed XML; we peek into it to use PMID as
# message key. We need streaming, since some updates would consume