aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-11-04 11:49:33 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-11-04 17:19:52 -0700
commit dd8cdc88f71e6a395ab5b10d84d6443f70e39048 (patch)
tree c97e096d81c49d8ac1f4853b565b5a527960a0f3
parent 34b3415433c65dfb41746a3a335e7217c7d1144e (diff)
download sandcrawler-dd8cdc88f71e6a395ab5b10d84d6443f70e39048.tar.gz
sandcrawler-dd8cdc88f71e6a395ab5b10d84d6443f70e39048.zip
crossref grobid refs: another error case (ReadTimeout)
With this last exception handled, I was able to get through millions of rows of references, with only a few dozen errors (mostly invalid XML).
-rw-r--r-- python/sandcrawler/grobid.py 10
-rw-r--r-- python/sandcrawler/persist.py 6
2 files changed, 11 insertions, 5 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index b9dd196..a6858ff 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -311,7 +311,7 @@ class CrossrefRefsWorker(SandcrawlerWorker):
return self.grobid_client.crossref_refs(record)
except xml.etree.ElementTree.ParseError:
print(
- f" GROBID returned bad XML for Crossref DOI: {record.get('DOI')}",
+ f"GROBID returned bad XML for Crossref DOI: {record.get('DOI')}",
file=sys.stderr,
)
# but add a small slow-down so we don't churn through these if
@@ -319,9 +319,11 @@ class CrossrefRefsWorker(SandcrawlerWorker):
time.sleep(3)
return None
except requests.exceptions.HTTPError:
- print(f" GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
- # but add a small slow-down so we don't churn through these if
- # GROBID is just misconfigured or something
+ print(f"GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ time.sleep(3)
+ return None
+ except requests.exceptions.ReadTimeout:
+ print(f"GROBID HTTP timeout for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
time.sleep(3)
return None
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 2d15dbf..6847e2e 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -715,7 +715,11 @@ class PersistCrossrefWorker(SandcrawlerWorker):
try:
parsed_refs = self.grobid_client.crossref_refs(record)
refs_batch.append(parsed_refs)
- except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError):
+ except (
+ xml.etree.ElementTree.ParseError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.ReadTimeout,
+ ):
print("GROBID crossref refs parsing error, skipping with a sleep")
time.sleep(3)
pass