From dd8cdc88f71e6a395ab5b10d84d6443f70e39048 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Nov 2021 11:49:33 -0700 Subject: crossref grobid refs: another error case (ReadTimeout) With this last exception handled, was able to get through millions of rows of references, with only a few dozen errors (mostly invalid XML). --- python/sandcrawler/persist.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'python/sandcrawler/persist.py') diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 2d15dbf..6847e2e 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -715,7 +715,11 @@ class PersistCrossrefWorker(SandcrawlerWorker): try: parsed_refs = self.grobid_client.crossref_refs(record) refs_batch.append(parsed_refs) - except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError): + except ( + xml.etree.ElementTree.ParseError, + requests.exceptions.HTTPError, + requests.exceptions.ReadTimeout, + ): print("GROBID crossref refs parsing error, skipping with a sleep") time.sleep(3) pass -- cgit v1.2.3