diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 11:49:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 17:19:52 -0700 |
commit | dd8cdc88f71e6a395ab5b10d84d6443f70e39048 (patch) | |
tree | c97e096d81c49d8ac1f4853b565b5a527960a0f3 | |
parent | 34b3415433c65dfb41746a3a335e7217c7d1144e (diff) | |
download | sandcrawler-dd8cdc88f71e6a395ab5b10d84d6443f70e39048.tar.gz sandcrawler-dd8cdc88f71e6a395ab5b10d84d6443f70e39048.zip |
crossref grobid refs: another error case (ReadTimeout)
With this last exception handled, was able to get through millions of
rows of references, with only a few dozen errors (mostly invalid XML).
-rw-r--r-- | python/sandcrawler/grobid.py | 10 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 6 |
2 files changed, 11 insertions, 5 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index b9dd196..a6858ff 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -311,7 +311,7 @@ class CrossrefRefsWorker(SandcrawlerWorker): return self.grobid_client.crossref_refs(record) except xml.etree.ElementTree.ParseError: print( - f" GROBID returned bad XML for Crossref DOI: {record.get('DOI')}", + f"GROBID returned bad XML for Crossref DOI: {record.get('DOI')}", file=sys.stderr, ) # but add a small slow-down so we don't churn through these if @@ -319,9 +319,11 @@ class CrossrefRefsWorker(SandcrawlerWorker): time.sleep(3) return None except requests.exceptions.HTTPError: - print(f" GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr) - # but add a small slow-down so we don't churn through these if - # GROBID is just misconfigured or something + print(f"GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr) + time.sleep(3) + return None + except requests.exceptions.ReadTimeout: + print(f"GROBID HTTP timeout for Crossref DOI: {record.get('DOI')}", file=sys.stderr) time.sleep(3) return None diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 2d15dbf..6847e2e 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -715,7 +715,11 @@ class PersistCrossrefWorker(SandcrawlerWorker): try: parsed_refs = self.grobid_client.crossref_refs(record) refs_batch.append(parsed_refs) - except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError): + except ( + xml.etree.ElementTree.ParseError, + requests.exceptions.HTTPError, + requests.exceptions.ReadTimeout, + ): print("GROBID crossref refs parsing error, skipping with a sleep") time.sleep(3) pass |