author     Bryan Newbold <bnewbold@archive.org>   2021-11-03 18:57:59 -0700
committer  Bryan Newbold <bnewbold@archive.org>   2021-11-04 17:19:52 -0700
commit     b4ceb130504cacbb75549e46719159f4e5ab5c51 (patch)
tree       fca88f1af4f0c15410029e86074afff252ebdf34 /python
parent     1f078fe94a5cf5322527b97dcdf0cb054e0c7540 (diff)
grobid crossref refs: try to handle HTTP 5xx and XML parse errors
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/grobid.py  | 28 ++++++++++++++++++++++++++++----
-rw-r--r--  python/sandcrawler/persist.py | 10 +++++++++-
 2 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 7d7f6b5..8ed6d7e 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,5 +1,7 @@
 import html
 import sys
+import time
+import xml.etree.ElementTree
 from typing import Any, Dict, List, Optional
 
 import requests
@@ -242,9 +244,9 @@ class GrobidClient(object):
             )
             unstructured_refs = unstructured_refs[:2000]
 
-        refs = self.process_citation_list(
-            [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
-        )
+        clean_refs = [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
+        refs = self.process_citation_list(clean_refs)
+        assert len(refs) == len(unstructured_refs)
 
         refs_json = []
         for i in range(len(refs)):
@@ -302,7 +304,25 @@ class CrossrefRefsWorker(SandcrawlerWorker):
         self.sink = sink
 
     def process(self, record: Any, key: Optional[str] = None) -> Any:
-        return self.grobid_client.crossref_refs(record)
+        # handle the rare case of bad TEI-XML response
+        # eg: https://github.com/kermitt2/grobid/issues/848
+        try:
+            return self.grobid_client.crossref_refs(record)
+        except xml.etree.ElementTree.ParseError:
+            print(
+                f" GROBID returned bad XML for Crossref DOI: {record.get('DOI')}",
+                file=sys.stderr,
+            )
+            # but add a small slow-down so we don't churn through these if
+            # GROBID is just misconfigured or something
+            time.sleep(3)
+            return None
+        except requests.exceptions.HTTPError:
+            print(f" GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+            # but add a small slow-down so we don't churn through these if
+            # GROBID is just misconfigured or something
+            time.sleep(3)
+            return None
 
 
 class GrobidBlobWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 13b1232..2d15dbf 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -19,10 +19,12 @@ grobid
 """
 
 import os
+import time
 import xml.etree.ElementTree
 from typing import Any, Dict, List, Optional
 
 import psycopg2
+import requests
 
 from sandcrawler.db import SandcrawlerPostgresClient
 from sandcrawler.grobid import GrobidClient
@@ -710,7 +712,13 @@ class PersistCrossrefWorker(SandcrawlerWorker):
                 )
             )
             if self.parse_refs:
-                refs_batch.append(self.grobid_client.crossref_refs(record))
+                try:
+                    parsed_refs = self.grobid_client.crossref_refs(record)
+                    refs_batch.append(parsed_refs)
+                except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError):
+                    print("GROBID crossref refs parsing error, skipping with a sleep")
+                    time.sleep(3)
+                    pass
 
         resp = self.db.insert_crossref(self.cur, crossref_batch)
         if len(crossref_batch) < len(batch):
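
Both files apply the same recovery pattern: treat a malformed TEI-XML response or an HTTP error from GROBID as a per-record soft failure, log it, sleep a few seconds so a misconfigured or overloaded GROBID instance doesn't cause rapid churn, and skip the record rather than crash the worker. A minimal standalone sketch of that pattern, assuming a generic fetch callable in place of GrobidClient.crossref_refs() (the wrapper name and its parameters are illustrative, not part of this commit):

import sys
import time
import xml.etree.ElementTree
from typing import Any, Callable, Dict, Optional

import requests


def call_grobid_with_slowdown(
    fetch: Callable[[Dict[str, Any]], Any],
    record: Dict[str, Any],
    pause: float = 3.0,
) -> Optional[Any]:
    """Wrap a GROBID call with the catch/log/sleep/skip pattern.

    `fetch` stands in for GrobidClient.crossref_refs(). It may raise
    xml.etree.ElementTree.ParseError (malformed TEI-XML from GROBID, see
    https://github.com/kermitt2/grobid/issues/848) or
    requests.exceptions.HTTPError (eg, an HTTP 5xx response).
    """
    try:
        return fetch(record)
    except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError) as e:
        print(f"GROBID error for Crossref DOI {record.get('DOI')}: {e}", file=sys.stderr)
        # small slow-down so we don't churn through failing records if
        # GROBID is just misconfigured or overloaded
        time.sleep(pause)
        return None

A worker's process() method could then delegate to the wrapper, eg call_grobid_with_slowdown(self.grobid_client.crossref_refs, record), instead of duplicating the try/except in each caller.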