author     Bryan Newbold <bnewbold@archive.org>   2021-11-03 18:57:59 -0700
committer  Bryan Newbold <bnewbold@archive.org>   2021-11-04 17:19:52 -0700
commit     b4ceb130504cacbb75549e46719159f4e5ab5c51 (patch)
tree       fca88f1af4f0c15410029e86074afff252ebdf34 /python
parent     1f078fe94a5cf5322527b97dcdf0cb054e0c7540 (diff)
grobid crossref refs: try to handle HTTP 5xx and XML parse errors
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/grobid.py  | 28 ++++++++++++++++++++++++++++----
-rw-r--r--  python/sandcrawler/persist.py | 10 +++++++++-
 2 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 7d7f6b5..8ed6d7e 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,5 +1,7 @@
 import html
 import sys
+import time
+import xml.etree.ElementTree
 from typing import Any, Dict, List, Optional
 
 import requests
@@ -242,9 +244,9 @@ class GrobidClient(object):
             )
             unstructured_refs = unstructured_refs[:2000]
 
-        refs = self.process_citation_list(
-            [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
-        )
+        clean_refs = [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
+        refs = self.process_citation_list(clean_refs)
+        assert len(refs) == len(unstructured_refs)
 
         refs_json = []
         for i in range(len(refs)):
@@ -302,7 +304,25 @@ class CrossrefRefsWorker(SandcrawlerWorker):
         self.sink = sink
 
     def process(self, record: Any, key: Optional[str] = None) -> Any:
-        return self.grobid_client.crossref_refs(record)
+        # handle the rare case of bad TEI-XML response
+        # eg: https://github.com/kermitt2/grobid/issues/848
+        try:
+            return self.grobid_client.crossref_refs(record)
+        except xml.etree.ElementTree.ParseError:
+            print(
+                f" GROBID returned bad XML for Crossref DOI: {record.get('DOI')}",
+                file=sys.stderr,
+            )
+            # but add a small slow-down so we don't churn through these if
+            # GROBID is just misconfigured or something
+            time.sleep(3)
+            return None
+        except requests.exceptions.HTTPError:
+            print(f" GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+            # but add a small slow-down so we don't churn through these if
+            # GROBID is just misconfigured or something
+            time.sleep(3)
+            return None
 
 
 class GrobidBlobWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 13b1232..2d15dbf 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -19,10 +19,12 @@ grobid
 """
 
 import os
+import time
 import xml.etree.ElementTree
 from typing import Any, Dict, List, Optional
 
 import psycopg2
+import requests
 
 from sandcrawler.db import SandcrawlerPostgresClient
 from sandcrawler.grobid import GrobidClient
@@ -710,7 +712,13 @@ class PersistCrossrefWorker(SandcrawlerWorker):
                 )
             )
             if self.parse_refs:
-                refs_batch.append(self.grobid_client.crossref_refs(record))
+                try:
+                    parsed_refs = self.grobid_client.crossref_refs(record)
+                    refs_batch.append(parsed_refs)
+                except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError):
+                    print("GROBID crossref refs parsing error, skipping with a sleep")
+                    time.sleep(3)
+                    pass
 
         resp = self.db.insert_crossref(self.cur, crossref_batch)
         if len(crossref_batch) < len(batch):
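
Both files apply the same recovery pattern: treat a malformed TEI-XML response or an HTTP error from GROBID as a per-record soft failure, log it, sleep a few seconds so a misconfigured or overloaded GROBID instance doesn't cause rapid churn, and skip the record rather than crash the worker. A minimal standalone sketch of that pattern, assuming a generic fetch callable in place of GrobidClient.crossref_refs() (the wrapper name and its parameters are illustrative, not part of this commit):

import sys
import time
import xml.etree.ElementTree
from typing import Any, Callable, Dict, Optional

import requests


def call_grobid_with_slowdown(
    fetch: Callable[[Dict[str, Any]], Any],
    record: Dict[str, Any],
    pause: float = 3.0,
) -> Optional[Any]:
    """Wrap a GROBID call with the catch/log/sleep/skip pattern.

    `fetch` stands in for GrobidClient.crossref_refs(). It may raise
    xml.etree.ElementTree.ParseError (malformed TEI-XML from GROBID, see
    https://github.com/kermitt2/grobid/issues/848) or
    requests.exceptions.HTTPError (eg, an HTTP 5xx response).
    """
    try:
        return fetch(record)
    except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError) as e:
        print(f"GROBID error for Crossref DOI {record.get('DOI')}: {e}", file=sys.stderr)
        # small slow-down so we don't churn through failing records if
        # GROBID is just misconfigured or overloaded
        time.sleep(pause)
        return None

A worker's process() method could then delegate to the wrapper, eg call_grobid_with_slowdown(self.grobid_client.crossref_refs, record), instead of duplicating the try/except in each caller.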