about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/grobid.py | 28
-rw-r--r-- python/sandcrawler/persist.py | 10
2 files changed, 33 insertions, 5 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 7d7f6b5..8ed6d7e 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,5 +1,7 @@
import html
import sys
+import time
+import xml.etree.ElementTree
from typing import Any, Dict, List, Optional
import requests
@@ -242,9 +244,9 @@ class GrobidClient(object):
)
unstructured_refs = unstructured_refs[:2000]
- refs = self.process_citation_list(
- [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
- )
+ clean_refs = [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
+ refs = self.process_citation_list(clean_refs)
+
assert len(refs) == len(unstructured_refs)
refs_json = []
for i in range(len(refs)):
@@ -302,7 +304,25 @@ class CrossrefRefsWorker(SandcrawlerWorker):
self.sink = sink
def process(self, record: Any, key: Optional[str] = None) -> Any:
- return self.grobid_client.crossref_refs(record)
+ # handle the rare case of bad TEI-XML response
+ # eg: https://github.com/kermitt2/grobid/issues/848
+ try:
+ return self.grobid_client.crossref_refs(record)
+ except xml.etree.ElementTree.ParseError:
+ print(
+ f" GROBID returned bad XML for Crossref DOI: {record.get('DOI')}",
+ file=sys.stderr,
+ )
+ # but add a small slow-down so we don't churn through these if
+ # GROBID is just misconfigured or something
+ time.sleep(3)
+ return None
+ except requests.exceptions.HTTPError:
+ print(f" GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ # but add a small slow-down so we don't churn through these if
+ # GROBID is just misconfigured or something
+ time.sleep(3)
+ return None
class GrobidBlobWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 13b1232..2d15dbf 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -19,10 +19,12 @@ grobid
"""
import os
+import time
import xml.etree.ElementTree
from typing import Any, Dict, List, Optional
import psycopg2
+import requests
from sandcrawler.db import SandcrawlerPostgresClient
from sandcrawler.grobid import GrobidClient
@@ -710,7 +712,13 @@ class PersistCrossrefWorker(SandcrawlerWorker):
)
)
if self.parse_refs:
- refs_batch.append(self.grobid_client.crossref_refs(record))
+ try:
+ parsed_refs = self.grobid_client.crossref_refs(record)
+ refs_batch.append(parsed_refs)
+ except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError):
+ print("GROBID crossref refs parsing error, skipping with a sleep")
+ time.sleep(3)
+ pass
resp = self.db.insert_crossref(self.cur, crossref_batch)
if len(crossref_batch) < len(batch):