aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-11-03 18:57:59 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-11-04 17:19:52 -0700
commit b4ceb130504cacbb75549e46719159f4e5ab5c51 (patch)
tree fca88f1af4f0c15410029e86074afff252ebdf34 /python/sandcrawler/persist.py
parent 1f078fe94a5cf5322527b97dcdf0cb054e0c7540 (diff)
download sandcrawler-b4ceb130504cacbb75549e46719159f4e5ab5c51.tar.gz
sandcrawler-b4ceb130504cacbb75549e46719159f4e5ab5c51.zip
grobid crossref refs: try to handle HTTP 5xx and XML parse errors
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 10
1 files changed, 9 insertions, 1 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 13b1232..2d15dbf 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -19,10 +19,12 @@ grobid
"""
import os
+import time
import xml.etree.ElementTree
from typing import Any, Dict, List, Optional
import psycopg2
+import requests
from sandcrawler.db import SandcrawlerPostgresClient
from sandcrawler.grobid import GrobidClient
@@ -710,7 +712,13 @@ class PersistCrossrefWorker(SandcrawlerWorker):
)
)
if self.parse_refs:
- refs_batch.append(self.grobid_client.crossref_refs(record))
+ try:
+ parsed_refs = self.grobid_client.crossref_refs(record)
+ refs_batch.append(parsed_refs)
+ except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError):
+ print("GROBID crossref refs parsing error, skipping with a sleep")
+ time.sleep(3)
+ pass
resp = self.db.insert_crossref(self.cur, crossref_batch)
if len(crossref_batch) < len(batch):