diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-03 18:57:59 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 17:19:52 -0700 |
commit | b4ceb130504cacbb75549e46719159f4e5ab5c51 (patch) | |
tree | fca88f1af4f0c15410029e86074afff252ebdf34 /python/sandcrawler/persist.py | |
parent | 1f078fe94a5cf5322527b97dcdf0cb054e0c7540 (diff) | |
download | sandcrawler-b4ceb130504cacbb75549e46719159f4e5ab5c51.tar.gz sandcrawler-b4ceb130504cacbb75549e46719159f4e5ab5c51.zip |
grobid crossref refs: try to handle HTTP 5xx and XML parse errors
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- | python/sandcrawler/persist.py | 10 |
1 files changed, 9 insertions, 1 deletions
```diff
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 13b1232..2d15dbf 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -19,10 +19,12 @@ grobid
 """
 
 import os
+import time
 import xml.etree.ElementTree
 from typing import Any, Dict, List, Optional
 
 import psycopg2
+import requests
 
 from sandcrawler.db import SandcrawlerPostgresClient
 from sandcrawler.grobid import GrobidClient
@@ -710,7 +712,13 @@ class PersistCrossrefWorker(SandcrawlerWorker):
                 )
             )
             if self.parse_refs:
-                refs_batch.append(self.grobid_client.crossref_refs(record))
+                try:
+                    parsed_refs = self.grobid_client.crossref_refs(record)
+                    refs_batch.append(parsed_refs)
+                except (xml.etree.ElementTree.ParseError, requests.exceptions.HTTPError):
+                    print("GROBID crossref refs parsing error, skipping with a sleep")
+                    time.sleep(3)
+                    pass
 
         resp = self.db.insert_crossref(self.cur, crossref_batch)
         if len(crossref_batch) < len(batch):
```