From 383349341a09f9d05f58b6104064af7b0c89dcc1 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 21 May 2020 20:24:12 -0700
Subject: handle petabox read timeouts a bit

---
 fatcat_scholar/sim_pipeline.py  | 7 ++++++-
 fatcat_scholar/work_pipeline.py | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'fatcat_scholar')

diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index c96681a..5657cb7 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -4,6 +4,7 @@ import io
 import sys
 import sqlite3
 import argparse
+import requests
 from pydantic import BaseModel, validator
 from typing import List, Dict, Tuple, Optional, Any, Sequence
 from fatcat_openapi_client import ReleaseEntity, FileEntity
@@ -113,7 +114,11 @@ class SimPipeline():
             # TODO: more filters; also redundant with IssueDB code?
             if row['issue_item'].endswith('_contents') or row['issue_item'].endswith('_index'):
                 continue
-            full_issue = self.fetch_sim_issue(row)
+            try:
+                full_issue = self.fetch_sim_issue(row)
+            except requests.exceptions.ReadTimeout as e:
+                print(str(e), file=sys.stderr)
+                continue
             if not full_issue:
                 continue
             for leaf in full_issue['page_texts']:
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 363535c..f0c0e6f 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -3,6 +3,7 @@ import os
 import io
 import sys
 import minio
+import requests
 import argparse
 from pydantic import BaseModel, validator
 from typing import List, Dict, Tuple, Optional, Any, Sequence
@@ -255,7 +256,11 @@ class WorkPipeline():
                 if not sim_pub:
                     continue
                 # XXX: control flow tweak?
-                sim_fulltext = self.fetch_sim(sim_issue, sim_pub, release.pages, release.ident)
+                try:
+                    sim_fulltext = self.fetch_sim(sim_issue, sim_pub, release.pages, release.ident)
+                except requests.exceptions.ReadTimeout as e:
+                    print(str(e), file=sys.stderr)
+                    continue
                 if sim_fulltext:
                     break
 
-- 
cgit v1.2.3