summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-05-21 20:24:12 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-05-21 20:24:12 -0700
commit: 383349341a09f9d05f58b6104064af7b0c89dcc1 (patch)
tree: 344797bb1fb1c43fb116013ba23a96de7c090389
parent: fee17cf6518e13b6f1c3945dd769aba56d7606d5 (diff)
download: fatcat-scholar-383349341a09f9d05f58b6104064af7b0c89dcc1.tar.gz
fatcat-scholar-383349341a09f9d05f58b6104064af7b0c89dcc1.zip
handle petabox read timeouts a bit
-rw-r--r-- fatcat_scholar/sim_pipeline.py 7
-rw-r--r-- fatcat_scholar/work_pipeline.py 7
2 files changed, 12 insertions, 2 deletions
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index c96681a..5657cb7 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -4,6 +4,7 @@ import io
import sys
import sqlite3
import argparse
+import requests
from pydantic import BaseModel, validator
from typing import List, Dict, Tuple, Optional, Any, Sequence
from fatcat_openapi_client import ReleaseEntity, FileEntity
@@ -113,7 +114,11 @@ class SimPipeline():
# TODO: more filters; also redundant with IssueDB code?
if row['issue_item'].endswith('_contents') or row['issue_item'].endswith('_index'):
continue
- full_issue = self.fetch_sim_issue(row)
+ try:
+ full_issue = self.fetch_sim_issue(row)
+ except requests.exceptions.ReadTimeout as e:
+ print(str(e), file=sys.stderr)
+ continue
if not full_issue:
continue
for leaf in full_issue['page_texts']:
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 363535c..f0c0e6f 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -3,6 +3,7 @@ import os
import io
import sys
import minio
+import requests
import argparse
from pydantic import BaseModel, validator
from typing import List, Dict, Tuple, Optional, Any, Sequence
@@ -255,7 +256,11 @@ class WorkPipeline():
if not sim_pub:
continue
# XXX: control flow tweak?
- sim_fulltext = self.fetch_sim(sim_issue, sim_pub, release.pages, release.ident)
+ try:
+ sim_fulltext = self.fetch_sim(sim_issue, sim_pub, release.pages, release.ident)
+ except requests.exceptions.ReadTimeout as e:
+ print(str(e), file=sys.stderr)
+ continue
if sim_fulltext:
break