about summary refs log tree commit diff stats
path: root/python/fatcat_tools/harvest
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-03-06 11:39:36 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-03-06 11:39:36 -0800
commit 5dc322c93eb3b92324c4f947697d2a2c69749040 (patch)
tree 5b70e74cd5436b24378ac1aec4854b2fc8161c80 /python/fatcat_tools/harvest
parent 10af3b5ab23b8df76b08ef1173f7547db3df4125 (diff)
download fatcat-5dc322c93eb3b92324c4f947697d2a2c69749040.tar.gz
fatcat-5dc322c93eb3b92324c4f947697d2a2c69749040.zip
retry/backoff for Crossref harvester
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r-- python/fatcat_tools/harvest/doi_registrars.py 4
-rw-r--r-- python/fatcat_tools/harvest/harvest_common.py 22
2 files changed, 24 insertions, 2 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 55d85ef9..1aaad56a 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -10,7 +10,7 @@ import requests
from pykafka import KafkaClient
from fatcat_tools.workers import most_recent_message
-from .harvest_common import HarvestState
+from .harvest_common import HarvestState, requests_retry_session
class HarvestCrossrefWorker:
@@ -93,7 +93,7 @@ class HarvestCrossrefWorker:
count = 0
with produce_topic.get_producer() as producer:
while True:
- http_resp = requests.get(self.api_host_url, params, headers=headers)
+ http_resp = requests_retry_session().get(self.api_host_url, params, headers=headers)
if http_resp.status_code == 503:
# crude backoff
print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code))
diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py
index f4d74be2..11fd5fe8 100644
--- a/python/fatcat_tools/harvest/harvest_common.py
+++ b/python/fatcat_tools/harvest/harvest_common.py
@@ -3,10 +3,32 @@ import sys
import json
import time
import datetime
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+
# Used for parsing ISO date format (YYYY-MM-DD)
DATE_FMT = "%Y-%m-%d"
+def requests_retry_session(retries=10, backoff_factor=3,
+ status_forcelist=(500, 502, 504), session=None):
+ """
+ Build a requests Session whose 'http://' and 'https://' adapters retry
+ failed requests using a urllib3 Retry policy.
+
+ retries: used as the total, read, and connect retry limits.
+ backoff_factor: passed through to Retry to space out successive attempts.
+ status_forcelist: HTTP status codes passed to Retry as statuses to retry on.
+ session: existing Session to configure; a new one is created when falsy.
+
+ Returns the configured session. Note that 503 is not in the default
+ status_forcelist.
+
+ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+ """
+ session = session or requests.Session()
+ retry = Retry(
+ total=retries,
+ read=retries,
+ connect=retries,
+ backoff_factor=backoff_factor,
+ status_forcelist=status_forcelist,
+ )
+ adapter = HTTPAdapter(max_retries=retry)
+ session.mount('http://', adapter)
+ session.mount('https://', adapter)
+ return session
+
class HarvestState:
"""
First version of this works with full days (dates)