diff options
| -rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 4 |
| -rw-r--r-- | python/fatcat_tools/harvest/harvest_common.py | 22 |
2 files changed, 24 insertions, 2 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 55d85ef9..1aaad56a 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -10,7 +10,7 @@ import requests
 from pykafka import KafkaClient
 
 from fatcat_tools.workers import most_recent_message
-from .harvest_common import HarvestState
+from .harvest_common import HarvestState, requests_retry_session
 
 
 class HarvestCrossrefWorker:
@@ -93,7 +93,7 @@ class HarvestCrossrefWorker:
         count = 0
         with produce_topic.get_producer() as producer:
             while True:
-                http_resp = requests.get(self.api_host_url, params, headers=headers)
+                http_resp = requests_retry_session().get(self.api_host_url, params, headers=headers)
                 if http_resp.status_code == 503:
                     # crude backoff
                     print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code))
diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py
index f4d74be2..11fd5fe8 100644
--- a/python/fatcat_tools/harvest/harvest_common.py
+++ b/python/fatcat_tools/harvest/harvest_common.py
@@ -3,10 +3,32 @@ import sys
 import json
 import time
 import datetime
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+
 
 # Used for parsing ISO date format (YYYY-MM-DD)
 DATE_FMT = "%Y-%m-%d"
 
+def requests_retry_session(retries=10, backoff_factor=3,
+        status_forcelist=(500, 502, 504), session=None):
+    """
+    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+    """
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
+
 class HarvestState:
     """
     First version of this works with full days (dates)
