diff options
Diffstat (limited to 'python/fatcat_tools/harvest')
| -rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 19 | 
1 files changed, 12 insertions, 7 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 1a6807d2..ed80cfc9 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -3,6 +3,7 @@ import re  import sys  import csv  import json +import time  import requests  import itertools  import datetime @@ -10,6 +11,11 @@ from pykafka import KafkaClient  from fatcat_tools.workers.worker_common import most_recent_message +# Skip pylint due to: +#   AttributeError: 'NoneType' object has no attribute 'scope' +# in 'astroid/node_classes.py' +# pylint: skip-file +  DATE_FMT = "%Y-%m-%d" @@ -79,7 +85,7 @@ class HarvestCrossrefWorker:              date_str, date_str)          if self.is_update_filter is not None:              filter_param += ',is_update:{}'.format(bool(self.is_update_filter)) -        params = { +        return {              'filter': filter_param,              'rows': self.api_batch_size,              'cursor': '*', @@ -93,7 +99,7 @@ class HarvestCrossrefWorker:          state_topic = self.kafka.topics[self.state_topic]          produce_topic = self.kafka.topics[self.produce_topic] -  +          date_str = date.strftime(DATE_FMT)          params = self.params(date_str)          headers = { @@ -103,12 +109,12 @@ class HarvestCrossrefWorker:          with produce_topic.get_producer() as producer:              while True:                  http_resp = requests.get(self.api_host_url, params, headers=headers) -                if http_resp.status_code is 503: +                if http_resp.status_code == 503:                      # crud backoff                      print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code))                      time.sleep(30.0)                      continue -                assert http_resp.status_code is 200 +                assert http_resp.status_code == 200                  resp = http_resp.json()                  items = self.extract_items(resp)                  count += len(items) @@ -135,7 +141,7 @@ class HarvestCrossrefWorker:          today_utc = datetime.datetime.utcnow().date()          if self.start_date is None:              self.start_date = self.get_latest_date() -            if self.start_date:  +            if self.start_date:                  # if we are continuing, start day after last success                  self.start_date = self.start_date + datetime.timedelta(days=1)          if self.start_date is None: @@ -167,7 +173,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):      """      datacite has a REST API as well as OAI-PMH endpoint. -    have about 8 million  +    have about 8 million      bulk export notes: https://github.com/datacite/datacite/issues/188 @@ -206,4 +212,3 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):      def update_params(self, params, resp):          params['page[number]'] = resp['meta']['page'] + 1          return params -  | 
