summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/harvest
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r--python/fatcat_tools/harvest/doi_registrars.py19
1 files changed, 12 insertions, 7 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 1a6807d2..ed80cfc9 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -3,6 +3,7 @@ import re
import sys
import csv
import json
+import time
import requests
import itertools
import datetime
@@ -10,6 +11,11 @@ from pykafka import KafkaClient
from fatcat_tools.workers.worker_common import most_recent_message
+# Skip pylint due to:
+# AttributeError: 'NoneType' object has no attribute 'scope'
+# in 'astroid/node_classes.py'
+# pylint: skip-file
+
DATE_FMT = "%Y-%m-%d"
@@ -79,7 +85,7 @@ class HarvestCrossrefWorker:
date_str, date_str)
if self.is_update_filter is not None:
filter_param += ',is_update:{}'.format(bool(self.is_update_filter))
- params = {
+ return {
'filter': filter_param,
'rows': self.api_batch_size,
'cursor': '*',
@@ -93,7 +99,7 @@ class HarvestCrossrefWorker:
state_topic = self.kafka.topics[self.state_topic]
produce_topic = self.kafka.topics[self.produce_topic]
-
+
date_str = date.strftime(DATE_FMT)
params = self.params(date_str)
headers = {
@@ -103,12 +109,12 @@ class HarvestCrossrefWorker:
with produce_topic.get_producer() as producer:
while True:
http_resp = requests.get(self.api_host_url, params, headers=headers)
- if http_resp.status_code is 503:
+ if http_resp.status_code == 503:
# crud backoff
print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code))
time.sleep(30.0)
continue
- assert http_resp.status_code is 200
+ assert http_resp.status_code == 200
resp = http_resp.json()
items = self.extract_items(resp)
count += len(items)
@@ -135,7 +141,7 @@ class HarvestCrossrefWorker:
today_utc = datetime.datetime.utcnow().date()
if self.start_date is None:
self.start_date = self.get_latest_date()
- if self.start_date:
+ if self.start_date:
# if we are continuing, start day after last success
self.start_date = self.start_date + datetime.timedelta(days=1)
if self.start_date is None:
@@ -167,7 +173,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
"""
datacite has a REST API as well as OAI-PMH endpoint.
- have about 8 million
+ have about 8 million
bulk export notes: https://github.com/datacite/datacite/issues/188
@@ -206,4 +212,3 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
def update_params(self, params, resp):
params['page[number]'] = resp['meta']['page'] + 1
return params
-