aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/harvest/doi_registrars.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-11-19 23:04:18 -0800
committerBryan Newbold <bnewbold@robocracy.org>2018-11-19 23:04:18 -0800
commite590eec544ab6f2e54e8770f01e64eef3158fdaa (patch)
tree5f1fe36a489e7e42642d96a3a719dcbd74d60901 /python/fatcat_tools/harvest/doi_registrars.py
parent65bdebea35f2ab3c9c8b0f8a8b0a9a577a36bee2 (diff)
downloadfatcat-e590eec544ab6f2e54e8770f01e64eef3158fdaa.tar.gz
fatcat-e590eec544ab6f2e54e8770f01e64eef3158fdaa.zip
initial OAI-PMH harvesters
Diffstat (limited to 'python/fatcat_tools/harvest/doi_registrars.py')
-rw-r--r--python/fatcat_tools/harvest/doi_registrars.py13
1 files changed, 8 insertions, 5 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index d5e4b7ec..10492c17 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -10,15 +10,13 @@ import datetime
from pykafka import KafkaClient
from fatcat_tools.workers import most_recent_message
-from .harvest_common import HarvestState
+from .harvest_common import HarvestState, DATE_FMT
# Skip pylint due to:
# AttributeError: 'NoneType' object has no attribute 'scope'
# in 'astroid/node_classes.py'
# pylint: skip-file
-DATE_FMT = "%Y-%m-%d"
-
class HarvestCrossrefWorker:
"""
@@ -68,7 +66,6 @@ class HarvestCrossrefWorker:
self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks
self.api_batch_size = 50
- # for crossref, it's "from-index-date"
self.name = "Crossref"
def params(self, date_str):
@@ -86,6 +83,9 @@ class HarvestCrossrefWorker:
params['cursor'] = resp['message']['next-cursor']
return params
+ def extract_key(self, obj):
+ return obj['DOI'].encode('utf-8')
+
def fetch_date(self, date):
produce_topic = self.kafka.topics[self.produce_topic]
@@ -112,7 +112,7 @@ class HarvestCrossrefWorker:
self.extract_total(resp), http_resp.elapsed))
#print(json.dumps(resp))
for work in items:
- producer.produce(json.dumps(work).encode('utf-8'))
+ producer.produce(json.dumps(work).encode('utf-8'), partition_key=self.extract_key(work))
if len(items) < self.api_batch_size:
break
params = self.update_params(params, resp)
@@ -181,6 +181,9 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
def extract_total(self, resp):
return resp['meta']['total']
+ def extract_key(self, obj):
+ return obj['doi'].encode('utf-8')
+
def update_params(self, params, resp):
params['page[number]'] = resp['meta']['page'] + 1
return params