diff options
author | Martin Czygan <martin@archive.org> | 2019-12-09 19:41:09 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2019-12-09 19:41:09 +0000 |
commit | d6bc26046bbbe7bd76f4740b12170b1b73e6d264 (patch) | |
tree | 6a9a7803bd955740b88e82d5687d29111d2ffa6e /python/fatcat_tools/harvest | |
parent | e5a1738b67c098ad61257c4b872ecdb3f6ad74a8 (diff) | |
parent | 7cba221ba4876bd7c011c6a46dc86c4494218366 (diff) | |
download | fatcat-d6bc26046bbbe7bd76f4740b12170b1b73e6d264.tar.gz fatcat-d6bc26046bbbe7bd76f4740b12170b1b73e6d264.zip |
Merge branch 'bnewbold-crossref-harvest-test' into 'master'
Basic mocked test for crossref harvester
See merge request webgroup/fatcat!7
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 47 |
1 files changed, 26 insertions, 21 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 2df13283..13abb2e6 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -63,6 +63,27 @@ class HarvestCrossrefWorker: self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks self.api_batch_size = 50 self.name = "Crossref" + self.producer = self._kafka_producer() + + def _kafka_producer(self): + + def fail_fast(err, msg): + if err is not None: + print("Kafka producer delivery error: {}".format(err)) + print("Bailing out...") + # TODO: should it be sys.exit(-1)? + raise KafkaException(err) + + self._kafka_fail_fast = fail_fast + + producer_conf = self.kafka_config.copy() + producer_conf.update({ + 'delivery.report.only.error': True, + 'default.topic.config': { + 'request.required.acks': -1, # all brokers must confirm + }, + }) + return Producer(producer_conf) def params(self, date_str): filter_param = 'from-index-date:{},until-index-date:{}'.format( @@ -82,22 +103,6 @@ class HarvestCrossrefWorker: def fetch_date(self, date): - def fail_fast(err, msg): - if err is not None: - print("Kafka producer delivery error: {}".format(err)) - print("Bailing out...") - # TODO: should it be sys.exit(-1)? - raise KafkaException(err) - - producer_conf = self.kafka_config.copy() - producer_conf.update({ - 'delivery.report.only.error': True, - 'default.topic.config': { - 'request.required.acks': -1, # all brokers must confirm - }, - }) - producer = Producer(producer_conf) - date_str = date.isoformat() params = self.params(date_str) http_session = requests_retry_session() @@ -113,7 +118,7 @@ class HarvestCrossrefWorker: # backoff, but allows for longer backoff/downtime on remote end print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code)) # keep kafka producer connection alive - producer.poll(0) + self.producer.poll(0) time.sleep(30.0) continue http_resp.raise_for_status() @@ -124,16 +129,16 @@ class HarvestCrossrefWorker: self.extract_total(resp), http_resp.elapsed)) #print(json.dumps(resp)) for work in items: - producer.produce( + self.producer.produce( self.produce_topic, json.dumps(work).encode('utf-8'), key=self.extract_key(work), - on_delivery=fail_fast) - producer.poll(0) + on_delivery=self._kafka_fail_fast) + self.producer.poll(0) if len(items) < self.api_batch_size: break params = self.update_params(params, resp) - producer.flush() + self.producer.flush() def extract_items(self, resp): return resp['message']['items'] |