From f21d28315aa632cdb9f84ea8787762d1e27b4310 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Nov 2018 12:21:45 -0800 Subject: refactoring harvesters --- python/fatcat_tools/harvest/__init__.py | 2 +- python/fatcat_tools/harvest/crossrefish.py | 39 ----- python/fatcat_tools/harvest/datacite.py | 29 ---- python/fatcat_tools/harvest/doi_registrars.py | 209 ++++++++++++++++++++++++++ python/fatcat_tools/harvest/ingest_common.py | 127 ---------------- 5 files changed, 210 insertions(+), 196 deletions(-) delete mode 100644 python/fatcat_tools/harvest/crossrefish.py delete mode 100644 python/fatcat_tools/harvest/datacite.py create mode 100644 python/fatcat_tools/harvest/doi_registrars.py delete mode 100644 python/fatcat_tools/harvest/ingest_common.py (limited to 'python/fatcat_tools/harvest') diff --git a/python/fatcat_tools/harvest/__init__.py b/python/fatcat_tools/harvest/__init__.py index 85034f04..e1bde753 100644 --- a/python/fatcat_tools/harvest/__init__.py +++ b/python/fatcat_tools/harvest/__init__.py @@ -1,2 +1,2 @@ -from .crossrefish import HarvestCrossrefWorker +from .doi_registrars import HarvestCrossrefWorker, HarvestDataciteWorker diff --git a/python/fatcat_tools/harvest/crossrefish.py b/python/fatcat_tools/harvest/crossrefish.py deleted file mode 100644 index a88cedbd..00000000 --- a/python/fatcat_tools/harvest/crossrefish.py +++ /dev/null @@ -1,39 +0,0 @@ - -""" -Notes on crossref API: - -- from-index-date is the updated time -- is-update can be false, to catch only new or only old works - -https://api.crossref.org/works?filter=from-index-date:2018-11-14,is-update:false&rows=2 - -I think the design is going to have to be a cronjob or long-running job -(with long sleeps) which publishes "success through" to a separate state -queue, as simple YYYY-MM-DD strings. - -Within a day, will need to use a resumption token. Maybe should use a -crossref library... meh. - -will want to have some mechanism in kafka consumer (pushing to fatcat) to group -in batches as well. maybe even pass through as batches? or just use timeouts on -iteration. -""" - -from fatcat_tools.harvest.ingest_common import DoiApiHarvest - -class HarvestCrossrefWorker(DoiApiHarvest): - - def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, - api_host_url="https://api.crossref.org/works", - is_update_filter=None, - start_date=None, end_date=None): - super().__init__(kafka_hosts=kafka_hosts, - produce_topic=produce_topic, - state_topic=state_topic, - api_host_url=api_host_url, - contact_email=contact_email, - start_date=start_date, - end_date=end_date) - - self.is_update_filter = is_update_filter - diff --git a/python/fatcat_tools/harvest/datacite.py b/python/fatcat_tools/harvest/datacite.py deleted file mode 100644 index 12860810..00000000 --- a/python/fatcat_tools/harvest/datacite.py +++ /dev/null @@ -1,29 +0,0 @@ - -""" -datacite has a REST API as well as OAI-PMH endpoint. - -have about 8 million - -bulk export notes: https://github.com/datacite/datacite/issues/188 - -fundamentally, very similar to crossref. don't have a scrape... maybe -could/should use this script for that, and dump to JSON? -""" - -from fatcat_tools.harvest.ingest_common import DoiApiHarvest - -class HarvestDataciteWorker(DoiApiHarvest): - - def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, - api_host_url="https://api.datacite.org/works", - start_date=None, end_date=None): - super().__init__(kafka_hosts=kafka_hosts, - produce_topic=produce_topic, - state_topic=state_topic, - api_host_url=api_host_url, - contact_email=contact_email, - start_date=start_date, - end_date=end_date) - - self.update_filter_name = "update" - diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py new file mode 100644 index 00000000..1a6807d2 --- /dev/null +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -0,0 +1,209 @@ + +import re +import sys +import csv +import json +import requests +import itertools +import datetime +from pykafka import KafkaClient + +from fatcat_tools.workers.worker_common import most_recent_message + +DATE_FMT = "%Y-%m-%d" + + +class HarvestCrossrefWorker: + """ + Notes on crossref API: + + - from-index-date is the updated time + - is-update can be false, to catch only new or only old works + + https://api.crossref.org/works?filter=from-index-date:2018-11-14,is-update:false&rows=2 + + I think the design is going to have to be a cronjob or long-running job + (with long sleeps) which publishes "success through" to a separate state + queue, as simple YYYY-MM-DD strings. + + Within a day, will need to use a resumption token. Maybe should use a + crossref library... meh. + + will want to have some mechanism in kafka consumer (pushing to fatcat) to group + in batches as well. maybe even pass through as batches? or just use timeouts on + iteration. + + logic of this worker: + - on start, fetch latest date from state feed + - in a function (unit-testable), decide which dates to ingest + - for each date needing update: + - start a loop for just that date, using resumption token for this query + - when done, publish to state feed, with immediate sync + + TODO: what sort of parallelism? I guess multi-processing on dates, but need + to be careful how state is serialized back into kafka. + """ + + + def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, + api_host_url="https://api.crossref.org/works", start_date=None, + end_date=None, is_update_filter=None): + + self.api_host_url = api_host_url + self.produce_topic = produce_topic + self.state_topic = state_topic + self.contact_email = contact_email + self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0") + self.is_update_filter = is_update_filter + + # these are both optional, and should be datetime.date + self.start_date = start_date + self.end_date = end_date + + self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks + self.api_batch_size = 50 + # for crossref, it's "from-index-date" + self.name = "Crossref" + + def get_latest_date(self): + + state_topic = self.kafka.topics[self.state_topic] + latest = most_recent_message(state_topic) + if latest: + latest = datetime.datetime.strptime(latest.decode('utf-8'), DATE_FMT).date() + print("Latest date found: {}".format(latest)) + return latest + + def params(self, date_str): + filter_param = 'from-index-date:{},until-index-date:{}'.format( + date_str, date_str) + if self.is_update_filter is not None: + filter_param += ',is_update:{}'.format(bool(self.is_update_filter)) + params = { + 'filter': filter_param, + 'rows': self.api_batch_size, + 'cursor': '*', + } + + def update_params(self, params, resp): + params['cursor'] = resp['message']['next-cursor'] + return params + + def fetch_date(self, date): + + state_topic = self.kafka.topics[self.state_topic] + produce_topic = self.kafka.topics[self.produce_topic] + + date_str = date.strftime(DATE_FMT) + params = self.params(date_str) + headers = { + 'User-Agent': 'fatcat_tools/0.1.0 (https://fatcat.wiki; mailto:{}) python-requests'.format(self.contact_email), + } + count = 0 + with produce_topic.get_producer() as producer: + while True: + http_resp = requests.get(self.api_host_url, params, headers=headers) + if http_resp.status_code is 503: + # crud backoff + print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code)) + time.sleep(30.0) + continue + assert http_resp.status_code is 200 + resp = http_resp.json() + items = self.extract_items(resp) + count += len(items) + print("... got {} ({} of {}) in {}".format(len(items), count, + self.extract_total(resp), http_resp.elapsed)) + #print(json.dumps(resp)) + for work in items: + producer.produce(json.dumps(work).encode('utf-8')) + if len(items) < self.api_batch_size: + break + params = self.update_params(params, resp) + + # record our completion state + with state_topic.get_sync_producer() as producer: + producer.produce(date.strftime(DATE_FMT).encode('utf-8')) + + def extract_items(self, resp): + return resp['message']['items'] + + def extract_total(self, resp): + return resp['message']['total-results'] + + def run_once(self): + today_utc = datetime.datetime.utcnow().date() + if self.start_date is None: + self.start_date = self.get_latest_date() + if self.start_date: + # if we are continuing, start day after last success + self.start_date = self.start_date + datetime.timedelta(days=1) + if self.start_date is None: + # bootstrap to yesterday (don't want to start on today until it's over) + self.start_date = datetime.datetime.utcnow().date() + if self.end_date is None: + # bootstrap to yesterday (don't want to start on today until it's over) + self.end_date = today_utc - datetime.timedelta(days=1) + print("Harvesting from {} through {}".format(self.start_date, self.end_date)) + current = self.start_date + while current <= self.end_date: + print("Fetching DOIs updated on {} (UTC)".format(current)) + self.fetch_date(current) + current += datetime.timedelta(days=1) + print("{} DOI ingest caught up through {}".format(self.name, self.end_date)) + return self.end_date + + def run_loop(self): + while True: + last = self.run_once() + self.start_date = last + self.end_date = None + print("Sleeping {} seconds...".format(self.loop_sleep)) + time.sleep(self.loop_sleep()) + + + +class HarvestDataciteWorker(HarvestCrossrefWorker): + """ + datacite has a REST API as well as OAI-PMH endpoint. + + have about 8 million + + bulk export notes: https://github.com/datacite/datacite/issues/188 + + fundamentally, very similar to crossref. don't have a scrape... maybe + could/should use this script for that, and dump to JSON? + """ + + def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, + api_host_url="https://api.datacite.org/works", + start_date=None, end_date=None): + super().__init__(kafka_hosts=kafka_hosts, + produce_topic=produce_topic, + state_topic=state_topic, + api_host_url=api_host_url, + contact_email=contact_email, + start_date=start_date, + end_date=end_date) + + # for datecite, it's "from-update-date" + self.name = "Datacite" + + def params(self, date_str): + return { + 'from-update-date': date_str, + 'until-update-date': date_str, + 'page[size]': self.api_batch_size, + 'page[number]': 1, + } + + def extract_items(self, resp): + return resp['data'] + + def extract_total(self, resp): + return resp['meta']['total'] + + def update_params(self, params, resp): + params['page[number]'] = resp['meta']['page'] + 1 + return params + diff --git a/python/fatcat_tools/harvest/ingest_common.py b/python/fatcat_tools/harvest/ingest_common.py deleted file mode 100644 index 67ff3dc3..00000000 --- a/python/fatcat_tools/harvest/ingest_common.py +++ /dev/null @@ -1,127 +0,0 @@ - -""" -logic: -- on start, fetch latest date from state feed -- in a function (unit-testable), decide which dates to ingest -- for each date needing update: - - start a loop for just that date, using resumption token for this query - - when done, publish to state feed, with immediate sync -""" - -import re -import sys -import csv -import json -import requests -import itertools -import datetime -from pykafka import KafkaClient - -from fatcat_tools.workers.worker_common import most_recent_message - -DATE_FMT = "%Y-%m-%d" - -class DoiApiHarvest: - """ - This class supports core features for both the Crossref and Datacite REST - APIs for fetching updated metadata (the Datacite API seems to be moduled on - the Crossref API). - - Implementations must provide the push results function. - """ - - def __init__(self, kafka_hosts, produce_topic, state_topic, api_host_url, - contact_email, start_date=None, end_date=None): - self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks - self.api_batch_size = 50 - self.api_host_url = api_host_url - self.produce_topic = produce_topic - self.state_topic = state_topic - self.contact_email = contact_email - self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0") - self.is_update_filter = None - self.update_filter_name = "index" - - # these are both optional, and should be datetime.date - self.start_date = start_date - self.end_date = end_date - - def get_latest_date(self): - - state_topic = self.kafka.topics[self.state_topic] - latest = most_recent_message(state_topic) - if latest: - latest = datetime.datetime.strptime(latest.decode('utf-8'), DATE_FMT).date() - print("Latest date found: {}".format(latest)) - return latest - - def fetch_date(self, date): - - state_topic = self.kafka.topics[self.state_topic] - produce_topic = self.kafka.topics[self.produce_topic] - - date_str = date.strftime(DATE_FMT) - filter_param = 'from-{index}-date:{},until-{index}-date:{}'.format( - date_str, date_str, index=self.update_filter_name) - if self.is_update_filter is not None: - filter_param += ',is_update:{}'.format(bool(is_update)) - params = { - 'filter': filter_param, - 'rows': self.api_batch_size, - 'cursor': '*', - } - headers = { - 'User-Agent': 'fatcat_tools/0.1.0 (https://fatcat.wiki; mailto:{}) python-requests'.format(self.contact_email), - } - count = 0 - with produce_topic.get_producer() as producer: - while True: - http_resp = requests.get(self.api_host_url, params, headers=headers) - assert http_resp.status_code is 200 - resp = http_resp.json() - items = resp['message']['items'] - count += len(items) - print("... got {} ({} of {}) in {}".format(len(items), count, - resp['message']['total-results']), http_resp.elapsed) - #print(json.dumps(resp)) - for work in items: - producer.produce(json.dumps(work).encode('utf-8')) - if len(items) < params['rows']: - break - params['cursor'] = resp['message']['next-cursor'] - - # record our completion state - with state_topic.get_sync_producer() as producer: - producer.produce(date.strftime(DATE_FMT).encode('utf-8')) - - - def run_once(self): - today_utc = datetime.datetime.utcnow().date() - if self.start_date is None: - self.start_date = self.get_latest_date() - if self.start_date: - # if we are continuing, start day after last success - self.start_date = self.start_date + datetime.timedelta(days=1) - if self.start_date is None: - # bootstrap to yesterday (don't want to start on today until it's over) - self.start_date = datetime.datetime.utcnow().date() - if self.end_date is None: - # bootstrap to yesterday (don't want to start on today until it's over) - self.end_date = today_utc - datetime.timedelta(days=1) - print("Harvesting from {} through {}".format(self.start_date, self.end_date)) - current = self.start_date - while current <= self.end_date: - print("Fetching DOIs updated on {} (UTC)".format(current)) - self.fetch_date(current) - current += datetime.timedelta(days=1) - print("Crossref DOI ingest caught up through {}".format(self.end_date)) - return self.end_date - - def run_loop(self): - while True: - last = self.run_once() - self.start_date = last - self.end_date = None - print("Sleeping {} seconds...".format(self.loop_sleep)) - time.sleep(self.loop_sleep()) - -- cgit v1.2.3