diff options
author | Martin Czygan <martin@archive.org> | 2019-12-26 23:52:39 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2019-12-26 23:52:39 +0000 |
commit | ad39a2a347b356fe9bb7dd96d79e2fdbf4844b9c (patch) | |
tree | cecab3300e67be677694cfc87bdccd6871babe6c /python/fatcat_tools/harvest | |
parent | 035ac1480f3d3c69d771b7793e973d38ce1c561a (diff) | |
parent | 6afc3e8d0e01cba781928dbc2bcb53d3c3ff71fd (diff) | |
download | fatcat-ad39a2a347b356fe9bb7dd96d79e2fdbf4844b9c.tar.gz fatcat-ad39a2a347b356fe9bb7dd96d79e2fdbf4844b9c.zip |
Merge branch 'martin-datacite-daily-harvest' into 'master'
Datacite daily harvest
See merge request webgroup/fatcat!6
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 32 |
1 files changed, 27 insertions, 5 deletions
```diff
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 13abb2e6..33f44600 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -8,6 +8,7 @@ import itertools
 import datetime
 import requests
 from confluent_kafka import Producer, KafkaException
+from urllib.parse import urlparse, parse_qs
 
 from fatcat_tools.workers import most_recent_message
 from .harvest_common import HarvestState, requests_retry_session
@@ -121,6 +122,10 @@ class HarvestCrossrefWorker:
                     self.producer.poll(0)
                     time.sleep(30.0)
                     continue
+                if http_resp.status_code == 400:
+                    print("skipping batch for {}, due to HTTP 400. Marking complete. Related: https://github.com/datacite/datacite/issues/897".format(date_str),
+                        file=sys.stderr)
+                    break
                 http_resp.raise_for_status()
                 resp = http_resp.json()
                 items = self.extract_items(resp)
@@ -179,7 +184,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
     """
 
     def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email,
-            api_host_url="https://api.datacite.org/works",
+            api_host_url="https://api.datacite.org/dois",
             start_date=None, end_date=None):
         super().__init__(kafka_hosts=kafka_hosts,
                          produce_topic=produce_topic,
@@ -193,11 +198,13 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
         self.name = "Datacite"
 
     def params(self, date_str):
+        """
+        Dates have to be supplied in 2018-10-27T22:36:30.000Z format.
+        """
         return {
-            'from-update-date': date_str,
-            'until-update-date': date_str,
+            'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]'.format(date_str, date_str),
             'page[size]': self.api_batch_size,
-            'page[number]': 1,
+            'page[cursor]': 1,
         }
 
     def extract_items(self, resp):
@@ -210,5 +217,20 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
         return obj['attributes']['doi'].encode('utf-8')
 
     def update_params(self, params, resp):
-        params['page[number]'] = resp['meta']['page'] + 1
+        """
+        Using cursor mechanism (https://support.datacite.org/docs/pagination#section-cursor).
+
+        $ curl -sL https://is.gd/cLbE5h | jq -r .links.next
+
+        Example: https://is.gd/cLbE5h
+
+        Further API errors reported:
+        https://github.com/datacite/datacite/issues/897 (HTTP 400)
+        https://github.com/datacite/datacite/issues/898 (HTTP 500)
+        """
+        parsed = urlparse(resp['links']['next'])
+        page_cursor = parse_qs(parsed.query).get('page[cursor]')
+        if not page_cursor:
+            raise ValueError('no page[cursor] in .links.next')
+        params['page[cursor]'] = page_cursor[0]
         return params
```