diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2019-12-04 16:17:28 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-27 00:13:40 +0100 |
commit | 5bdfc1518adb545f15d3ac053e831a13cb96c971 (patch) | |
tree | 01fc0db28edb42627c45aa85ce70c1b15b812681 /python/fatcat_tools/harvest | |
parent | 035ac1480f3d3c69d771b7793e973d38ce1c561a (diff) | |
download | fatcat-5bdfc1518adb545f15d3ac053e831a13cb96c971.tar.gz fatcat-5bdfc1518adb545f15d3ac053e831a13cb96c971.zip |
datacite: use v2 of the API (flaky)
Update request parameters for datacite API v2. Works fine, but there are
occasional HTTP 400 responses when using the cursor API (which is needed
because daily updates can exceed the 10000 record limit for search queries).
The HTTP 400 issue is not solved yet, but has been reported to datacite as
https://github.com/datacite/datacite/issues/897.
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 33 |
1 files changed, 28 insertions, 5 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 13abb2e6..e24a979d 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -8,6 +8,7 @@ import itertools import datetime import requests from confluent_kafka import Producer, KafkaException +from urllib.parse import urlparse, parse_qs from fatcat_tools.workers import most_recent_message from .harvest_common import HarvestState, requests_retry_session @@ -179,7 +180,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): """ def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, - api_host_url="https://api.datacite.org/works", + api_host_url="https://api.datacite.org/dois", start_date=None, end_date=None): super().__init__(kafka_hosts=kafka_hosts, produce_topic=produce_topic, @@ -193,11 +194,13 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): self.name = "Datacite" def params(self, date_str): + """ + Dates have to be supplied in 2018-10-27T22:36:30.000Z format. + """ return { - 'from-update-date': date_str, - 'until-update-date': date_str, + 'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.000Z]'.format(date_str, date_str), 'page[size]': self.api_batch_size, - 'page[number]': 1, + 'page[cursor]': 1, } def extract_items(self, resp): @@ -210,5 +213,25 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): return obj['attributes']['doi'].encode('utf-8') def update_params(self, params, resp): - params['page[number]'] = resp['meta']['page'] + 1 + """ + We need to parse out the cursor value from the next link. + + $ curl -sL https://is.gd/cLbE5h | jq -r .links.next + + https://api.datacite.org/dois?page%5Bcursor%5D=MTMxNjgwODE3NTAwMCwxMC41NDM5LzEwMjUxOTI&page%5Bsize%5D=50&query=updated%3A%5B2019-11-18T00%3A00%3A00.000Z+TO+2019-11-18T23%3A59%3A59.000Z%5D + + Notes. + + (1) HTTP 400 issues. + + Funny "search_after has 3 value(s) but sort has 2." 
on + https://api.datacite.org/dois?page%5Bsize%5D=50&page%5Bcursor%5D=MTQyMzQ2ODQwMTAwMCwxMC41Njc1L0hZV0FfMjAxNSwxXzI&query=updated%3A%5B2019-11-20T00%3A00%3A00.000Z+TO+2019-11-20T23%3A59%3A59.000Z%5D + + Reported as https://github.com/datacite/datacite/issues/897. + """ + parsed = urlparse(resp['links']['next']) + page_cursor = parse_qs(parsed.query).get('page[cursor]') + if not page_cursor: + raise ValueError('no page[cursor] in .links.next') + params['page[cursor]'] = page_cursor[0] return params |