From 5bdfc1518adb545f15d3ac053e831a13cb96c971 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 4 Dec 2019 16:17:28 +0100 Subject: datacite: use v2 of the API (flaky) Update the request parameters for datacite API v2. Works fine, but there are occasional HTTP 400 responses when using the cursor API (daily updates can exceed the 10000 record limit for search queries). The HTTP 400 issue is not solved yet, but has been reported to datacite as https://github.com/datacite/datacite/issues/897. --- python/fatcat_tools/harvest/doi_registrars.py | 33 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'python/fatcat_tools/harvest') diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 13abb2e6..e24a979d 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -8,6 +8,7 @@ import itertools import datetime import requests from confluent_kafka import Producer, KafkaException +from urllib.parse import urlparse, parse_qs from fatcat_tools.workers import most_recent_message from .harvest_common import HarvestState, requests_retry_session @@ -179,7 +180,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): """ def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, - api_host_url="https://api.datacite.org/works", + api_host_url="https://api.datacite.org/dois", start_date=None, end_date=None): super().__init__(kafka_hosts=kafka_hosts, produce_topic=produce_topic, @@ -193,11 +194,13 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): self.name = "Datacite" def params(self, date_str): + """ + Dates have to be supplied in 2018-10-27T22:36:30.000Z format. 
+ """ return { - 'from-update-date': date_str, - 'until-update-date': date_str, + 'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.000Z]'.format(date_str, date_str), 'page[size]': self.api_batch_size, - 'page[number]': 1, + 'page[cursor]': 1, } def extract_items(self, resp): @@ -210,5 +213,25 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): return obj['attributes']['doi'].encode('utf-8') def update_params(self, params, resp): - params['page[number]'] = resp['meta']['page'] + 1 + """ + We need to parse out the cursor value from the next link. + + $ curl -sL https://is.gd/cLbE5h | jq -r .links.next + + https://api.datacite.org/dois?page%5Bcursor%5D=MTMxNjgwODE3NTAwMCwxMC41NDM5LzEwMjUxOTI&page%5Bsize%5D=50&query=updated%3A%5B2019-11-18T00%3A00%3A00.000Z+TO+2019-11-18T23%3A59%3A59.000Z%5D + + Notes. + + (1) HTTP 400 issues. + + Funny "search_after has 3 value(s) but sort has 2." on + https://api.datacite.org/dois?page%5Bsize%5D=50&page%5Bcursor%5D=MTQyMzQ2ODQwMTAwMCwxMC41Njc1L0hZV0FfMjAxNSwxXzI&query=updated%3A%5B2019-11-20T00%3A00%3A00.000Z+TO+2019-11-20T23%3A59%3A59.000Z%5D + + Reported as https://github.com/datacite/datacite/issues/897. 
+ """ + parsed = urlparse(resp['links']['next']) + page_cursor = parse_qs(parsed.query).get('page[cursor]') + if not page_cursor: + raise ValueError('no page[cursor] in .links.next') + params['page[cursor]'] = page_cursor[0] return params -- cgit v1.2.3 From 27d79252aa60379c3dc45b4d6072b21a9f82b8c1 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 6 Dec 2019 15:12:21 +0100 Subject: datacite: update documentation, add links to issues --- python/fatcat_tools/harvest/doi_registrars.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'python/fatcat_tools/harvest') diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index e24a979d..5af5395e 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -214,20 +214,15 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): def update_params(self, params, resp): """ - We need to parse out the cursor value from the next link. + Using cursor mechanism (https://support.datacite.org/docs/pagination#section-cursor). $ curl -sL https://is.gd/cLbE5h | jq -r .links.next - https://api.datacite.org/dois?page%5Bcursor%5D=MTMxNjgwODE3NTAwMCwxMC41NDM5LzEwMjUxOTI&page%5Bsize%5D=50&query=updated%3A%5B2019-11-18T00%3A00%3A00.000Z+TO+2019-11-18T23%3A59%3A59.000Z%5D + Example: https://is.gd/cLbE5h - Notes. - - (1) HTTP 400 issues. - - Funny "search_after has 3 value(s) but sort has 2." on - https://api.datacite.org/dois?page%5Bsize%5D=50&page%5Bcursor%5D=MTQyMzQ2ODQwMTAwMCwxMC41Njc1L0hZV0FfMjAxNSwxXzI&query=updated%3A%5B2019-11-20T00%3A00%3A00.000Z+TO+2019-11-20T23%3A59%3A59.000Z%5D - - Reported as https://github.com/datacite/datacite/issues/897. 
+ Further API errors reported: + https://github.com/datacite/datacite/issues/897 (HTTP 400) + https://github.com/datacite/datacite/issues/898 (HTTP 500) """ parsed = urlparse(resp['links']['next']) page_cursor = parse_qs(parsed.query).get('page[cursor]') -- cgit v1.2.3 From fd50b9492b5fdf3c94f11dea909d63b4b60866b2 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 6 Dec 2019 15:12:32 +0100 Subject: Datacite API v2 throws HTTP 400 errors, which we currently cannot recover from. As a first iteration, just mark the daily batch complete and continue. The occasional HTTP 400 issue has been reported as https://github.com/datacite/datacite/issues/897. A possible improvement would be to shrink the window, so losses will be smaller. --- python/fatcat_tools/harvest/doi_registrars.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'python/fatcat_tools/harvest') diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 5af5395e..19b32e18 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -122,6 +122,10 @@ class HarvestCrossrefWorker: self.producer.poll(0) time.sleep(30.0) continue + if http_resp.status_code == 400: + # https://is.gd/0nsEll, https://github.com/datacite/datacite/issues/897 + print("skipping batch for {}, due to HTTP 400. Marking complete. 
Related: https://git.io/JeylE".format(date_str)) + break http_resp.raise_for_status() resp = http_resp.json() items = self.extract_items(resp) -- cgit v1.2.3 From f1741a635766991f76b34684d9848ca5479a3418 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 9 Dec 2019 19:55:13 +0100 Subject: avoid usage of short links --- python/fatcat_tools/harvest/doi_registrars.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'python/fatcat_tools/harvest') diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 19b32e18..dce7a2bd 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -123,8 +123,8 @@ class HarvestCrossrefWorker: time.sleep(30.0) continue if http_resp.status_code == 400: - # https://is.gd/0nsEll, https://github.com/datacite/datacite/issues/897 - print("skipping batch for {}, due to HTTP 400. Marking complete. Related: https://git.io/JeylE".format(date_str)) + print("skipping batch for {}, due to HTTP 400. Marking complete. Related: https://github.com/datacite/datacite/issues/897".format(date_str), + file=sys.stderr) break http_resp.raise_for_status() resp = http_resp.json() -- cgit v1.2.3 From 0f86388578fd7b0475f4e621f13f65b600ae2655 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 9 Dec 2019 19:55:48 +0100 Subject: datacite: extend range search query The bracket syntax is inclusive. 
See also: https://www.elastic.co/guide/en/elasticsearch/reference/7.5/query-dsl-query-string-query.html#_ranges --- python/fatcat_tools/harvest/doi_registrars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/fatcat_tools/harvest') diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index dce7a2bd..33f44600 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -202,7 +202,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): Dates have to be supplied in 2018-10-27T22:36:30.000Z format. """ return { - 'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.000Z]'.format(date_str, date_str), + 'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]'.format(date_str, date_str), 'page[size]': self.api_batch_size, 'page[cursor]': 1, } -- cgit v1.2.3