summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/harvest
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2019-12-04 16:17:28 +0100
committerMartin Czygan <martin.czygan@gmail.com>2019-12-27 00:13:40 +0100
commit5bdfc1518adb545f15d3ac053e831a13cb96c971 (patch)
tree01fc0db28edb42627c45aa85ce70c1b15b812681 /python/fatcat_tools/harvest
parent035ac1480f3d3c69d771b7793e973d38ce1c561a (diff)
downloadfatcat-5bdfc1518adb545f15d3ac053e831a13cb96c971.tar.gz
fatcat-5bdfc1518adb545f15d3ac053e831a13cb96c971.zip
datacite: use v2 of the API (flaky)
Update parameter update for datacite API v2. Works fine, but there are occasional HTTP 400 responses when using the cursor API (daily updates can exceed the 10000 record limit for search queries). The HTTP 400 issue is not solved yet, but reported to datacite as https://github.com/datacite/datacite/issues/897.
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r--python/fatcat_tools/harvest/doi_registrars.py33
1 files changed, 28 insertions, 5 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 13abb2e6..e24a979d 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -8,6 +8,7 @@ import itertools
import datetime
import requests
from confluent_kafka import Producer, KafkaException
+from urllib.parse import urlparse, parse_qs
from fatcat_tools.workers import most_recent_message
from .harvest_common import HarvestState, requests_retry_session
@@ -179,7 +180,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
"""
def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email,
- api_host_url="https://api.datacite.org/works",
+ api_host_url="https://api.datacite.org/dois",
start_date=None, end_date=None):
super().__init__(kafka_hosts=kafka_hosts,
produce_topic=produce_topic,
@@ -193,11 +194,13 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
self.name = "Datacite"
def params(self, date_str):
+ """
+ Dates have to be supplied in 2018-10-27T22:36:30.000Z format.
+ """
return {
- 'from-update-date': date_str,
- 'until-update-date': date_str,
+ 'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.000Z]'.format(date_str, date_str),
'page[size]': self.api_batch_size,
- 'page[number]': 1,
+ 'page[cursor]': 1,
}
def extract_items(self, resp):
@@ -210,5 +213,25 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
return obj['attributes']['doi'].encode('utf-8')
def update_params(self, params, resp):
- params['page[number]'] = resp['meta']['page'] + 1
+ """
+ We need to parse out the cursor value from the next link.
+
+ $ curl -sL https://is.gd/cLbE5h | jq -r .links.next
+
+ https://api.datacite.org/dois?page%5Bcursor%5D=MTMxNjgwODE3NTAwMCwxMC41NDM5LzEwMjUxOTI&page%5Bsize%5D=50&query=updated%3A%5B2019-11-18T00%3A00%3A00.000Z+TO+2019-11-18T23%3A59%3A59.000Z%5D
+
+ Notes.
+
+ (1) HTTP 400 issues.
+
+ Funny "search_after has 3 value(s) but sort has 2." on
+ https://api.datacite.org/dois?page%5Bsize%5D=50&page%5Bcursor%5D=MTQyMzQ2ODQwMTAwMCwxMC41Njc1L0hZV0FfMjAxNSwxXzI&query=updated%3A%5B2019-11-20T00%3A00%3A00.000Z+TO+2019-11-20T23%3A59%3A59.000Z%5D
+
+ Reported as https://github.com/datacite/datacite/issues/897.
+ """
+ parsed = urlparse(resp['links']['next'])
+ page_cursor = parse_qs(parsed.query).get('page[cursor]')
+ if not page_cursor:
+ raise ValueError('no page[cursor] in .links.next')
+ params['page[cursor]'] = page_cursor[0]
return params