From 0d9d71038b8a77baaeb7e9118d5b191b60eed7cc Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 3 Dec 2019 16:30:40 -0800
Subject: crossref is_update isn't what I thought

I thought this would filter for metadata updates to an existing DOI, but
actually "updates" are a type of DOI (eg, a retraction).

TODO: handle 'updates' field. Should both do a lookup and set work_ident
appropriately, and store in crossref-specific metadata.
---
 python/fatcat_tools/harvest/doi_registrars.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 7e791745..2df13283 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -18,9 +18,8 @@ class HarvestCrossrefWorker:
     Notes on crossref API:
 
     - from-index-date is the updated time
-    - is-update can be false, to catch only new or only old works
 
-    https://api.crossref.org/works?filter=from-index-date:2018-11-14,is-update:false&rows=2
+    https://api.crossref.org/works?filter=from-index-date:2018-11-14&rows=2
 
     I think the design is going to have to be a cronjob or long-running job
     (with long sleeps) which publishes "success through" to a separate state
@@ -47,13 +46,12 @@ class HarvestCrossrefWorker:
 
     def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email,
             api_host_url="https://api.crossref.org/works", start_date=None,
-            end_date=None, is_update_filter=None):
+            end_date=None):
 
         self.api_host_url = api_host_url
         self.produce_topic = produce_topic
         self.state_topic = state_topic
         self.contact_email = contact_email
-        self.is_update_filter = is_update_filter
         self.kafka_config = {
             'bootstrap.servers': kafka_hosts,
             'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes
@@ -69,8 +67,6 @@ class HarvestCrossrefWorker:
     def params(self, date_str):
         filter_param = 'from-index-date:{},until-index-date:{}'.format(
             date_str, date_str)
-        if self.is_update_filter is not None:
-            filter_param += ',is_update:{}'.format(bool(self.is_update_filter))
         return {
             'filter': filter_param,
             'rows': self.api_batch_size,
--
cgit v1.2.3