From 98933a068ec3d918deb0e7dff30aed517ca515d9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 30 Mar 2020 20:55:44 -0700 Subject: crossref: longer comment about crossref API date fields --- python/fatcat_tools/harvest/doi_registrars.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index d2d71d3c..66fc033c 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -16,12 +16,32 @@ from .harvest_common import HarvestState, requests_retry_session class HarvestCrossrefWorker: """ - Notes on crossref API: + Crossref API date fields (and our interpretation):: - - from-index-date is the updated time + - https://github.com/CrossRef/rest-api-doc#filter-names + - *-index-date: "metadata indexed" is the API/index record update time + - *-deposit-date: "metadata last (re)deposited" is the catalog record update time + - *-update-date: "Metadata updated (Currently the same as *-deposit-date)" + - *-created-date: "metadata first deposited" + - *-pub-date (etc): publisher-supplied, not "meta-meta-data" https://api.crossref.org/works?filter=from-index-date:2018-11-14&rows=2 + Also from the REST API: + + Notes on incremental metadata updates + + When using time filters to retrieve periodic, incremental metadata + updates, the from-index-date filter should be used over + from-update-date, from-deposit-date, from-created-date and + from-pub-date. The timestamp that from-index-date filters on is + guaranteed to be updated every time there is a change to metadata + requiring a reindex. + + However, when Crossref re-indexes tens of millions of rows, using + from-index-date can be very slow, taking several days to process a single + day of updates. + I think the design is going to have to be a cronjob or long-running job (with long sleeps) which publishes "success through" to a separate state queue, as simple YYYY-MM-DD strings. -- cgit v1.2.3 From 851c40143d44a73a92ff2c9556b3a63f29668c2d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 30 Mar 2020 20:56:04 -0700 Subject: crossref: switch from index-date to update-date This goes against what the API docs recommend, but we are currently far behind on updates and need to catch up. Other than what the docs say, this seems to be consistent with the behavior we want. --- python/fatcat_tools/harvest/doi_registrars.py | 2 +- python/tests/harvest_crossref.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 66fc033c..3acb7d96 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -107,7 +107,7 @@ class HarvestCrossrefWorker: return Producer(producer_conf) def params(self, date_str): - filter_param = 'from-index-date:{},until-index-date:{}'.format( + filter_param = 'from-update-date:{},until-update-date:{}'.format( date_str, date_str) return { 'filter': filter_param, diff --git a/python/tests/harvest_crossref.py b/python/tests/harvest_crossref.py index 52aa7b81..e902cda5 100644 --- a/python/tests/harvest_crossref.py +++ b/python/tests/harvest_crossref.py @@ -36,7 +36,7 @@ def test_crossref_harvest_date(mocker): assert "mailto:test@fatcat.wiki" in responses.calls[0].request.headers['User-Agent'] # check that correct date param was passed as expected - assert "filter=from-index-date%3A2019-02-03" in responses.calls[0].request.url + assert "filter=from-update-date%3A2019-02-03" in responses.calls[0].request.url # check that we published the expected number of DOI objects were published # to the (mock) kafka topic -- cgit v1.2.3