summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-03-30 20:55:44 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2020-03-30 20:55:44 -0700
commit 98933a068ec3d918deb0e7dff30aed517ca515d9 (patch)
tree 6c9cfe687c34d6470d46752087a175818c5fdd1e
parent 48cc4c85008bb3fa2751c042112467c7e60a96aa (diff)
download fatcat-98933a068ec3d918deb0e7dff30aed517ca515d9.tar.gz
fatcat-98933a068ec3d918deb0e7dff30aed517ca515d9.zip
crossref: longer comment about crossref API date fields
-rw-r--r-- python/fatcat_tools/harvest/doi_registrars.py 24
1 files changed, 22 insertions, 2 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index d2d71d3c..66fc033c 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -16,12 +16,32 @@ from .harvest_common import HarvestState, requests_retry_session
class HarvestCrossrefWorker:
"""
- Notes on crossref API:
+ Crossref API date fields (and our interpretation)::
- - from-index-date is the updated time
+ - https://github.com/CrossRef/rest-api-doc#filter-names
+ - *-index-date: "metadata indexed" is the API/index record update time
+ - *-deposit-date: "metadata last (re)deposited" is the catalog record update time
+ - *-update-date: "Metadata updated (Currently the same as *-deposit-date)"
+ - *-created-date: "metadata first deposited"
+ - *-pub-date (etc): publisher-supplied, not "meta-meta-data"
https://api.crossref.org/works?filter=from-index-date:2018-11-14&rows=2
+ Also from the REST API:
+
+ Notes on incremental metadata updates
+
+ When using time filters to retrieve periodic, incremental metadata
+ updates, the from-index-date filter should be used over
+ from-update-date, from-deposit-date, from-created-date and
+ from-pub-date. The timestamp that from-index-date filters on is
+ guaranteed to be updated every time there is a change to metadata
+ requiring a reindex.
+
+ However, when Crossref re-indexes tens of millions of rows, using
+ from-index-date can be very slow, taking several days to process a single
+ day of updates.
+
I think the design is going to have to be a cronjob or long-running job
(with long sleeps) which publishes "success through" to a separate state
queue, as simple YYYY-MM-DD strings.