Merge branch 'bnewbold-crossref-deposit' into 'master'

change crossref harvest date field

See merge request webgroup/fatcat!41
author: bnewbold <bnewbold@archive.org> 2020-04-01 18:13:49 +0000
committer: bnewbold <bnewbold@archive.org> 2020-04-01 18:13:49 +0000
commit: d25b813b2e6514196840a3225a8bb9a5d33a22bf (patch)
tree: b21d56c1029a9862f94fffffa627ee643d788767
parent: 48cc4c85008bb3fa2751c042112467c7e60a96aa (diff)
parent: 851c40143d44a73a92ff2c9556b3a63f29668c2d (diff)
download: fatcat-d25b813b2e6514196840a3225a8bb9a5d33a22bf.tar.gz
fatcat-d25b813b2e6514196840a3225a8bb9a5d33a22bf.zip
2 files changed, 24 insertions, 4 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index d2d71d3c..3acb7d96 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -16,12 +16,32 @@ from .harvest_common import HarvestState, requests_retry_session
 
 class HarvestCrossrefWorker:
     """
-    Notes on crossref API:
+    Crossref API date fields (and our interpretation)::
 
-    - from-index-date is the updated time
+    - https://github.com/CrossRef/rest-api-doc#filter-names
+    - *-index-date: "metadata indexed" is the API/index record update time
+    - *-deposit-date: "metadata last (re)deposited" is the catalog record update time
+    - *-update-date: "Metadata updated (Currently the same as *-deposit-date)"
+    - *-created-date: "metadata first deposited"
+    - *-pub-date (etc): publisher-supplied, not "meta-meta-data"
 
     https://api.crossref.org/works?filter=from-index-date:2018-11-14&rows=2
 
+    Also from the REST API:
+
+        Notes on incremental metadata updates
+
+        When using time filters to retrieve periodic, incremental metadata
+        updates, the from-index-date filter should be used over
+        from-update-date, from-deposit-date, from-created-date and
+        from-pub-date. The timestamp that from-index-date filters on is
+        guaranteed to be updated every time there is a change to metadata
+        requiring a reindex.
+
+    However, when Crossref re-indexes tens of millions of rows, using
+    from-index-date can be very slow, taking several days to process a single
+    day of updates, so this worker filters on from-update-date instead.
+
     I think the design is going to have to be a cronjob or long-running job
     (with long sleeps) which publishes "success through" to a separate state
     queue, as simple YYYY-MM-DD strings.
@@ -87,7 +107,7 @@ class HarvestCrossrefWorker:
         return Producer(producer_conf)
 
     def params(self, date_str):
-        filter_param = 'from-index-date:{},until-index-date:{}'.format(
+        filter_param = 'from-update-date:{},until-update-date:{}'.format(
             date_str, date_str)
         return {
             'filter': filter_param,
diff --git a/python/tests/harvest_crossref.py b/python/tests/harvest_crossref.py
index 52aa7b81..e902cda5 100644
--- a/python/tests/harvest_crossref.py
+++ b/python/tests/harvest_crossref.py
@@ -36,7 +36,7 @@ def test_crossref_harvest_date(mocker):
     assert "mailto:test@fatcat.wiki" in responses.calls[0].request.headers['User-Agent']
 
     # check that correct date param was passed as expected
-    assert "filter=from-index-date%3A2019-02-03" in responses.calls[0].request.url
+    assert "filter=from-update-date%3A2019-02-03" in responses.calls[0].request.url
 
     # check that we published the expected number of DOI objects were published
     # to the (mock) kafka topic
author	bnewbold <bnewbold@archive.org>	2020-04-01 18:13:49 +0000
committer	bnewbold <bnewbold@archive.org>	2020-04-01 18:13:49 +0000
commit	d25b813b2e6514196840a3225a8bb9a5d33a22bf (patch)
tree	b21d56c1029a9862f94fffffa627ee643d788767
parent	48cc4c85008bb3fa2751c042112467c7e60a96aa (diff)
parent	851c40143d44a73a92ff2c9556b3a63f29668c2d (diff)
download	fatcat-d25b813b2e6514196840a3225a8bb9a5d33a22bf.tar.gz fatcat-d25b813b2e6514196840a3225a8bb9a5d33a22bf.zip