diff options
| author | bnewbold <bnewbold@archive.org> | 2020-04-01 18:13:49 +0000 | 
|---|---|---|
| committer | bnewbold <bnewbold@archive.org> | 2020-04-01 18:13:49 +0000 | 
| commit | d25b813b2e6514196840a3225a8bb9a5d33a22bf (patch) | |
| tree | b21d56c1029a9862f94fffffa627ee643d788767 /python/fatcat_tools/harvest | |
| parent | 48cc4c85008bb3fa2751c042112467c7e60a96aa (diff) | |
| parent | 851c40143d44a73a92ff2c9556b3a63f29668c2d (diff) | |
| download | fatcat-d25b813b2e6514196840a3225a8bb9a5d33a22bf.tar.gz fatcat-d25b813b2e6514196840a3225a8bb9a5d33a22bf.zip | |
Merge branch 'bnewbold-crossref-deposit' into 'master'
change crossref harvest date field
See merge request webgroup/fatcat!41
Diffstat (limited to 'python/fatcat_tools/harvest')
| -rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 26 | 
1 file changed, 23 insertions, 3 deletions
| diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index d2d71d3c..3acb7d96 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -16,12 +16,32 @@ from .harvest_common import HarvestState, requests_retry_session  class HarvestCrossrefWorker:      """ -    Notes on crossref API: +    Crossref API date fields (and our interpretation):: -    - from-index-date is the updated time +    - https://github.com/CrossRef/rest-api-doc#filter-names +    - *-index-date: "metadata indexed" is the API/index record update time +    - *-deposit-date: "metadata last (re)deposited" is the catalog record update time +    - *-update-date: "Metadata updated (Currently the same as *-deposit-date)" +    - *-created-date: "metadata first deposited" +    - *-pub-date (etc): publisher-supplied, not "meta-meta-data"      https://api.crossref.org/works?filter=from-index-date:2018-11-14&rows=2 +    Also from the REST API: + +        Notes on incremental metadata updates + +        When using time filters to retrieve periodic, incremental metadata +        updates, the from-index-date filter should be used over +        from-update-date, from-deposit-date, from-created-date and +        from-pub-date. The timestamp that from-index-date filters on is +        guaranteed to be updated every time there is a change to metadata +        requiring a reindex. + +    However, when Crossref re-indexes tens of millions of rows, using +    from-index-date can be very slow, taking several days to process a single +    day of updates. +      I think the design is going to have to be a cronjob or long-running job      (with long sleeps) which publishes "success through" to a separate state      queue, as simple YYYY-MM-DD strings. 
@@ -87,7 +107,7 @@ class HarvestCrossrefWorker:          return Producer(producer_conf)      def params(self, date_str): -        filter_param = 'from-index-date:{},until-index-date:{}'.format( +        filter_param = 'from-update-date:{},until-update-date:{}'.format(              date_str, date_str)          return {              'filter': filter_param, | 
