diff options
Diffstat (limited to 'python/fatcat_tools/harvest/crossrefish.py')
-rw-r--r-- | python/fatcat_tools/harvest/crossrefish.py | 39 |
1 files changed, 39 insertions, 0 deletions
diff --git a/python/fatcat_tools/harvest/crossrefish.py b/python/fatcat_tools/harvest/crossrefish.py new file mode 100644 index 00000000..a88cedbd --- /dev/null +++ b/python/fatcat_tools/harvest/crossrefish.py @@ -0,0 +1,39 @@ + +""" +Notes on crossref API: + +- from-index-date is the updated time +- is-update can be false, to catch only new or only old works + +https://api.crossref.org/works?filter=from-index-date:2018-11-14,is-update:false&rows=2 + +I think the design is going to have to be a cronjob or long-running job +(with long sleeps) which publishes "success through" to a separate state +queue, as simple YYYY-MM-DD strings. + +Within a day, will need to use a resumption token. Maybe should use a +crossref library... meh. + +will want to have some mechanism in kafka consumer (pushing to fatcat) to group +in batches as well. maybe even pass through as batches? or just use timeouts on +iteration. +""" + +from fatcat_tools.harvest.ingest_common import DoiApiHarvest + +class HarvestCrossrefWorker(DoiApiHarvest): + + def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, + api_host_url="https://api.crossref.org/works", + is_update_filter=None, + start_date=None, end_date=None): + super().__init__(kafka_hosts=kafka_hosts, + produce_topic=produce_topic, + state_topic=state_topic, + api_host_url=api_host_url, + contact_email=contact_email, + start_date=start_date, + end_date=end_date) + + self.is_update_filter = is_update_filter + |