Diffstat (limited to 'python/fatcat_tools/harvest/crossrefish.py')
-rw-r--r--  python/fatcat_tools/harvest/crossrefish.py | 39
1 file changed, 39 insertions(+), 0 deletions(-)
diff --git a/python/fatcat_tools/harvest/crossrefish.py b/python/fatcat_tools/harvest/crossrefish.py
new file mode 100644
index 00000000..a88cedbd
--- /dev/null
+++ b/python/fatcat_tools/harvest/crossrefish.py
@@ -0,0 +1,39 @@
+
+"""
+Notes on the Crossref API:
+
+- from-index-date filters on when a work's metadata was last indexed (effectively the update time)
+- is-update can be set to false, to catch only newly-registered works (excluding updates)
+
+https://api.crossref.org/works?filter=from-index-date:2018-11-14,is-update:false&rows=2
+
+I think the design is going to have to be a cronjob or long-running job
+(with long sleeps) which publishes a "success through" marker to a separate
+state queue, as simple YYYY-MM-DD strings.
+
+Within a single day, paging requires a resumption token ("cursor" in REST API
+terms; see the sketch after the diff). Maybe use a crossref library... meh.
+
+We will also want some mechanism in the kafka consumer (pushing to fatcat) to
+group records into batches. Maybe even pass them through as batches? Or just
+use timeouts on iteration (see the consumer sketch after the diff).
+"""
+
+from fatcat_tools.harvest.ingest_common import DoiApiHarvest
+
+class HarvestCrossrefWorker(DoiApiHarvest):
+
+ def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email,
+ api_host_url="https://api.crossref.org/works",
+ is_update_filter=None,
+ start_date=None, end_date=None):
+ super().__init__(kafka_hosts=kafka_hosts,
+ produce_topic=produce_topic,
+ state_topic=state_topic,
+ api_host_url=api_host_url,
+ contact_email=contact_email,
+ start_date=start_date,
+ end_date=end_date)
+
+        self.is_update_filter = is_update_filter  # e.g. False to harvest only newly-registered works
+
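
For reference, a minimal sketch of what the within-day cursor paging mentioned
in the docstring could look like. It assumes the requests library; the filter
names, the mailto politeness parameter, and the next-cursor response field
come from the public Crossref REST API, while the function name and its
arguments are made up for illustration:

    # Sketch only: cursor-based deep paging over one day of index updates.
    import requests

    def fetch_updated(date_str, contact_email, rows=100):
        params = {
            # everything (re-)indexed on a single day; an is-update:false
            # clause could be appended to catch only newly-registered works
            "filter": "from-index-date:{},until-index-date:{}".format(date_str, date_str),
            "rows": rows,
            "cursor": "*",  # "*" starts a new deep-paging session
            "mailto": contact_email,  # polite-pool identification
        }
        while True:
            resp = requests.get("https://api.crossref.org/works", params=params)
            resp.raise_for_status()
            message = resp.json()["message"]
            items = message["items"]
            if not items:
                break
            for item in items:
                yield item
            # each response hands back the cursor for the next page
            params["cursor"] = message["next-cursor"]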
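
And a sketch of the timeout-on-iteration idea for the consumer side, assuming
the confluent_kafka client; the topic name, group.id, and the push_batch()
helper are all hypothetical:

    # Sketch only: group kafka messages into batches using a poll timeout.
    from confluent_kafka import Consumer

    def push_batch(msgs):
        # hypothetical stand-in for pushing a batch of works to fatcat
        print("would push {} works".format(len(msgs)))

    consumer = Consumer({
        "bootstrap.servers": "localhost:9092",
        "group.id": "fatcat-crossref-import",
        "auto.offset.reset": "earliest",
    })
    consumer.subscribe(["crossref-works"])

    while True:
        # wait up to 5 seconds for up to 100 messages; a slow trickle still
        # gets flushed as a (smaller) batch when the timeout expires
        batch = consumer.consume(num_messages=100, timeout=5.0)
        msgs = [m for m in batch if m.error() is None]
        if msgs:
            push_batch(msgs)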