1 files changed, 39 insertions, 0 deletions
diff --git a/python/fatcat_tools/harvest/crossrefish.py b/python/fatcat_tools/harvest/crossrefish.py
new file mode 100644
index 00000000..a88cedbd
--- /dev/null
+++ b/python/fatcat_tools/harvest/crossrefish.py
@@ -0,0 +1,39 @@
+
+"""
+Notes on crossref API:
+
+- from-index-date is the updated time
+- is-update can be false, to catch only new or only old works
+
+https://api.crossref.org/works?filter=from-index-date:2018-11-14,is-update:false&rows=2
+
+I think the design is going to have to be a cronjob or long-running job
+(with long sleeps) which publishes "success through" to a separate state
+queue, as simple YYYY-MM-DD strings.
+
+Within a day, will need to use a resumption token. Maybe should use a
+crossref library... meh.
+
+will want to have some mechanism in kafka consumer (pushing to fatcat) to group
+in batches as well. maybe even pass through as batches? or just use timeouts on
+iteration.
+"""
+
+from fatcat_tools.harvest.ingest_common import DoiApiHarvest
+
+class HarvestCrossrefWorker(DoiApiHarvest):
+
+    def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email,
+            api_host_url="https://api.crossref.org/works",
+            is_update_filter=None,
+            start_date=None, end_date=None):
+        super().__init__(kafka_hosts=kafka_hosts,
+                         produce_topic=produce_topic,
+                         state_topic=state_topic,
+                         api_host_url=api_host_url,
+                         contact_email=contact_email,
+                         start_date=start_date,
+                         end_date=end_date)
+
+        self.is_update_filter = is_update_filter
+