diff options
author | bnewbold <bnewbold@archive.org> | 2020-10-20 23:32:26 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2020-10-20 23:32:26 +0000 |
commit | b6b30bf5468d01a878e3c2ecac933f29b5b6ddc5 (patch) | |
tree | aede638925dabfac40bbd1c77607e4b31e3a234c /python/fatcat_tools | |
parent | a7765fb8b6011c224dc1e4d7b0e30159bae7a903 (diff) | |
parent | 99c54764f0cbbb05409001b4af83182842f4e52d (diff) | |
download | fatcat-b6b30bf5468d01a878e3c2ecac933f29b5b6ddc5.tar.gz fatcat-b6b30bf5468d01a878e3c2ecac933f29b5b6ddc5.zip |
Merge branch 'bnewbold-scholar-pipeline' into 'master'
entity updater: new work update feed (ident and changelog metadata only)
See merge request webgroup/fatcat!87
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 26 |
1 files changed, 24 insertions, 2 deletions
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 13e46e02..d9f0cf91 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -79,7 +79,8 @@ class EntityUpdatesWorker(FatcatWorker): """ def __init__(self, api, kafka_hosts, consume_topic, release_topic, - file_topic, container_topic, ingest_file_request_topic, poll_interval=5.0): + file_topic, container_topic, ingest_file_request_topic, + work_ident_topic, poll_interval=5.0): super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic, api=api) @@ -87,6 +88,7 @@ class EntityUpdatesWorker(FatcatWorker): self.file_topic = file_topic self.container_topic = container_topic self.ingest_file_request_topic = ingest_file_request_topic + self.work_ident_topic = work_ident_topic self.poll_interval = poll_interval self.consumer_group = "entity-updates" self.ingest_oa_only = False @@ -365,7 +367,8 @@ class EntityUpdatesWorker(FatcatWorker): ) for ident in set(release_ids): release = self.api.get_release(ident, expand="files,filesets,webcaptures,container") - work_ids.append(release.work_id) + if release.work_id: + work_ids.append(release.work_id) release_dict = self.api.api_client.sanitize_for_serialization(release) producer.produce( self.release_topic, @@ -383,6 +386,25 @@ class EntityUpdatesWorker(FatcatWorker): #key=None, on_delivery=fail_fast, ) + + # send work updates (just ident and changelog metadata) to scholar for re-indexing + for ident in set(work_ids): + assert ident + key = f"work_{ident}" + work_ident_dict = dict( + key=key, + type="fatcat_work", + work_ident=ident, + updated=cle['timestamp'], + fatcat_changelog_index=cle['index'], + ) + producer.produce( + self.work_ident_topic, + json.dumps(work_ident_dict).encode('utf-8'), + key=key.encode('utf-8'), + on_delivery=fail_fast, + ) + producer.flush() # TODO: publish updated 'work' entities to a topic consumer.store_offsets(message=msg) |