diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-28 15:30:06 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-28 15:30:06 -0800 |
commit | 1eeecb90a4ce743d4d83c651b9e8c0128541404a (patch) | |
tree | eb63383e10f17ecf0968a0aab26e3f42b218a534 | |
parent | f6f7450903bdbe36bd5fff146b942e34ad221557 (diff) | |
download | fatcat-1eeecb90a4ce743d4d83c651b9e8c0128541404a.tar.gz fatcat-1eeecb90a4ce743d4d83c651b9e8c0128541404a.zip |
fatcat -> fatcat_release ES index
-rwxr-xr-x | python/fatcat_export.py | 18 | ||||
-rw-r--r-- | python/fatcat_tools/transforms.py | 18 | ||||
-rw-r--r-- | python/fatcat_tools/workers/elasticsearch.py | 5 |
3 files changed, 21 insertions, 20 deletions
diff --git a/python/fatcat_export.py b/python/fatcat_export.py index 027d6c0a..33e23202 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -19,19 +19,18 @@ from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ def run_export_releases(args): - api = args.api for line in args.ident_file: ident = uuid2fcid(line.split()[0]) - release = api.get_release(ident=ident, expand="all") + release = args.api.get_release(ident=ident, expand="all") args.json_output.write( - json.dumps(entity_to_dict(release)) + "\n") + json.dumps(entity_to_dict(release), api_client=args.api.api_client) + "\n") def run_transform_releases(args): for line in args.json_input: line = line.strip() if not line: continue - entity = entity_from_json(line, ReleaseEntity) + entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) args.json_output.write( json.dumps(release_to_elasticsearch(entity)) + '\n') @@ -40,7 +39,7 @@ def run_transform_containers(args): line = line.strip() if not line: continue - entity = entity_from_json(line, ContainerEntity) + entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client) args.json_output.write( json.dumps(container_to_elasticsearch(entity)) + '\n') @@ -49,21 +48,20 @@ def run_transform_changelogs(args): line = line.strip() if not line: continue - entity = entity_from_json(line, ChangelogEntry) + entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client) args.json_output.write( json.dumps(changelog_to_elasticsearch(entity)) + '\n') def run_export_changelog(args): - api = args.api end = args.end if end is None: - latest = api.get_changelog(limit=1)[0] + latest = args.api.get_changelog(limit=1)[0] end = latest.index for i in range(args.start, end): - entry = api.get_changelog_entry(index=i) + entry = args.api.get_changelog_entry(index=i) args.json_output.write( - json.dumps(entity_to_dict(entry)) + "\n") + json.dumps(entity_to_dict(entry, api_client=args.api.api_client)) + "\n") def main(): parser = argparse.ArgumentParser() diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index 7bb75c3e..37cbf1d2 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -1,23 +1,25 @@ import collections -from fatcat_client import ReleaseEntity, ApiClient +from fatcat_client import ApiClient -def entity_to_dict(entity): +def entity_to_dict(entity, api_client=None): """ Hack to take advantage of the code-generated serialization code """ - ac = ApiClient() - return ac.sanitize_for_serialization(entity) + if not api_client: + api_client = ApiClient() + return api_client.sanitize_for_serialization(entity) -def entity_from_json(json_str, entity_type): +def entity_from_json(json_str, entity_type, api_client=None): """ Hack to take advantage of the code-generated deserialization code """ - ac = ApiClient() + if not api_client: + api_client = ApiClient() thing = collections.namedtuple('Thing', ['data']) thing.data = json_str - return ac.deserialize(thing, entity_type) + return api_client.deserialize(thing, entity_type) def check_kbart(year, archive): if not archive or not archive.get('year_spans'): @@ -284,7 +286,7 @@ def container_to_elasticsearch(entity): t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa t['is_longtail_oa'] = is_longtail_oa t['any_kbart'] = any_ia_sim - t['any_jstor'] = any_ia_sim + t['any_jstor'] = any_jstor t['any_ia_sim'] = bool(any_ia_sim) return t diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py index 3adb088e..0a49192e 100644 --- a/python/fatcat_tools/workers/elasticsearch.py +++ b/python/fatcat_tools/workers/elasticsearch.py @@ -4,7 +4,7 @@ import time import requests from pykafka.common import OffsetType -from fatcat_client import ReleaseEntity +from fatcat_client import ReleaseEntity, ApiClient from fatcat_tools import * from .worker_common import FatcatWorker @@ -28,6 +28,7 @@ class ElasticsearchReleaseWorker(FatcatWorker): def run(self): consume_topic = self.kafka.topics[self.consume_topic] + ac = ApiClient() consumer = consume_topic.get_balanced_consumer( consumer_group=self.consumer_group, @@ -40,7 +41,7 @@ class ElasticsearchReleaseWorker(FatcatWorker): for msg in consumer: json_str = msg.value.decode('utf-8') - release = entity_from_json(json_str, ReleaseEntity) + release = entity_from_json(json_str, ReleaseEntity, api_client=ac) #print(release) elasticsearch_endpoint = "{}/{}/release/{}".format( self.elasticsearch_backend, |