def file_to_elasticsearch(entity):
    """
    Converts from an entity model/schema to elasticsearch oriented schema.

    Args:
        entity: file entity object. Must expose ``ident`` and ``state``;
            when the state is 'active' it must also expose ``revision``,
            ``release_ids``, ``mimetype``, ``size``, ``sha1``, ``sha256``,
            ``md5`` and ``urls`` (an iterable of objects carrying ``url``
            and ``rel`` attributes).

    Returns: dict
    Raises exception on error (never returns None); in particular
    ValueError for any state other than 'active', 'redirect' or 'deleted'.
    """

    if entity.state in ('redirect', 'deleted'):
        # Non-active entities are indexed as a minimal stub document.
        return dict(
            ident=entity.ident,
            state=entity.state,
        )
    elif entity.state != 'active':
        raise ValueError("Unhandled entity state: {}".format(entity.state))

    # Treat unset (None) collections as empty so that len() and iteration
    # below do not raise TypeError.
    release_ids = entity.release_ids or []
    urls = entity.urls or []

    # First, the easy ones (direct copy)
    t = dict(
        ident=entity.ident,
        state=entity.state,
        revision=entity.revision,
        release_ids=release_ids,
        release_count=len(release_ids),
        mimetype=entity.mimetype,
        size_bytes=entity.size,
        sha1=entity.sha1,
        sha256=entity.sha256,
        md5=entity.md5,
    )

    # De-duplicate the URL "rel" values; dict.fromkeys() keeps first-seen
    # order, so the output is deterministic (unlike list(set(...))).
    t['rel'] = list(dict.fromkeys(u.rel for u in urls))

    # TODO: domain, hosts (from urls; use proper urlcanon)
    t['host'] = []
    t['domain'] = []

    # A file counts as "in IA" when any URL points at archive.org or at the
    # wayback machine (web.archive.org).
    t['in_ia'] = any(
        '://archive.org/' in u.url or '://web.archive.org/' in u.url
        for u in urls
    )

    return t
def test_elasticsearch_file_transform(matched_importer):
    """
    Parse the first record of the example "matched" fixture file into a file
    entity, mark it active, and check that file_to_elasticsearch() copies the
    basic fields and derives 'in_ia' and 'rel' from the URLs.
    """
    # Only the read needs the open handle; parse after the file is closed,
    # and do not rebind the handle name to the parsed entity.
    with open('tests/files/example_matched.json', 'r') as json_file:
        raw = json.loads(json_file.readline())
    file_entity = matched_importer.parse_record(raw)

    # The transform rejects entities whose state is not set to a known value.
    file_entity.state = 'active'
    es = file_to_elasticsearch(file_entity)
    assert es['sha1'] == file_entity.sha1
    assert es['sha256'] == file_entity.sha256
    assert es['md5'] == file_entity.md5
    assert es['size_bytes'] == file_entity.size
    assert es['mimetype'] == file_entity.mimetype
    assert es['in_ia'] is True
    assert 'publisher' in es['rel']

    # XXX: implement hosts and domain parsing with urlcanon
    #assert 'journals.plos.org' in es['host']
    #assert 'plos.org' in es['domain']