From 2e781738937efecbfc527a47ade6c3deaba64247 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 6 Apr 2021 20:04:03 -0700 Subject: container search schema: preservation stats, new fields Includes transform code updates and partial test coverage. --- extra/elasticsearch/container_schema.json | 17 ++++----- python/fatcat_tools/transforms/elasticsearch.py | 20 +++++++++-- python/tests/transform_elasticsearch.py | 47 ++++++++++++++++++++++--- 3 files changed, 69 insertions(+), 15 deletions(-) diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index 21b8d4ec..9673e9e3 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -55,6 +55,7 @@ "issnl": { "type": "keyword", "normalizer": "default" }, "issns": { "type": "keyword", "normalizer": "default" }, "wikidata_qid": { "type": "keyword", "normalizer": "default" }, + "dblp_prefix": { "type": "keyword", "normalizer": "default" }, "country_code": { "type": "keyword", "normalizer": "default" }, "region": { "type": "keyword", "normalizer": "default" }, "discipline": { "type": "keyword", "normalizer": "default" }, @@ -74,19 +75,19 @@ "any_jstor": { "type": "boolean" }, "any_ia_sim": { "type": "boolean" }, "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" }, + "keepers": { "type": "keyword", "normalizer": "default" }, - "releases_total": { "type": "integer" }, - "releases_kbart": { "type": "integer" }, - "releases_ia": { "type": "integer" }, - "releases_ia_sim": { "type": "integer" }, - "releases_shadows": { "type": "integer" }, - "releases_any_file": { "type": "integer" }, - "releases_any_fileset": { "type": "integer" }, - "releases_any_webcapture": { "type": "integer" }, + "releases_total": { "type": "integer" }, + "preservation_bright": { "type": "integer" }, + "preservation_dark": { "type": "integer" }, + "preservation_shadows_only":{ "type": "integer" }, + "preservation_none": { "type": "integer" }, "year": { "type": "alias", "path": "first_year" }, "type": { "type": "alias", "path": "container_type" }, "issn": { "type": "alias", "path": "issns" }, + "release_count": { "type": "alias", "path": "releases_total" }, + "releases_count": { "type": "alias", "path": "releases_total" }, "oa": { "type": "alias", "path": "is_oa" }, "longtail": { "type": "alias", "path": "is_longtail_oa" } } diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 5058989c..fe463fa4 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -377,7 +377,7 @@ def _rte_url_helper(url_obj) -> dict: return t -def container_to_elasticsearch(entity, force_bool=True): +def container_to_elasticsearch(entity, force_bool=True, stats=None): """ Converts from an entity model/schema to elasticsearch oriented schema. @@ -411,10 +411,13 @@ def container_to_elasticsearch(entity, force_bool=True): entity.extra = dict() for key in ('country', 'languages', 'mimetypes', 'original_name', 'first_year', 'last_year', 'aliases', 'abbrev', 'region', - 'discipline'): + 'discipline', 'publisher_type'): if entity.extra.get(key): t[key] = entity.extra[key] + if entity.extra.get('dblp') and entity.extra['dblp'].get('prefix'): + t['dblp_prefix'] = entity.extra['dblp']['prefix'] + if 'country' in t: t['country_code'] = t.pop('country') @@ -432,6 +435,7 @@ def container_to_elasticsearch(entity, force_bool=True): any_kbart = None any_jstor = None any_ia_sim = None + keepers = [] extra = entity.extra if extra.get('doaj'): @@ -455,6 +459,9 @@ def container_to_elasticsearch(entity, force_bool=True): any_kbart = True if extra['kbart'].get('jstor'): any_jstor = True + for k, v in extra['kbart'].items(): + if v and isinstance(v, dict): + keepers.append(k) if extra.get('ia'): if extra['ia'].get('sim'): any_ia_sim = True @@ -462,6 +469,7 @@ def container_to_elasticsearch(entity, force_bool=True): is_longtail_oa = True t['is_superceded'] = bool(extra.get('superceded')) + t['keepers'] = keepers t['in_doaj'] = bool(in_doaj) t['in_road'] = bool(in_road) t['any_kbart'] = bool(any_kbart) @@ -475,6 +483,14 @@ def container_to_elasticsearch(entity, force_bool=True): t['is_longtail_oa'] = is_longtail_oa t['any_jstor'] = any_jstor t['any_ia_sim'] = any_ia_sim + + # mix in stats, if provided + if stats: + t['releases_total'] = stats['total'] + t['preservation_bright'] = stats['preservation']['bright'] + t['preservation_dark'] = stats['preservation']['dark'] + t['preservation_shadows_only'] = stats['preservation']['shadows_only'] + t['preservation_none'] = stats['preservation']['none'] return t diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index 9cf77d4a..ba2b7ea2 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -147,11 +147,48 @@ def test_elasticsearch_release_from_json(): def test_elasticsearch_container_transform(journal_metadata_importer): with open('tests/files/journal_metadata.sample.json', 'r') as f: - raw = json.loads(f.readline()) - c = journal_metadata_importer.parse_record(raw) - c.state = 'active' - es = container_to_elasticsearch(c) - assert es['publisher'] == c.publisher + raw1 = json.loads(f.readline()) + raw2 = json.loads(f.readline()) + c1 = journal_metadata_importer.parse_record(raw1) + c1.state = 'active' + c2 = journal_metadata_importer.parse_record(raw2) + c2.state = 'active' + + c1.extra['publisher_type'] = "big5" + c1.extra['discipline'] = "history" + es = container_to_elasticsearch(c1) + assert es['publisher'] == c1.publisher + assert es['discipline'] == c1.extra['discipline'] + assert es['publisher_type'] == c1.extra['publisher_type'] + assert es['keepers'] == [] + + stats = { + "ident": "en4qj5ijrbf5djxx7p5zzpjyoq", + "in_kbart": 11136, + "in_web": 9501, + "is_preserved": 11136, + "issnl": "2050-084X", + "preservation": { + "bright": 9501, + "dark": 1635, + "none": 0, + "shadows_only": 0, + "total": 11136 + }, + "release_type": { + "_unknown": 9, + "article-journal": 11124, + "editorial": 2, + "letter": 1 + }, + "total": 11136 + } + es = container_to_elasticsearch(c2, stats=stats) + assert es['name'] == c2.name + assert es['publisher'] == c2.publisher + assert es['keepers'] == list(c2.extra['kbart'].keys()) == ["portico"] + assert es['any_kbart'] == True + def test_elasticsearch_file_transform(matched_importer): f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity) -- cgit v1.2.3