From 2e781738937efecbfc527a47ade6c3deaba64247 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 6 Apr 2021 20:04:03 -0700
Subject: container search schema: preservation stats, new fields

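Includes transform code updates (new `keepers`, `publisher_type`, and
`dblp_prefix` fields; optional `stats` argument that adds release and
preservation counts) and partial test coverage.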
---
 python/fatcat_tools/transforms/elasticsearch.py | 20 +++++++++--
 python/tests/transform_elasticsearch.py         | 47 ++++++++++++++++++++++---
 2 files changed, 60 insertions(+), 7 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5058989c..fe463fa4 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -377,7 +377,7 @@ def _rte_url_helper(url_obj) -> dict:
     return t
 
 
-def container_to_elasticsearch(entity, force_bool=True):
+def container_to_elasticsearch(entity, force_bool=True, stats=None):
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
 
@@ -411,10 +411,13 @@ def container_to_elasticsearch(entity, force_bool=True):
         entity.extra = dict()
     for key in ('country', 'languages', 'mimetypes', 'original_name',
                 'first_year', 'last_year', 'aliases', 'abbrev', 'region',
-                'discipline'):
+                'discipline', 'publisher_type'):
         if entity.extra.get(key):
             t[key] = entity.extra[key]
 
+    if entity.extra.get('dblp') and entity.extra['dblp'].get('prefix'):
+        t['dblp_prefix'] = entity.extra['dblp']['prefix']
+
     if 'country' in t:
         t['country_code'] = t.pop('country')
 
@@ -432,6 +435,7 @@ def container_to_elasticsearch(entity, force_bool=True):
     any_kbart = None
     any_jstor = None
     any_ia_sim = None
+    keepers = []
 
     extra = entity.extra
     if extra.get('doaj'):
@@ -455,6 +459,9 @@ def container_to_elasticsearch(entity, force_bool=True):
         any_kbart = True
         if extra['kbart'].get('jstor'):
             any_jstor = True
+        for k, v in extra['kbart'].items():
+            if v and isinstance(v, dict):
+                keepers.append(k)
     if extra.get('ia'):
         if extra['ia'].get('sim'):
             any_ia_sim = True
@@ -462,6 +469,7 @@ def container_to_elasticsearch(entity, force_bool=True):
             is_longtail_oa = True
     t['is_superceded'] = bool(extra.get('superceded'))
 
+    t['keepers'] = keepers
     t['in_doaj'] = bool(in_doaj)
     t['in_road'] = bool(in_road)
     t['any_kbart'] = bool(any_kbart)
@@ -475,6 +483,14 @@ def container_to_elasticsearch(entity, force_bool=True):
         t['is_longtail_oa'] = is_longtail_oa
         t['any_jstor'] = any_jstor
         t['any_ia_sim'] = any_ia_sim
+
+    # mix in stats, if provided
+    if stats:
+        t['releases_total'] = stats['total']
+        t['preservation_bright'] = stats['preservation']['bright']
+        t['preservation_dark'] = stats['preservation']['dark']
+        t['preservation_shadows_only'] = stats['preservation']['shadows_only']
+        t['preservation_none'] = stats['preservation']['none']
     return t
 
 
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index 9cf77d4a..ba2b7ea2 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -147,11 +147,59 @@ def test_elasticsearch_release_from_json():
 
 def test_elasticsearch_container_transform(journal_metadata_importer):
+    """
+    Exercises container_to_elasticsearch() on the first two records of the
+    journal metadata sample file: once without stats, once with stats.
+    """
     with open('tests/files/journal_metadata.sample.json', 'r') as f:
-        raw = json.loads(f.readline())
-        c = journal_metadata_importer.parse_record(raw)
-    c.state = 'active'
-    es = container_to_elasticsearch(c)
-    assert es['publisher'] == c.publisher
+        raw1 = json.loads(f.readline())
+        raw2 = json.loads(f.readline())
+        c1 = journal_metadata_importer.parse_record(raw1)
+        c1.state = 'active'
+        c2 = journal_metadata_importer.parse_record(raw2)
+        c2.state = 'active'
+
+    # these 'extra' keys are copied as-is into the search document
+    c1.extra['publisher_type'] = "big5"
+    c1.extra['discipline'] = "history"
+    es = container_to_elasticsearch(c1)
+    assert es['publisher'] == c1.publisher
+    assert es['discipline'] == c1.extra['discipline']
+    assert es['publisher_type'] == c1.extra['publisher_type']
+    assert es['keepers'] == []
+
+    # the transform only reads 'total' and the 'preservation' counts from this
+    stats = {
+        "ident": "en4qj5ijrbf5djxx7p5zzpjyoq",
+        "in_kbart": 11136,
+        "in_web": 9501,
+        "is_preserved": 11136,
+        "issnl": "2050-084X",
+        "preservation": {
+            "bright": 9501,
+            "dark": 1635,
+            "none": 0,
+            "shadows_only": 0,
+            "total": 11136
+        },
+        "release_type": {
+            "_unknown": 9,
+            "article-journal": 11124,
+            "editorial": 2,
+            "letter": 1
+        },
+        "total": 11136
+    }
+    es = container_to_elasticsearch(c2, stats=stats)
+    assert es['name'] == c2.name
+    assert es['publisher'] == c2.publisher
+    assert es['keepers'] == list(c2.extra['kbart'].keys()) == ["portico"]
+    assert es['any_kbart'] is True
+    assert es['releases_total'] == stats['total']
+    assert es['preservation_bright'] == stats['preservation']['bright']
+    assert es['preservation_dark'] == stats['preservation']['dark']
+    assert es['preservation_shadows_only'] == stats['preservation']['shadows_only']
+    assert es['preservation_none'] == stats['preservation']['none']
+
 
 def test_elasticsearch_file_transform(matched_importer):
     f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity)
-- 
cgit v1.2.3