author     Bryan Newbold <bnewbold@robocracy.org>  2021-04-06 20:04:03 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>  2021-04-06 21:58:54 -0700
commit     2e781738937efecbfc527a47ade6c3deaba64247 (patch)
tree       52278319ae6fe1fafe18ace92959b01bb32e82c4
parent     61bd2d65fd1c4fbda2c28d36c5388a610b4d1d14 (diff)
container search schema: preservation stats, new fields
Includes transform code updates and partial test coverage.
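
Context for the change, not part of the commit: a minimal sketch of how the new optional stats argument to container_to_elasticsearch() might be used when indexing a container. The shape of the stats dict matches the test fixture further down; where the counts come from (for example, an aggregation over the release search index) is an assumption here.

    from fatcat_tools.transforms.elasticsearch import container_to_elasticsearch

    # hypothetical per-container counts; in practice these would be computed
    # elsewhere (e.g. aggregated from the release index) and passed in
    stats = {
        "total": 11136,
        "preservation": {
            "bright": 9501,
            "dark": 1635,
            "shadows_only": 0,
            "none": 0,
        },
    }

    # container_entity: a ContainerEntity loaded elsewhere (not shown)
    doc = container_to_elasticsearch(container_entity, stats=stats)
    assert doc["releases_total"] == 11136
    assert doc["preservation_bright"] == 9501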
-rw-r--r--  extra/elasticsearch/container_schema.json        | 17
-rw-r--r--  python/fatcat_tools/transforms/elasticsearch.py  | 20
-rw-r--r--  python/tests/transform_elasticsearch.py          | 47

3 files changed, 69 insertions(+), 15 deletions(-)
diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json
index 21b8d4ec..9673e9e3 100644
--- a/extra/elasticsearch/container_schema.json
+++ b/extra/elasticsearch/container_schema.json
@@ -55,6 +55,7 @@
"issnl": { "type": "keyword", "normalizer": "default" },
"issns": { "type": "keyword", "normalizer": "default" },
"wikidata_qid": { "type": "keyword", "normalizer": "default" },
+ "dblp_prefix": { "type": "keyword", "normalizer": "default" },
"country_code": { "type": "keyword", "normalizer": "default" },
"region": { "type": "keyword", "normalizer": "default" },
"discipline": { "type": "keyword", "normalizer": "default" },
@@ -74,19 +75,19 @@
"any_jstor": { "type": "boolean" },
"any_ia_sim": { "type": "boolean" },
"sherpa_romeo_color": { "type": "keyword", "normalizer": "default" },
+ "keepers": { "type": "keyword", "normalizer": "default" },
- "releases_total": { "type": "integer" },
- "releases_kbart": { "type": "integer" },
- "releases_ia": { "type": "integer" },
- "releases_ia_sim": { "type": "integer" },
- "releases_shadows": { "type": "integer" },
- "releases_any_file": { "type": "integer" },
- "releases_any_fileset": { "type": "integer" },
- "releases_any_webcapture": { "type": "integer" },
+ "releases_total": { "type": "integer" },
+ "preservation_bright": { "type": "integer" },
+ "preservation_dark": { "type": "integer" },
+ "preservation_shadows_only":{ "type": "integer" },
+ "preservation_none": { "type": "integer" },
"year": { "type": "alias", "path": "first_year" },
"type": { "type": "alias", "path": "container_type" },
"issn": { "type": "alias", "path": "issns" },
+ "release_count": { "type": "alias", "path": "releases_total" },
+ "releases_count": { "type": "alias", "path": "releases_total" },
"oa": { "type": "alias", "path": "is_oa" },
"longtail": { "type": "alias", "path": "is_longtail_oa" }
}
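
Illustration only (not part of the commit): with the new mapping, the keepers keyword field and the releases_count alias can be used directly in queries. A rough elasticsearch-py sketch; the endpoint URL and the fatcat_container index name are assumptions:

    import elasticsearch

    es = elasticsearch.Elasticsearch("https://search.fatcat.wiki")  # assumed endpoint
    resp = es.search(
        index="fatcat_container",  # assumed index name
        body={
            # containers with at least one Keeper/preservation service recorded
            "query": {"exists": {"field": "keepers"}},
            # 'releases_count' is an alias for 'releases_total' in the new mapping
            "sort": [{"releases_count": "desc"}],
            "size": 5,
        },
    )
    for hit in resp["hits"]["hits"]:
        print(hit["_source"]["name"], hit["_source"].get("preservation_bright"))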
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5058989c..fe463fa4 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -377,7 +377,7 @@ def _rte_url_helper(url_obj) -> dict:
return t
-def container_to_elasticsearch(entity, force_bool=True):
+def container_to_elasticsearch(entity, force_bool=True, stats=None):
"""
Converts from an entity model/schema to elasticsearch oriented schema.
@@ -411,10 +411,13 @@ def container_to_elasticsearch(entity, force_bool=True):
entity.extra = dict()
for key in ('country', 'languages', 'mimetypes', 'original_name',
'first_year', 'last_year', 'aliases', 'abbrev', 'region',
- 'discipline'):
+ 'discipline', 'publisher_type'):
if entity.extra.get(key):
t[key] = entity.extra[key]
+ if entity.extra.get('dblp') and entity.extra['dblp'].get('prefix'):
+ t['dblp_prefix'] = entity.extra['dblp']['prefix']
+
if 'country' in t:
t['country_code'] = t.pop('country')
@@ -432,6 +435,7 @@ def container_to_elasticsearch(entity, force_bool=True):
any_kbart = None
any_jstor = None
any_ia_sim = None
+ keepers = []
extra = entity.extra
if extra.get('doaj'):
@@ -455,6 +459,9 @@ def container_to_elasticsearch(entity, force_bool=True):
any_kbart = True
if extra['kbart'].get('jstor'):
any_jstor = True
+ for k, v in extra['kbart'].items():
+ if v and isinstance(v, dict):
+ keepers.append(k)
if extra.get('ia'):
if extra['ia'].get('sim'):
any_ia_sim = True
@@ -462,6 +469,7 @@ def container_to_elasticsearch(entity, force_bool=True):
is_longtail_oa = True
t['is_superceded'] = bool(extra.get('superceded'))
+ t['keepers'] = keepers
t['in_doaj'] = bool(in_doaj)
t['in_road'] = bool(in_road)
t['any_kbart'] = bool(any_kbart)
@@ -475,6 +483,14 @@ def container_to_elasticsearch(entity, force_bool=True):
t['is_longtail_oa'] = is_longtail_oa
t['any_jstor'] = any_jstor
t['any_ia_sim'] = any_ia_sim
+
+ # mix in stats, if provided
+ if stats:
+ t['releases_total'] = stats['total']
+ t['preservation_bright'] = stats['preservation']['bright']
+ t['preservation_dark'] = stats['preservation']['dark']
+ t['preservation_shadows_only'] = stats['preservation']['shadows_only']
+ t['preservation_none'] = stats['preservation']['none']
return t
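
To make the new keepers logic above concrete, a small standalone sketch (the KBART entries shown are hypothetical): only truthy dict-valued entries under extra['kbart'] are collected as keeper names, and the presence of any kbart block also sets any_kbart.

    # hypothetical 'extra' metadata on a container entity
    extra = {
        "kbart": {
            "portico": {"year_spans": [[2000, 2020]]},
            "clockss": {"year_spans": [[2005, 2020]]},
        },
    }

    keepers = []
    for k, v in extra["kbart"].items():
        if v and isinstance(v, dict):
            keepers.append(k)

    # keepers == ["portico", "clockss"]; indexed via the new 'keepers' keyword field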
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index 9cf77d4a..ba2b7ea2 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -147,11 +147,48 @@ def test_elasticsearch_release_from_json():
def test_elasticsearch_container_transform(journal_metadata_importer):
with open('tests/files/journal_metadata.sample.json', 'r') as f:
- raw = json.loads(f.readline())
- c = journal_metadata_importer.parse_record(raw)
- c.state = 'active'
- es = container_to_elasticsearch(c)
- assert es['publisher'] == c.publisher
+ raw1 = json.loads(f.readline())
+ raw2 = json.loads(f.readline())
+ c1 = journal_metadata_importer.parse_record(raw1)
+ c1.state = 'active'
+ c2 = journal_metadata_importer.parse_record(raw2)
+ c2.state = 'active'
+
+ c1.extra['publisher_type'] = "big5"
+ c1.extra['discipline'] = "history"
+ es = container_to_elasticsearch(c1)
+ assert es['publisher'] == c1.publisher
+ assert es['discipline'] == c1.extra['discipline']
+ assert es['publisher_type'] == c1.extra['publisher_type']
+ assert es['keepers'] == []
+
+ stats = {
+ "ident": "en4qj5ijrbf5djxx7p5zzpjyoq",
+ "in_kbart": 11136,
+ "in_web": 9501,
+ "is_preserved": 11136,
+ "issnl": "2050-084X",
+ "preservation": {
+ "bright": 9501,
+ "dark": 1635,
+ "none": 0,
+ "shadows_only": 0,
+ "total": 11136
+ },
+ "release_type": {
+ "_unknown": 9,
+ "article-journal": 11124,
+ "editorial": 2,
+ "letter": 1
+ },
+ "total": 11136
+ }
+ es = container_to_elasticsearch(c2, stats=stats)
+ assert es['name'] == c2.name
+ assert es['publisher'] == c2.publisher
+ assert es['keepers'] == list(c2.extra['kbart'].keys()) == ["portico"]
+ assert es['any_kbart'] == True
+
def test_elasticsearch_file_transform(matched_importer):
f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity)
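
One sanity check on the stats fixture used in the container test above (not asserted in the commit itself): the four preservation buckets should partition the total release count.

    p = stats["preservation"]
    assert p["bright"] + p["dark"] + p["shadows_only"] + p["none"] == stats["total"]
    # 9501 + 1635 + 0 + 0 == 11136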