diff options
author | bnewbold <bnewbold@archive.org> | 2021-04-08 00:22:33 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2021-04-08 00:22:33 +0000 |
commit | 97280d0a20baa00aa1f8dbd3bec62142ad2ce900 (patch) | |
tree | 9320c75d5c19148aba7cd3a0ced0fc200988e6ba /python/fatcat_tools/transforms/elasticsearch.py | |
parent | 0b9fc884dad8e3147d10c273725157ba60f48069 (diff) | |
parent | 9f110393b90d5b9e95a39b4f83d3e864434dd189 (diff) | |
download | fatcat-97280d0a20baa00aa1f8dbd3bec62142ad2ce900.tar.gz fatcat-97280d0a20baa00aa1f8dbd3bec62142ad2ce900.zip |
Merge branch 'bnewbold-es-index-updates' into 'master'
fatcat elasticsearch schema updates
See merge request webgroup/fatcat!101
Diffstat (limited to 'python/fatcat_tools/transforms/elasticsearch.py')
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 26 |
1 files changed, 24 insertions, 2 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index f37aadba..4bf9091a 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -46,6 +46,7 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> # First, the easy ones (direct copy) release = entity t = dict( + doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", ident = release.ident, state = release.state, revision = release.revision, @@ -295,6 +296,8 @@ def _rte_container_helper(container: ContainerEntity, release_year: Optional[int t['country_code_upper'] = c_extra['country'].upper() if c_extra.get('publisher_type'): t['publisher_type'] = c_extra['publisher_type'] + if c_extra.get('discipline'): + t['discipline'] = c_extra['discipline'] return t def _rte_content_helper(release: ReleaseEntity) -> dict: @@ -374,7 +377,7 @@ def _rte_url_helper(url_obj) -> dict: return t -def container_to_elasticsearch(entity, force_bool=True): +def container_to_elasticsearch(entity, force_bool=True, stats=None): """ Converts from an entity model/schema to elasticsearch oriented schema. @@ -392,6 +395,7 @@ def container_to_elasticsearch(entity, force_bool=True): # First, the easy ones (direct copy) t = dict( + doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", ident = entity.ident, state = entity.state, revision = entity.revision, @@ -407,10 +411,13 @@ def container_to_elasticsearch(entity, force_bool=True): entity.extra = dict() for key in ('country', 'languages', 'mimetypes', 'original_name', 'first_year', 'last_year', 'aliases', 'abbrev', 'region', - 'discipline'): + 'discipline', 'publisher_type'): if entity.extra.get(key): t[key] = entity.extra[key] + if entity.extra.get('dblp') and entity.extra['dblp'].get('prefix'): + t['dblp_prefix'] = entity.extra['dblp']['prefix'] + if 'country' in t: t['country_code'] = t.pop('country') @@ -428,6 +435,7 @@ def container_to_elasticsearch(entity, force_bool=True): any_kbart = None any_jstor = None any_ia_sim = None + keepers = [] extra = entity.extra if extra.get('doaj'): @@ -451,6 +459,9 @@ def container_to_elasticsearch(entity, force_bool=True): any_kbart = True if extra['kbart'].get('jstor'): any_jstor = True + for k, v in extra['kbart'].items(): + if v and isinstance(v, dict): + keepers.append(k) if extra.get('ia'): if extra['ia'].get('sim'): any_ia_sim = True @@ -458,6 +469,7 @@ def container_to_elasticsearch(entity, force_bool=True): is_longtail_oa = True t['is_superceded'] = bool(extra.get('superceded')) + t['keepers'] = keepers t['in_doaj'] = bool(in_doaj) t['in_road'] = bool(in_road) t['any_kbart'] = bool(any_kbart) @@ -471,6 +483,14 @@ def container_to_elasticsearch(entity, force_bool=True): t['is_longtail_oa'] = is_longtail_oa t['any_jstor'] = any_jstor t['any_ia_sim'] = any_ia_sim + + # mix in stats, if provided + if stats: + t['releases_total'] = stats['total'] + t['preservation_bright'] = stats['preservation']['bright'] + t['preservation_dark'] = stats['preservation']['dark'] + t['preservation_shadows_only'] = stats['preservation']['shadows_only'] + t['preservation_none'] = stats['preservation']['none'] return t @@ -495,6 +515,7 @@ def changelog_to_elasticsearch(entity): editgroup = entity.editgroup t = dict( + doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", index=entity.index, editgroup_id=entity.editgroup_id, timestamp=entity.timestamp.isoformat(), @@ -558,6 +579,7 @@ def file_to_elasticsearch(entity): # First, the easy ones (direct copy) t = dict( + doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", ident = entity.ident, state = entity.state, revision = entity.revision, |