diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-29 16:00:03 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-29 21:59:05 -0800 |
commit | 901cf998ce7d8f896cf5d609719b1defd96d01d4 (patch) | |
tree | 85a5eebe76e8ca4df3b1b4217ecc9e829f50583c /python/fatcat_tools/transforms/elasticsearch.py | |
parent | 55a4f211532c93d8164b0d4719dc0413005941ea (diff) | |
download | fatcat-901cf998ce7d8f896cf5d609719b1defd96d01d4.tar.gz fatcat-901cf998ce7d8f896cf5d609719b1defd96d01d4.zip |
first implementation of ES file schema
Includes a trivial test and transform, but not any workers or doc
updates.
Diffstat (limited to 'python/fatcat_tools/transforms/elasticsearch.py')
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 45 |
1 files changed, 45 insertions, 0 deletions
# Added to python/fatcat_tools/transforms/elasticsearch.py
# (index 3a53db4d..8141a8b9, hunk @@ -357,3 +357,48 @@, placed directly
# after changelog_to_elasticsearch).


def file_to_elasticsearch(entity):
    """
    Converts from an entity model/schema to elasticsearch oriented schema.

    For entities in the 'redirect' or 'deleted' state only a stub with
    `ident` and `state` is returned. For 'active' entities the identifiers,
    hashes, size and mimetype are copied over, and a few fields are derived
    from the URL list:

    - `rel`: the distinct `rel` values of the URLs, in first-seen order
    - `in_ia`: True if any URL points at archive.org or web.archive.org
    - `host`, `domain`: placeholders, always empty lists for now

    A `release_ids` or `urls` attribute of None is treated as an empty list.

    Returns: dict
    Raises exception on error (never returns None); in particular ValueError
    for any state other than 'active', 'redirect' or 'deleted'.
    """

    if entity.state in ('redirect', 'deleted'):
        return dict(
            ident=entity.ident,
            state=entity.state,
        )
    elif entity.state != 'active':
        raise ValueError("Unhandled entity state: {}".format(entity.state))

    # Normalize missing lists so len() and iteration below cannot fail
    release_ids = entity.release_ids or []
    urls = entity.urls or []

    # First, the easy ones (direct copy)
    t = dict(
        ident=entity.ident,
        state=entity.state,
        revision=entity.revision,
        release_ids=release_ids,
        release_count=len(release_ids),
        mimetype=entity.mimetype,
        size_bytes=entity.size,
        sha1=entity.sha1,
        sha256=entity.sha256,
        md5=entity.md5,
    )

    # De-duplicate while keeping first-seen order, so the same entity always
    # produces the same document (a plain set() has arbitrary ordering).
    rels = []
    for u in urls:
        if u.rel not in rels:
            rels.append(u.rel)
    t['rel'] = rels

    # TODO: domain, hosts (from urls; use proper urlcanon)
    t['host'] = []
    t['domain'] = []

    # Substring match on the raw URL; good enough until URLs are parsed
    t['in_ia'] = any(
        '://archive.org/' in u.url or '://web.archive.org/' in u.url
        for u in urls
    )

    return t