summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/transforms/elasticsearch.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-01-29 16:00:03 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-01-29 21:59:05 -0800
commit901cf998ce7d8f896cf5d609719b1defd96d01d4 (patch)
tree85a5eebe76e8ca4df3b1b4217ecc9e829f50583c /python/fatcat_tools/transforms/elasticsearch.py
parent55a4f211532c93d8164b0d4719dc0413005941ea (diff)
downloadfatcat-901cf998ce7d8f896cf5d609719b1defd96d01d4.tar.gz
fatcat-901cf998ce7d8f896cf5d609719b1defd96d01d4.zip
first implementation of ES file schema
Includes a trivial test and transform, but not any workers or doc updates.
Diffstat (limited to 'python/fatcat_tools/transforms/elasticsearch.py')
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py45
1 files changed, 45 insertions, 0 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 3a53db4d..8141a8b9 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -357,3 +357,48 @@ def changelog_to_elasticsearch(entity):
#t['deleted'] = deleted
#t['total'] = created + updated + deleted
return t
+
+
def file_to_elasticsearch(entity):
    """
    Converts from an entity model/schema to elasticsearch oriented schema.

    Entities in the 'redirect' or 'deleted' state yield a stub document
    containing only 'ident' and 'state'. Entities in the 'active' state
    yield the full document. Any other state is rejected.

    A missing (None) 'urls' or 'release_ids' attribute on an active entity
    is treated as empty when deriving 'rel', 'in_ia' and 'release_count'.

    Returns: dict
    Raises exception on error (never returns None); ValueError for an
    unhandled entity state.
    """

    if entity.state in ('redirect', 'deleted'):
        return dict(
            ident = entity.ident,
            state = entity.state,
        )
    elif entity.state != 'active':
        raise ValueError("Unhandled entity state: {}".format(entity.state))

    # Treat absent list fields as empty instead of failing on len()/iteration
    urls = entity.urls or []
    release_ids = entity.release_ids

    # First, the easy ones (direct copy)
    t = dict(
        ident = entity.ident,
        state = entity.state,
        revision = entity.revision,
        release_ids = release_ids,
        release_count = len(release_ids or []),
        mimetype = entity.mimetype,
        size_bytes = entity.size,
        sha1 = entity.sha1,
        sha256 = entity.sha256,
        md5 = entity.md5,
        # unique 'rel' values, in first-seen order so the output is stable
        # from run to run (a plain set() has arbitrary ordering)
        rel = list(dict.fromkeys(u.rel for u in urls)),
    )

    # TODO: domain, hosts (from urls; use proper urlcanon)
    t['host'] = []
    t['domain'] = []

    # True when at least one URL points at archive.org or the wayback
    # machine (simple substring match on the URL)
    t['in_ia'] = any(
        '://archive.org/' in u.url or '://web.archive.org/' in u.url
        for u in urls
    )

    return t