From 4cbee44529dd967c966ed3f2cc2bb80176be4e43 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 30 Jan 2020 00:08:41 -0800
Subject: implement host+domain parsing for file ES transform

---
 python/fatcat_tools/transforms/elasticsearch.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5a492fb4..e1980d90 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
 
-
 import collections
+import tldextract
 from fatcat_openapi_client import ApiClient
 
 
@@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):
         md5 = entity.md5,
     )
 
-    # TODO: domain, hosts (from urls; use proper urlcanon)
+    parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
-    t['hosts'] = []
-    t['domains'] = []
 
-    in_ia = False
-    for u in entity.urls:
-        if '://archive.org/' in u.url or '://web.archive.org/' in u.url:
-            in_ia = True
-    t['in_ia'] = bool(in_ia)
+    t['in_ia'] = bool('archive.org' in t['domains'])
 
     return t
-- 
cgit v1.2.3