about summary refs log tree commit diff stats
path: root/python/fatcat_tools/transforms
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-01-30 00:08:41 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-01-30 00:08:41 -0800
commit 4cbee44529dd967c966ed3f2cc2bb80176be4e43 (patch)
tree eb434db2b532786fc9a3c6420c17c06db788a758 /python/fatcat_tools/transforms
parent 461376e6c6107da9a1c0a41c379465ef1c39f051 (diff)
download fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.tar.gz
fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.zip
implement host+domain parsing for file ES transform
Diffstat (limited to 'python/fatcat_tools/transforms')
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py 14
1 files changed, 5 insertions, 9 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5a492fb4..e1980d90 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
import collections
+import tldextract
from fatcat_openapi_client import ApiClient
@@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):
md5 = entity.md5,
)
- # TODO: domain, hosts (from urls; use proper urlcanon)
+ parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+ t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+ t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
t['rels'] = list(set([u.rel for u in entity.urls]))
- t['hosts'] = []
- t['domains'] = []
- in_ia = False
- for u in entity.urls:
- if '://archive.org/' in u.url or '://web.archive.org/' in u.url:
- in_ia = True
- t['in_ia'] = bool(in_ia)
+ t['in_ia'] = bool('archive.org' in t['domains'])
return t