diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:08:41 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:08:41 -0800 |
commit | 4cbee44529dd967c966ed3f2cc2bb80176be4e43 (patch) | |
tree | eb434db2b532786fc9a3c6420c17c06db788a758 /python/fatcat_tools/transforms | |
parent | 461376e6c6107da9a1c0a41c379465ef1c39f051 (diff) | |
download | fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.tar.gz fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.zip |
implement host+domain parsing for file ES transform
Diffstat (limited to 'python/fatcat_tools/transforms')
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 14 |
1 files changed, 5 insertions, 9 deletions
```diff
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5a492fb4..e1980d90 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
 import collections
+import tldextract
 from fatcat_openapi_client import ApiClient
 
@@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):
         md5 = entity.md5,
     )
 
-    # TODO: domain, hosts (from urls; use proper urlcanon)
+    parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
-    t['hosts'] = []
-    t['domains'] = []
-    in_ia = False
-    for u in entity.urls:
-        if '://archive.org/' in u.url or '://web.archive.org/' in u.url:
-            in_ia = True
-    t['in_ia'] = bool(in_ia)
+    t['in_ia'] = bool('archive.org' in t['domains'])
     return t
```