diff options
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 14 |
1 files changed, 5 insertions, 9 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5a492fb4..e1980d90 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
 import collections
+import tldextract
 from fatcat_openapi_client import ApiClient
@@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):
         md5 = entity.md5,
     )
-    # TODO: domain, hosts (from urls; use proper urlcanon)
+    parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
-    t['hosts'] = []
-    t['domains'] = []
-    in_ia = False
-    for u in entity.urls:
-        if '://archive.org/' in u.url or '://web.archive.org/' in u.url:
-            in_ia = True
-    t['in_ia'] = bool(in_ia)
+    t['in_ia'] = bool('archive.org' in t['domains'])
     return t