diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-31 13:31:59 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-31 13:31:59 -0800 |
commit | 741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76 (patch) | |
tree | 7c380becdc0b2cd8e1e5b40af76e4f85b3fe0a1c | |
parent | 0d037d0d2f73b18014d8d98a06fa3f7bc2c9b794 (diff) | |
download | fatcat-741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76.tar.gz fatcat-741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76.zip |
ES releases: host/domain fixes
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 4 | ||||
-rw-r--r-- | python/tests/transform_elasticsearch.py | 3 |
2 files changed, 5 insertions, 2 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index b5abe2ae..f8bc05fb 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -502,7 +502,7 @@ def file_to_elasticsearch(entity):
     )
 
     parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
-    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
     t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
 
@@ -512,6 +512,6 @@ def file_to_elasticsearch(entity):
     # ok, but actually remove archive.org hosts, because they make other
     # aggregations hard and are a waste of storage
     t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
-    t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
+    t['domains'] = [h for h in t['domains'] if h not in ('archive.org')]
 
     return t
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index e67681c6..c94ab375 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -110,6 +110,9 @@ def test_elasticsearch_file_transform(matched_importer):
     assert 'web' in es['rels']
     assert 'www.zhros.ru' in es['hosts']
     assert 'zhros.ru' in es['domains']
+    assert not '.archive.org' in (es['hosts'] + es['domains'])
+    assert not 'archive.org' in (es['hosts'] + es['domains'])
+    assert not 'web.archive.org' in (es['hosts'] + es['domains'])
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)