From 4cbee44529dd967c966ed3f2cc2bb80176be4e43 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 00:08:41 -0800 Subject: implement host+domain parsing for file ES transform --- python/tests/transform_elasticsearch.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'python/tests') diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index c247e745..e67681c6 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -106,11 +106,10 @@ def test_elasticsearch_file_transform(matched_importer): assert es['size_bytes'] == f.size assert es['mimetype'] == f.mimetype assert es['in_ia'] == True - assert 'web' in es['rels'] - # XXX: implement hosts and domain parsing with urlcanon - #assert 'journals.plos.org' in es['host'] - #assert 'plos.org' in es['domain'] + assert 'web' in es['rels'] + assert 'www.zhros.ru' in es['hosts'] + assert 'zhros.ru' in es['domains'] def test_elasticsearch_changelog_transform(matched_importer): ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry) -- cgit v1.2.3