diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:08:41 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:08:41 -0800 |
commit | 4cbee44529dd967c966ed3f2cc2bb80176be4e43 (patch) | |
tree | eb434db2b532786fc9a3c6420c17c06db788a758 /python/tests | |
parent | 461376e6c6107da9a1c0a41c379465ef1c39f051 (diff) | |
download | fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.tar.gz fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.zip |
implement host+domain parsing for file ES transform
Diffstat (limited to 'python/tests')
-rw-r--r-- | python/tests/transform_elasticsearch.py | 7 |
1 file changed, 3 insertions, 4 deletions
```diff
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index c247e745..e67681c6 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,11 +106,10 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'web' in es['rels']
 
-    # XXX: implement hosts and domain parsing with urlcanon
-    #assert 'journals.plos.org' in es['host']
-    #assert 'plos.org' in es['domain']
+    assert 'web' in es['rels']
+    assert 'www.zhros.ru' in es['hosts']
+    assert 'zhros.ru' in es['domains']
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
```