diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:08:41 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:08:41 -0800 |
commit | 4cbee44529dd967c966ed3f2cc2bb80176be4e43 (patch) | |
tree | eb434db2b532786fc9a3c6420c17c06db788a758 | |
parent | 461376e6c6107da9a1c0a41c379465ef1c39f051 (diff) | |
download | fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.tar.gz fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.zip |
implement host+domain parsing for file ES transform
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 14 | ||||
-rw-r--r-- | python/tests/transform_elasticsearch.py | 7 |
2 files changed, 8 insertions, 13 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5a492fb4..e1980d90 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
 
 import collections
+import tldextract
 from fatcat_openapi_client import ApiClient
 
 
@@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):
         md5 = entity.md5,
     )
 
-    # TODO: domain, hosts (from urls; use proper urlcanon)
+    parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
-    t['hosts'] = []
-    t['domains'] = []
-    in_ia = False
-    for u in entity.urls:
-        if '://archive.org/' in u.url or '://web.archive.org/' in u.url:
-            in_ia = True
-    t['in_ia'] = bool(in_ia)
+    t['in_ia'] = bool('archive.org' in t['domains'])
 
     return t
 
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index c247e745..e67681c6 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,11 +106,10 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'web' in es['rels']
 
-    # XXX: implement hosts and domain parsing with urlcanon
-    #assert 'journals.plos.org' in es['host']
-    #assert 'plos.org' in es['domain']
+    assert 'web' in es['rels']
+    assert 'www.zhros.ru' in es['hosts']
+    assert 'zhros.ru' in es['domains']
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)