diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:08:41 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:08:41 -0800 | 
| commit | 4cbee44529dd967c966ed3f2cc2bb80176be4e43 (patch) | |
| tree | eb434db2b532786fc9a3c6420c17c06db788a758 | |
| parent | 461376e6c6107da9a1c0a41c379465ef1c39f051 (diff) | |
| download | fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.tar.gz fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.zip | |
implement host+domain parsing for file ES transform
| -rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 14 | ||||
| -rw-r--r-- | python/tests/transform_elasticsearch.py | 7 | 
2 files changed, 8 insertions, 13 deletions
| diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 5a492fb4..e1980d90 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -1,6 +1,6 @@ -  import collections +import tldextract  from fatcat_openapi_client import ApiClient @@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):          md5 = entity.md5,      ) -    # TODO: domain, hosts (from urls; use proper urlcanon) +    parsed_urls = [tldextract.extract(u.url) for u in entity.urls] +    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls])) +    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))      t['rels'] = list(set([u.rel for u in entity.urls])) -    t['hosts'] = [] -    t['domains'] = [] -    in_ia = False -    for u in entity.urls: -        if '://archive.org/' in u.url or '://web.archive.org/' in u.url: -            in_ia = True -    t['in_ia'] = bool(in_ia) +    t['in_ia'] = bool('archive.org' in t['domains'])      return t diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index c247e745..e67681c6 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -106,11 +106,10 @@ def test_elasticsearch_file_transform(matched_importer):      assert es['size_bytes'] == f.size      assert es['mimetype'] == f.mimetype      assert es['in_ia'] == True -    assert 'web' in es['rels'] -    # XXX: implement hosts and domain parsing with urlcanon -    #assert 'journals.plos.org' in es['host'] -    #assert 'plos.org' in es['domain'] +    assert 'web' in es['rels'] +    assert 'www.zhros.ru' in es['hosts'] +    assert 'zhros.ru' in es['domains']  def test_elasticsearch_changelog_transform(matched_importer):      ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry) | 
