summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-01-30 00:08:41 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-01-30 00:08:41 -0800
commit 4cbee44529dd967c966ed3f2cc2bb80176be4e43 (patch)
tree eb434db2b532786fc9a3c6420c17c06db788a758
parent 461376e6c6107da9a1c0a41c379465ef1c39f051 (diff)
download fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.tar.gz
fatcat-4cbee44529dd967c966ed3f2cc2bb80176be4e43.zip
implement host+domain parsing for file ES transform
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py 14
-rw-r--r-- python/tests/transform_elasticsearch.py 7
2 files changed, 8 insertions, 13 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5a492fb4..e1980d90 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
import collections
+import tldextract
from fatcat_openapi_client import ApiClient
@@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):
md5 = entity.md5,
)
- # TODO: domain, hosts (from urls; use proper urlcanon)
+ parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+ t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+ t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
t['rels'] = list(set([u.rel for u in entity.urls]))
- t['hosts'] = []
- t['domains'] = []
- in_ia = False
- for u in entity.urls:
- if '://archive.org/' in u.url or '://web.archive.org/' in u.url:
- in_ia = True
- t['in_ia'] = bool(in_ia)
+ t['in_ia'] = bool('archive.org' in t['domains'])
return t
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index c247e745..e67681c6 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,11 +106,10 @@ def test_elasticsearch_file_transform(matched_importer):
assert es['size_bytes'] == f.size
assert es['mimetype'] == f.mimetype
assert es['in_ia'] == True
- assert 'web' in es['rels']
- # XXX: implement hosts and domain parsing with urlcanon
- #assert 'journals.plos.org' in es['host']
- #assert 'plos.org' in es['domain']
+ assert 'web' in es['rels']
+ assert 'www.zhros.ru' in es['hosts']
+ assert 'zhros.ru' in es['domains']
def test_elasticsearch_changelog_transform(matched_importer):
ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)